From 77e7e04c2657846ddf30c1ca2dd9f7cbb93ddeab Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sat, 4 Feb 2023 16:06:18 +0000
Subject: [PATCH] padding 50257 -> 50304 vocab_size, the nearest multiple of
 64. the highest-impact smallest optimization i've made in the recent past,
 about 25% faster. this is because the last layer is a major latency
 bottleneck, consuming about 40% of the latency due to the very high channel
 count.

---
 bench.py | 4 ++--
 model.py | 2 +-
 train.py | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/bench.py b/bench.py
index 294c824..f234bfb 100644
--- a/bench.py
+++ b/bench.py
@@ -43,8 +43,8 @@ if real_data:
         return x, y
 else:
     # alternatively, if fixed data is desired to not care about data loading
-    x = torch.randint(50257, (batch_size, block_size), device=device)
-    y = torch.randint(50257, (batch_size, block_size), device=device)
+    x = torch.randint(50304, (batch_size, block_size), device=device)
+    y = torch.randint(50304, (batch_size, block_size), device=device)
     get_batch = lambda split: (x, y)
 
 # model init
diff --git a/model.py b/model.py
index f934ef1..40266c2 100644
--- a/model.py
+++ b/model.py
@@ -115,7 +115,7 @@ class Block(nn.Module):
 @dataclass
 class GPTConfig:
     block_size: int = 1024
-    vocab_size: int = 50257
+    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
     n_layer: int = 12
     n_head: int = 12
     n_embd: int = 768
diff --git a/train.py b/train.py
index 895ba94..6994d65 100644
--- a/train.py
+++ b/train.py
@@ -128,8 +128,8 @@ if os.path.exists(meta_path):
     vocab_size = meta['vocab_size']
     print(f"vocab_size = {vocab_size} (from {meta_path})")
 else:
-    print(f"vocab_size not found in {meta_path}, using GPT-2 default of 50257")
-    vocab_size = 50257
+    print(f"vocab_size not found in {meta_path}, using GPT-2 default of 50257 (rounded up to 50304 for efficiency)")
+    vocab_size = 50304
 
 # model init
 model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
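
As a quick sketch of the arithmetic behind the new value: 50304 is 50257 rounded up to the next multiple of 64, which gives the token embedding and final projection dimensions that are friendlier to the GPU matmul kernels. The pad_vocab helper below is illustrative only and not part of the nanoGPT code:

    def pad_vocab(vocab_size: int, multiple: int = 64) -> int:
        # round vocab_size up to the nearest multiple of `multiple`
        return ((vocab_size + multiple - 1) // multiple) * multiple

    assert pad_vocab(50257) == 50304  # GPT-2's 50257 tokens padded to 50304

The 47 extra token ids never occur in GPT-2-encoded data, so they amount to unused rows in the embedding and output projection; the model simply learns to assign them negligible probability.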