padding 50257 -> 50304 vocab_size, the nearest multiple of 64. the biggest-deal smallest optimization i've made in the recent past, about 25% faster. this is because the last layer is a major latency bottleneck, consuming about 40% of latency due to the very high channel count.

2026-05-28 08:12:06 +00:00 · 2023-02-04 16:06:18 +00:00
parent b3c17c6c6a
commit 77e7e04c26
3 changed files with 5 additions and 5 deletions
@@ -43,8 +43,8 @@ if real_data:
        return x, y
 else:
    # alternatively, if fixed data is desired to not care about data loading
-    x = torch.randint(50257, (batch_size, block_size), device=device)
-    y = torch.randint(50257, (batch_size, block_size), device=device)
+    x = torch.randint(50304, (batch_size, block_size), device=device)
+    y = torch.randint(50304, (batch_size, block_size), device=device)
    get_batch = lambda split: (x, y)

 # model init
@@ -115,7 +115,7 @@ class Block(nn.Module):
@dataclass
 class GPTConfig:
    block_size: int = 1024
-    vocab_size: int = 50257
+    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768
@@ -128,8 +128,8 @@ if os.path.exists(meta_path):
    vocab_size = meta['vocab_size']
    print(f"vocab_size = {vocab_size} (from {meta_path})")
 else:
-    print(f"vocab_size not found in {meta_path}, using GPT-2 default of 50257")
-    vocab_size = 50257
+    print(f"vocab_size not found in {meta_path}, using GPT-2 default of 50257 (rounded up to 50304 for efficiency)")
+    vocab_size = 50304

 # model init
 model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,