padding 50257 -> 50304 vocab_size, the nearest multiple of 64. the biggest-deal smallest optimization i've made in the recent past, about 25% faster. this is because the last layer is a major latency bottleneck, consuming about 40% of latency due to the very high channel count.
This commit is contained in:
parent
b3c17c6c6a
commit
77e7e04c26
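For reference, the rounding described in the message can be computed as below. This is an illustrative sketch only; the pad_vocab helper is hypothetical and not part of this commit:

def pad_vocab(vocab_size: int, multiple: int = 64) -> int:
    # round vocab_size up to the nearest multiple, e.g. 50257 -> 50304 (786 * 64)
    return ((vocab_size + multiple - 1) // multiple) * multiple

assert pad_vocab(50257) == 50304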
bench.py (4 changed lines)
@@ -43,8 +43,8 @@ if real_data:
         return x, y
 else:
     # alternatively, if fixed data is desired to not care about data loading
-    x = torch.randint(50257, (batch_size, block_size), device=device)
-    y = torch.randint(50257, (batch_size, block_size), device=device)
+    x = torch.randint(50304, (batch_size, block_size), device=device)
+    y = torch.randint(50304, (batch_size, block_size), device=device)
     get_batch = lambda split: (x, y)
 
 # model init
model.py (2 changed lines)
@@ -115,7 +115,7 @@ class Block(nn.Module):
 @dataclass
 class GPTConfig:
     block_size: int = 1024
-    vocab_size: int = 50257
+    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
     n_layer: int = 12
     n_head: int = 12
     n_embd: int = 768
train.py (4 changed lines)
@@ -128,8 +128,8 @@ if os.path.exists(meta_path):
     vocab_size = meta['vocab_size']
     print(f"vocab_size = {vocab_size} (from {meta_path})")
 else:
-    print(f"vocab_size not found in {meta_path}, using GPT-2 default of 50257")
-    vocab_size = 50257
+    print(f"vocab_size not found in {meta_path}, using GPT-2 default of 50257 (rounded up to 50304 for efficiency)")
+    vocab_size = 50304
 
 # model init
 model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
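Note that only constants change here: the GPT-2 BPE tokenizer still emits token ids below 50257, so the 47 padding rows added to the token embedding and lm_head are never indexed by real data. A minimal sanity-check sketch, assuming tiktoken is available (nanoGPT's data prep uses it):

import tiktoken

enc = tiktoken.get_encoding("gpt2")
print(enc.n_vocab)  # 50257 -- every real token id is below this, well under the padded 50304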