1
0
mirror of https://github.com/osmarks/nanogpt-experiments.git synced 2025-10-24 03:57:37 +00:00

Small tweaks: make the default weight decay (WD) 0.1, as is often cited, and remove the spurious init of LayerNorm, which is already initialized with weight 1 and bias 0.

This commit is contained in:
Andrej Karpathy
2023-02-06 23:07:25 +00:00
parent ab21d6c15d
commit 8b1e43209e
2 changed files with 1 addition and 5 deletions

View File

@@ -173,10 +173,6 @@ class GPT(nn.Module):
torch.nn.init.zeros_(module.bias) torch.nn.init.zeros_(module.bias)
elif isinstance(module, nn.Embedding): elif isinstance(module, nn.Embedding):
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02) torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
elif isinstance(module, (LayerNorm, nn.LayerNorm)):
torch.nn.init.ones_(module.weight)
if module.bias is not None:
torch.nn.init.zeros_(module.bias)
def forward(self, idx, targets=None): def forward(self, idx, targets=None):
device = idx.device device = idx.device

View File

@@ -57,7 +57,7 @@ bias = False # do we use bias inside LayerNorm and Linear layers?
# adamw optimizer # adamw optimizer
learning_rate = 6e-4 # max learning rate learning_rate = 6e-4 # max learning rate
max_iters = 600000 # total number of training iterations max_iters = 600000 # total number of training iterations
weight_decay = 1e-2 weight_decay = 1e-1
beta1 = 0.9 beta1 = 0.9
beta2 = 0.95 beta2 = 0.95
grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0 grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0