From 8b1e43209e4da8162c05531e7bf46ac6d8d6773f Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Mon, 6 Feb 2023 23:07:25 +0000
Subject: [PATCH] small tweaks, make default WD be 0.1 as is often cited, and
 remove spurious init of LayerNorm, which is already initialized at 1,0

---
 model.py | 4 ----
 train.py | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/model.py b/model.py
index 5a7f4dd..1b32cdf 100644
--- a/model.py
+++ b/model.py
@@ -173,10 +173,6 @@ class GPT(nn.Module):
                 torch.nn.init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-        elif isinstance(module, (LayerNorm, nn.LayerNorm)):
-            torch.nn.init.ones_(module.weight)
-            if module.bias is not None:
-                torch.nn.init.zeros_(module.bias)
 
     def forward(self, idx, targets=None):
         device = idx.device
diff --git a/train.py b/train.py
index 93aff4f..d932ce0 100644
--- a/train.py
+++ b/train.py
@@ -57,7 +57,7 @@ bias = False # do we use bias inside LayerNorm and Linear layers?
 # adamw optimizer
 learning_rate = 6e-4 # max learning rate
 max_iters = 600000 # total number of training iterations
-weight_decay = 1e-2
+weight_decay = 1e-1
 beta1 = 0.9
 beta2 = 0.95
 grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
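
Note (editor's sketch, not part of the patch): the rationale for dropping the
LayerNorm branch is that PyTorch's nn.LayerNorm already initializes its affine
parameters to weight=1 and bias=0 when elementwise_affine=True (the default),
and the repository's own LayerNorm wrapper appears to construct its parameters
the same way, so the explicit re-initialization was a no-op. The snippet below
is a minimal check of that premise; the hidden size 768 is an arbitrary example.

    # check_layernorm_defaults.py -- illustrative only, not from the repository
    import torch
    import torch.nn as nn

    ln = nn.LayerNorm(768)               # 768 is an arbitrary example width
    assert torch.all(ln.weight == 1.0)   # affine scale starts at ones
    assert torch.all(ln.bias == 0.0)     # affine shift starts at zeros
    print("nn.LayerNorm defaults to weight=1, bias=0; explicit re-init is redundant")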