diff --git a/model.py b/model.py
index 5a7f4dd..1b32cdf 100644
--- a/model.py
+++ b/model.py
@@ -173,10 +173,6 @@ class GPT(nn.Module):
                 torch.nn.init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-        elif isinstance(module, (LayerNorm, nn.LayerNorm)):
-            torch.nn.init.ones_(module.weight)
-            if module.bias is not None:
-                torch.nn.init.zeros_(module.bias)
 
     def forward(self, idx, targets=None):
         device = idx.device
diff --git a/train.py b/train.py
index 93aff4f..d932ce0 100644
--- a/train.py
+++ b/train.py
@@ -57,7 +57,7 @@ bias = False # do we use bias inside LayerNorm and Linear layers?
 # adamw optimizer
 learning_rate = 6e-4 # max learning rate
 max_iters = 600000 # total number of training iterations
-weight_decay = 1e-2
+weight_decay = 1e-1
 beta1 = 0.9
 beta2 = 0.95
 grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
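Why dropping the LayerNorm branch from `_init_weights` is safe: PyTorch's `nn.LayerNorm` already initializes its affine parameters to ones (weight) and zeros (bias) in `reset_parameters()`, so the deleted branch was a no-op for that case; presumably the repo's custom `LayerNorm` constructs its parameters the same way (an assumption, since its definition is not shown in this diff). A minimal sketch verifying the built-in default:

```python
import torch
import torch.nn as nn

# nn.LayerNorm's reset_parameters() already sets weight = 1 and bias = 0,
# so re-initializing it inside _init_weights changed nothing.
ln = nn.LayerNorm(8)
assert torch.all(ln.weight == 1.0)
assert torch.all(ln.bias == 0.0)

# Assumption: the custom LayerNorm in model.py builds its parameters with
# torch.ones / torch.zeros as well, which is why the branch can be removed
# without changing initialization behavior.
```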
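The train.py change raises `weight_decay` from 1e-2 to 1e-1, a stronger decoupled weight decay in line with common GPT-style AdamW settings. A minimal sketch of how these hyperparameters would feed the optimizer, using a hypothetical stand-in module since the actual optimizer construction is outside this hunk (and the real code may group parameters to exempt biases and norms from decay):

```python
import torch

# Hypothetical toy module standing in for the GPT model in model.py.
model = torch.nn.Linear(4, 4)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=6e-4,            # learning_rate
    betas=(0.9, 0.95),  # (beta1, beta2)
    weight_decay=1e-1,  # raised from 1e-2 by this diff
)
```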