From 8b1e43209e4da8162c05531e7bf46ac6d8d6773f Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Mon, 6 Feb 2023 23:07:25 +0000
Subject: [PATCH] small tweaks, make default WD be 0.1 as is often cited, and
 remove spurious init of LayerNorm, which is already initialized at 1,0

---
 model.py | 4 ----
 train.py | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/model.py b/model.py
index 5a7f4dd..1b32cdf 100644
--- a/model.py
+++ b/model.py
@@ -173,10 +173,6 @@ class GPT(nn.Module):
                 torch.nn.init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-        elif isinstance(module, (LayerNorm, nn.LayerNorm)):
-            torch.nn.init.ones_(module.weight)
-            if module.bias is not None:
-                torch.nn.init.zeros_(module.bias)
 
     def forward(self, idx, targets=None):
         device = idx.device
diff --git a/train.py b/train.py
index 93aff4f..d932ce0 100644
--- a/train.py
+++ b/train.py
@@ -57,7 +57,7 @@ bias = False # do we use bias inside LayerNorm and Linear layers?
 # adamw optimizer
 learning_rate = 6e-4 # max learning rate
 max_iters = 600000 # total number of training iterations
-weight_decay = 1e-2
+weight_decay = 1e-1
 beta1 = 0.9
 beta2 = 0.95
 grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
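
Note (editor's sketch, not part of the patch): the rationale for dropping the
LayerNorm branch is that PyTorch's nn.LayerNorm already initializes its affine
parameters to weight=1 and bias=0 when elementwise_affine=True (the default),
and the repository's own LayerNorm wrapper appears to construct its parameters
the same way, so the explicit re-initialization was a no-op. The snippet below
is a minimal check of that premise; the hidden size 768 is an arbitrary example.

    # check_layernorm_defaults.py -- illustrative only, not from the repository
    import torch
    import torch.nn as nn

    ln = nn.LayerNorm(768)               # 768 is an arbitrary example width
    assert torch.all(ln.weight == 1.0)   # affine scale starts at ones
    assert torch.all(ln.bias == 0.0)     # affine shift starts at zeros
    print("nn.LayerNorm defaults to weight=1, bias=0; explicit re-init is redundant")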