Mirror of https://github.com/osmarks/nanogpt-experiments.git (synced 2025-10-20 01:57:39 +00:00)
small tweaks: make the default weight decay 0.1, as is often cited, and remove the spurious init of LayerNorm, which is already initialized to 1 and 0
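The LayerNorm part of the message refers to initialization that PyTorch already performs by default: nn.LayerNorm starts with its scale at 1 and its shift at 0, so re-initializing it by hand is redundant. A minimal sketch illustrating this (not part of the diff below; the layer size 768 is an arbitrary example):

import torch
import torch.nn as nn

ln = nn.LayerNorm(768)                      # elementwise_affine=True by default
print(torch.all(ln.weight == 1.0).item())   # True: scale (weight) starts at 1
print(torch.all(ln.bias == 0.0).item())     # True: shift (bias) starts at 0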
train.py (2 changed lines)
@@ -57,7 +57,7 @@ bias = False # do we use bias inside LayerNorm and Linear layers?
 # adamw optimizer
 learning_rate = 6e-4 # max learning rate
 max_iters = 600000 # total number of training iterations
-weight_decay = 1e-2
+weight_decay = 1e-1
 beta1 = 0.9
 beta2 = 0.95
 grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
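As a rough sketch of how these settings are typically consumed (not the repository's exact code: nanoGPT builds its optimizer in configure_optimizers and splits parameters into decay/no-decay groups; the model, X, and loss below are placeholders):

import torch

learning_rate = 6e-4
weight_decay = 1e-1   # the new default from this commit
beta1, beta2 = 0.9, 0.95
grad_clip = 1.0

model = torch.nn.Linear(768, 768)  # stand-in for the GPT model
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    betas=(beta1, beta2),
    weight_decay=weight_decay,
)

X = torch.randn(8, 768)
loss = model(X).pow(2).mean()      # dummy loss for illustration
loss.backward()
if grad_clip != 0.0:               # grad_clip == 0.0 disables clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
optimizer.step()
optimizer.zero_grad(set_to_none=True)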