Mirror of https://github.com/osmarks/nanogpt-experiments.git (synced 2025-10-20 01:57:39 +00:00)
small tweaks: make the default weight decay 0.1, as is often cited, and remove the spurious init of LayerNorm, which is already initialized to 1 and 0
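The LayerNorm part of the message refers to initialization that PyTorch already performs by default: nn.LayerNorm starts with its scale at 1 and its shift at 0, so re-initializing it by hand is redundant. A minimal sketch illustrating this (not part of the diff below; the layer size 768 is an arbitrary example):

import torch
import torch.nn as nn

ln = nn.LayerNorm(768)                      # elementwise_affine=True by default
print(torch.all(ln.weight == 1.0).item())   # True: scale (weight) starts at 1
print(torch.all(ln.bias == 0.0).item())     # True: shift (bias) starts at 0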
train.py (2 changed lines)
@@ -57,7 +57,7 @@ bias = False # do we use bias inside LayerNorm and Linear layers?
 # adamw optimizer
 learning_rate = 6e-4 # max learning rate
 max_iters = 600000 # total number of training iterations
-weight_decay = 1e-2
+weight_decay = 1e-1
 beta1 = 0.9
 beta2 = 0.95
 grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
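As a rough sketch of how these settings are typically consumed (not the repository's exact code: nanoGPT builds its optimizer in configure_optimizers and splits parameters into decay/no-decay groups; the model, X, and loss below are placeholders):

import torch

learning_rate = 6e-4
weight_decay = 1e-1   # the new default from this commit
beta1, beta2 = 0.9, 0.95
grad_clip = 1.0

model = torch.nn.Linear(768, 768)  # stand-in for the GPT model
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    betas=(beta1, beta2),
    weight_decay=weight_decay,
)

X = torch.randn(8, 768)
loss = model(X).pow(2).mean()      # dummy loss for illustration
loss.backward()
if grad_clip != 0.0:               # grad_clip == 0.0 disables clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
optimizer.step()
optimizer.zero_grad(set_to_none=True)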