small tweaks: make the default weight decay (WD) 0.1, as is often cited, and remove the spurious init of LayerNorm, which is already initialized to weight=1, bias=0
Commit 8b1e43209e (parent ab21d6c15d)
model.py | 4 ----
@@ -173,10 +173,6 @@ class GPT(nn.Module):
                 torch.nn.init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-        elif isinstance(module, (LayerNorm, nn.LayerNorm)):
-            torch.nn.init.ones_(module.weight)
-            if module.bias is not None:
-                torch.nn.init.zeros_(module.bias)
 
     def forward(self, idx, targets=None):
         device = idx.device
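For context, the removed branch was a no-op: PyTorch's nn.LayerNorm already leaves its constructor with weight set to ones and bias set to zeros (with the default elementwise_affine=True), and the repo's own LayerNorm wrapper builds its parameters the same way. A minimal standalone check, not taken from the repo:

import torch
import torch.nn as nn

ln = nn.LayerNorm(768)  # default-constructed LayerNorm, as used throughout model.py
print(torch.all(ln.weight == 1.0).item())  # True: weight already starts at ones
print(torch.all(ln.bias == 0.0).item())    # True: bias already starts at zeros

So re-initializing these modules in _init_weights only duplicated what the constructors had already done.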
train.py | 2 +-
@@ -57,7 +57,7 @@ bias = False # do we use bias inside LayerNorm and Linear layers?
 # adamw optimizer
 learning_rate = 6e-4 # max learning rate
 max_iters = 600000 # total number of training iterations
-weight_decay = 1e-2
+weight_decay = 1e-1
 beta1 = 0.9
 beta2 = 0.95
 grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
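The train.py change only moves the weight_decay default from 0.01 to 0.1; the value reaches the optimizer when train.py builds it (in nanoGPT this goes through the model's configure_optimizers, which also splits parameters into decay/no-decay groups). A rough, self-contained sketch of what these defaults amount to, using a hypothetical stand-in model rather than the repo's GPT:

import torch
import torch.nn as nn

# config defaults from train.py after this commit
learning_rate = 6e-4
weight_decay = 1e-1          # was 1e-2; 0.1 matches the commonly cited GPT-style setting
beta1, beta2 = 0.9, 0.95
grad_clip = 1.0

model = nn.Linear(768, 768)  # stand-in for the GPT model, illustration only

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate,
                              betas=(beta1, beta2), weight_decay=weight_decay)

# inside the training loop, after loss.backward():
if grad_clip != 0.0:
    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)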