Mirror of https://github.com/osmarks/nanogpt-experiments.git, synced 2024-12-18 14:10:28 +00:00
small tweaks: make the default weight decay 0.1, as is often cited, and remove the spurious init of LayerNorm, which is already initialized to weight=1, bias=0
Commit: 8b1e43209e
Parent: ab21d6c15d
model.py (4 lines changed)
@@ -173,10 +173,6 @@ class GPT(nn.Module):
                 torch.nn.init.zeros_(module.bias)
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-        elif isinstance(module, (LayerNorm, nn.LayerNorm)):
-            torch.nn.init.ones_(module.weight)
-            if module.bias is not None:
-                torch.nn.init.zeros_(module.bias)

     def forward(self, idx, targets=None):
         device = idx.device
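Why the removed branch was safe to drop: PyTorch's nn.LayerNorm already initializes its affine parameters to weight=1 and bias=0 (and the custom LayerNorm wrapper in model.py appears to construct its parameters from torch.ones/torch.zeros, so the same should hold there). A minimal standalone check, not part of the commit:

import torch
import torch.nn as nn

# nn.LayerNorm's affine parameters default to weight=1 and bias=0,
# which is exactly what the deleted init branch was setting again.
ln = nn.LayerNorm(8)
assert torch.equal(ln.weight, torch.ones(8))
assert torch.equal(ln.bias, torch.zeros(8))
print("LayerNorm defaults:", ln.weight.unique().item(), ln.bias.unique().item())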
train.py (2 lines changed)
@@ -57,7 +57,7 @@ bias = False # do we use bias inside LayerNorm and Linear layers?
 # adamw optimizer
 learning_rate = 6e-4 # max learning rate
 max_iters = 600000 # total number of training iterations
-weight_decay = 1e-2
+weight_decay = 1e-1
 beta1 = 0.9
 beta2 = 0.95
 grad_clip = 1.0 # clip gradients at this value, or disable if == 0.0
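For context, a simplified sketch of how these config defaults end up in the optimizer. The real train.py delegates to the model's own optimizer setup, which additionally separates parameters that should and should not be weight-decayed; the `model` below is just a stand-in module for illustration:

import torch

# Values from the config block above; 1e-1 is the new default weight decay.
learning_rate = 6e-4
weight_decay = 1e-1
beta1, beta2 = 0.9, 0.95

model = torch.nn.Linear(10, 10)  # placeholder module for illustration
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    betas=(beta1, beta2),
    weight_decay=weight_decay,
)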