Mirror of https://github.com/osmarks/nanogpt-experiments.git (synced 2024-12-18 14:10:28 +00:00)
based on my experiments these biases are indeed not needed: code runs faster, identical results. keeping the option only because removing the biases deviates from the gpt-2 setup
This commit is contained in:
parent 001c1e7be7
commit 0e90ee9d48

Changed files:
train.py — 2 changed lines (1 addition, 1 deletion)
train.py
@@ -53,7 +53,7 @@ n_layer = 12
 n_head = 12
 n_embd = 768
 dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
-bias = True # do we use bias inside LayerNorm and Linear layers?
+bias = False # do we use bias inside LayerNorm and Linear layers?
 # adamw optimizer
 learning_rate = 6e-4 # max learning rate
 max_iters = 600000 # total number of training iterations
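For readers wondering what the flag controls, here is a minimal sketch of how a bias option like this is commonly threaded into the model's modules. It is an illustrative assumption, not code taken from this repository's model.py; the LayerNorm wrapper is shown because older versions of torch.nn.LayerNorm do not expose a bias switch of their own, so a small wrapper is one common workaround.

import torch
import torch.nn as nn
import torch.nn.functional as F

class LayerNorm(nn.Module):
    """LayerNorm with an optional additive bias term."""
    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        # when bias=False, no bias parameter is created at all
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, x):
        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)

# Linear layers accept the flag directly; with bias=False (this commit's new
# default) every Linear and LayerNorm in the block drops its bias vector.
n_embd, bias = 768, False
ln = LayerNorm(n_embd, bias=bias)
fc = nn.Linear(n_embd, 4 * n_embd, bias=bias)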