mirror of
https://github.com/osmarks/nanogpt-experiments.git
synced 2024-12-18 14:10:28 +00:00
Based on my experiments, these biases are indeed not needed: the code runs faster and produces identical results. Keeping the option only because running without biases deviates from the GPT-2 setup.
This commit is contained in:
parent
001c1e7be7
commit
0e90ee9d48
2
train.py
2
train.py
@@ -53,7 +53,7 @@ n_layer = 12
|
||||
n_head = 12
|
||||
n_embd = 768
|
||||
dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
|
||||
bias = True # do we use bias inside LayerNorm and Linear layers?
|
||||
bias = False # do we use bias inside LayerNorm and Linear layers?
|
||||
# adamw optimizer
|
||||
learning_rate = 6e-4 # max learning rate
|
||||
max_iters = 600000 # total number of training iterations
|
||||
|
Loading…
Reference in New Issue
Block a user