Based on my experiments, these biases are indeed not needed: the code runs faster and produces identical results. Keeping the option only because disabling the biases deviates from the GPT-2 setup.

2025-09-04 20:07:58 +00:00 · 2023-01-30 08:07:58 +00:00
parent 001c1e7be7
commit 0e90ee9d48
1 changed files with 1 additions and 1 deletions
--- a/train.py
+++ b/train.py
@@ -53,7 +53,7 @@ n_layer = 12
 n_head = 12
 n_embd = 768
 dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
-bias = True # do we use bias inside LayerNorm and Linear layers?
+bias = False # do we use bias inside LayerNorm and Linear layers?
 # adamw optimizer
 learning_rate = 6e-4 # max learning rate
 max_iters = 600000 # total number of training iterations