Mirror of https://github.com/osmarks/nanogpt-experiments.git (synced 2025-10-27 05:17:41 +00:00)
Plumbing of the bias flag all around; measured bias=False to be about 6% faster.
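The diff below touches only train.py; the model-side half of the plumbing (threading bias into every Linear and LayerNorm in model.py) is implied but not shown here. As a minimal sketch of what that usually looks like in nanoGPT-style code — assuming the hand-rolled LayerNorm wrapper pattern, since nn.LayerNorm at the time exposed no bias=False switch:

import torch
import torch.nn as nn
from torch.nn import functional as F

class LayerNorm(nn.Module):
    # LayerNorm with an optional bias: when bias=False the additive
    # parameter is simply omitted rather than zeroed out.
    def __init__(self, ndim, bias):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(ndim))
        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None

    def forward(self, x):
        # F.layer_norm accepts bias=None, so no branching is needed here
        return F.layer_norm(x, self.weight.shape, self.weight, self.bias, 1e-5)

Plain nn.Linear already accepts bias=bias directly, so only LayerNorm needs a wrapper like this.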
train.py (5 changed lines: 4 additions, 1 deletion)
@@ -53,6 +53,7 @@ n_layer = 12
 n_head = 12
 n_embd = 768
 dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
+bias = False # do we use bias inside LayerNorm and Linear layers?
 # adamw optimizer
 learning_rate = 6e-4 # max learning rate
 max_iters = 600000 # total number of training iterations
@@ -129,7 +130,8 @@ else:
     vocab_size = 50257

 # model init
-model_args = dict(n_layer = n_layer, n_head = n_head, n_embd = n_embd, block_size = block_size, dropout = dropout, vocab_size = vocab_size)
+model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
+                  dropout=dropout, vocab_size=vocab_size, bias=bias)
 if init_from == 'scratch':
     # init a new model from scratch
     print("Initializing a new model from scratch")
@@ -158,6 +160,7 @@ elif init_from == 'resume':
     best_val_loss = checkpoint['best_val_loss']
 elif init_from.startswith('gpt2'):
     print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
+    assert bias, "GPT-2 models have bias, so we can't use bias=False"
     # initialize from OpenAI GPT-2 weights
     override_args = dict(dropout=dropout)
     model = GPT.from_pretrained(init_from, override_args)
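Downstream of the second hunk, train.py hands the assembled model_args to the model constructor; a short sketch of that hand-off, assuming the GPTConfig/GPT names from nanoGPT's model.py:

from model import GPTConfig, GPT  # nanoGPT's model.py (assumed import path)

# the bias flag assembled above now reaches every Linear/LayerNorm
gptconf = GPTConfig(**model_args)
model = GPT(gptconf)

The assert in the third hunk exists because the released GPT-2 checkpoints ship bias tensors, so loading them into a bias-free model would leave those weights with nowhere to go.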