mirror of
https://github.com/osmarks/nanogpt-experiments.git
synced 2024-12-18 14:10:28 +00:00
tune the hyperparams a bit, in configs
This commit is contained in:
parent
ab0718a7dd
commit
fce706cbe6
@@ -1,22 +1,25 @@
|
|||||||
import time
|
import time
|
||||||
|
|
||||||
out_dir = 'out-shakespeare'
|
out_dir = 'out-shakespeare'
|
||||||
eval_interval = 200
|
eval_interval = 5
|
||||||
|
eval_iters = 40
|
||||||
wandb_log = False # feel free to turn on
|
wandb_log = False # feel free to turn on
|
||||||
wandb_project = 'shakespeare'
|
wandb_project = 'shakespeare'
|
||||||
wandb_run_name = 'ft-' + str(time.time())
|
wandb_run_name = 'ft-' + str(time.time())
|
||||||
compile = False # takes too little time to finetune, not worth it
|
|
||||||
|
|
||||||
# save a nice and overfit checkpoint that
|
|
||||||
# will only speak Shakespeare and forgets
|
|
||||||
# everything else about the world #dark
|
|
||||||
always_save_checkpoint = True
|
|
||||||
|
|
||||||
dataset = 'shakespeare'
|
dataset = 'shakespeare'
|
||||||
init_from = 'gpt2-xl'
|
init_from = 'gpt2-xl' # this is the largest GPT-2 model
|
||||||
batch_size = 1
|
|
||||||
block_size = 512
|
|
||||||
|
|
||||||
learning_rate = 1e-5
|
# only save checkpoints if the validation loss improves
|
||||||
max_iters = 1000
|
always_save_checkpoint = False
|
||||||
|
|
||||||
|
# the number of examples per iter:
|
||||||
|
# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
|
||||||
|
# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
|
||||||
|
batch_size = 1
|
||||||
|
gradient_accumulation_steps = 32
|
||||||
|
max_iters = 20
|
||||||
|
|
||||||
|
# finetune at constant LR
|
||||||
|
learning_rate = 3e-5
|
||||||
decay_lr = False
|
decay_lr = False
|
||||||
|
@@ -15,7 +15,7 @@ wandb_run_name = 'mini-gpt'
|
|||||||
|
|
||||||
dataset = 'shakespeare_char'
|
dataset = 'shakespeare_char'
|
||||||
batch_size = 64
|
batch_size = 64
|
||||||
block_size = 256 # context of up to 128 previous characters
|
block_size = 256 # context of up to 256 previous characters
|
||||||
|
|
||||||
# baby GPT model :)
|
# baby GPT model :)
|
||||||
n_layer = 6
|
n_layer = 6
|
||||||
|
Loading…
Reference in New Issue
Block a user