From fce706cbe6f7e01cc7fbfac30eaa0ad2c30d56de Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sun, 5 Feb 2023 19:31:18 +0000
Subject: [PATCH] tune the hyperparams a bit, in configs

---
 config/finetune_shakespeare.py   | 27 +++++++++++++++------------
 config/train_shakespeare_char.py |  2 +-
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/config/finetune_shakespeare.py b/config/finetune_shakespeare.py
index eb6545e..148a4c4 100644
--- a/config/finetune_shakespeare.py
+++ b/config/finetune_shakespeare.py
@@ -1,22 +1,25 @@
 import time
 
 out_dir = 'out-shakespeare'
-eval_interval = 200
+eval_interval = 5
+eval_iters = 40
 wandb_log = False # feel free to turn on
 wandb_project = 'shakespeare'
 wandb_run_name = 'ft-' + str(time.time())
-compile = False # takes too little time to finetune, not worth it
-
-# save a nice and overfit checkpoint that
-# will only speak Shakespeare and forgets
-# everything else about the world #dark
-always_save_checkpoint = True
 
 dataset = 'shakespeare'
-init_from = 'gpt2-xl'
-batch_size = 1
-block_size = 512
+init_from = 'gpt2-xl' # this is the largest GPT-2 model
 
-learning_rate = 1e-5
-max_iters = 1000
+# only save checkpoints if the validation loss improves
+always_save_checkpoint = False
+
+# the number of examples per iter:
+# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
+# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
+batch_size = 1
+gradient_accumulation_steps = 32
+max_iters = 20
+
+# finetune at constant LR
+learning_rate = 3e-5
 decay_lr = False
diff --git a/config/train_shakespeare_char.py b/config/train_shakespeare_char.py
index c50e4dd..cb0d333 100644
--- a/config/train_shakespeare_char.py
+++ b/config/train_shakespeare_char.py
@@ -15,7 +15,7 @@ wandb_run_name = 'mini-gpt'
 
 dataset = 'shakespeare_char'
 batch_size = 64
-block_size = 256 # context of up to 128 previous characters
+block_size = 256 # context of up to 256 previous characters
 
 # baby GPT model :)
 n_layer = 6
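
For reference, a quick sanity check (plain Python, not part of the patch) of the tokens-per-iteration arithmetic quoted in the new finetune_shakespeare.py comments; the 1024-token block size is assumed from GPT-2's context length, which this config does not set explicitly:

    # mirror the values from config/finetune_shakespeare.py above
    batch_size = 1
    gradient_accumulation_steps = 32
    block_size = 1024           # assumed GPT-2 context length
    dataset_tokens = 301_966    # tokenized Shakespeare size, per the comment

    tokens_per_iter = batch_size * gradient_accumulation_steps * block_size
    print(tokens_per_iter)                   # 32768 tokens/iter
    print(dataset_tokens / tokens_per_iter)  # ~9.2 iters/epoch, so max_iters = 20 is ~2 epochs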