mirror of https://github.com/osmarks/nanogpt-experiments.git synced 2024-11-10 20:09:58 +00:00

tune the hyperparams a bit, in configs

Andrej Karpathy 2023-02-05 19:31:18 +00:00
parent ab0718a7dd
commit fce706cbe6
2 changed files with 16 additions and 13 deletions

View File

@@ -1,22 +1,25 @@
 import time
 
 out_dir = 'out-shakespeare'
-eval_interval = 200
+eval_interval = 5
 eval_iters = 40
 wandb_log = False # feel free to turn on
 wandb_project = 'shakespeare'
 wandb_run_name = 'ft-' + str(time.time())
 
-compile = False # takes too little time to finetune, not worth it
-
-# save a nice and overfit checkpoint that will only speak Shakespeare and forgets
-# everything else about the world #dark
-always_save_checkpoint = True
-
 dataset = 'shakespeare'
-init_from = 'gpt2-xl'
+init_from = 'gpt2-xl' # this is the largest GPT-2 model
 
-batch_size = 1
-block_size = 512
-learning_rate = 1e-5
-max_iters = 1000
+# only save checkpoints if the validation loss improves
+always_save_checkpoint = False
+
+# the number of examples per iter:
+# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
+# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
+batch_size = 1
+gradient_accumulation_steps = 32
+max_iters = 20
+
+# finetune at constant LR
+learning_rate = 3e-5
 decay_lr = False

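The tokens-per-iteration comment added to the finetuning config is worth spelling out. A minimal sketch of that arithmetic, assuming block_size falls back to the GPT-2 default of 1024 now that the config no longer sets it:

batch_size = 1
gradient_accumulation_steps = 32
block_size = 1024  # assumed GPT-2 default context length, implied by the "1024 tokens" in the comment
tokens_per_iter = batch_size * gradient_accumulation_steps * block_size  # 32,768 tokens/iter
shakespeare_tokens = 301_966  # token count of the shakespeare dataset, per the comment
iters_per_epoch = shakespeare_tokens / tokens_per_iter  # ~9.2

So max_iters = 20 is roughly two passes over the data at a constant learning rate of 3e-5, which is enough to overfit gpt2-xl to Shakespeare.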
View File

@@ -15,7 +15,7 @@ wandb_run_name = 'mini-gpt'
 dataset = 'shakespeare_char'
 batch_size = 64
-block_size = 256 # context of up to 128 previous characters
+block_size = 256 # context of up to 256 previous characters
 # baby GPT model :)
 n_layer = 6