import time

out_dir = 'out-shakespeare'
eval_interval = 5
eval_iters = 40
wandb_log = False # feel free to turn on
wandb_project = 'shakespeare'
wandb_run_name = 'ft-' + str(time.time())

dataset = 'shakespeare'
init_from = 'gpt2-xl' # this is the largest GPT-2 model
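# note: the smaller checkpoints ('gpt2', 'gpt2-medium', 'gpt2-large') should also work here if GPU memory is tight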

# only save checkpoints if the validation loss improves
always_save_checkpoint = False

# the number of examples per iter:
# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
batch_size = 1
gradient_accumulation_steps = 32
max_iters = 20
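
# rough sanity check of the numbers above (assumes train.py's default block_size of 1024):
#   tokens per iteration = 1 (batch_size) * 32 (grad accum steps) * 1024 (block_size) = 32,768
#   20 iterations * 32,768 tokens ≈ 655K tokens, i.e. a bit over 2 passes over the 301,966-token dataset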

# finetune at constant LR
learning_rate = 3e-5
decay_lr = False
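
# usage sketch (assumes this file is saved as config/finetune_shakespeare.py inside a
# nanoGPT checkout, with the Shakespeare dataset prepared beforehand):
#   python data/shakespeare/prepare.py
#   python train.py config/finetune_shakespeare.py
#   python sample.py --out_dir=out-shakespeare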