diff --git a/train.py b/train.py
index 778e1a3..e849d88 100644
--- a/train.py
+++ b/train.py
@@ -27,11 +27,11 @@ from model import GPTConfig, GPT
 # default config values
 # I/O
 out_dir = 'out'
-eval_interval = 500
+eval_interval = 2000
 log_interval = 1
-eval_iters = 50
+eval_iters = 200
 eval_only = False # if True, script exits right after the first eval
-always_save_checkpoint = False # if True, always save a checkpoint after each eval
+always_save_checkpoint = True # if True, always save a checkpoint after each eval
 # wandb logging
 wandb_log = False # disabled by default
 wandb_entity = 'karpathy'
@@ -39,25 +39,25 @@ wandb_project = 'owt'
 wandb_run_name = 'gpt2' # 'run' + str(time.time())
 # data
 dataset = 'openwebtext'
-batch_size = 8
+batch_size = 12
 block_size = 1024
 # model
 device = 'cuda:0'
 init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
-dropout = 0.1
+dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
 n_layer = 12
 n_head = 12
 n_embd = 768
 # adamw optimizer
-learning_rate = 2.5e-4 # max learning rate
-max_iters = 500000 # total number of training iterations
+learning_rate = 6e-4 # max learning rate
+max_iters = 400000 # total number of training iterations
 weight_decay = 1e-2
 betas = (0.9, 0.95)
 # learning rate decay settings
 decay_lr = True # whether to decay the learning rate
 warmup_iters = 2000 # how many steps to warm up for
-lr_decay_iters = 320000 # how many steps to decay the learning rate for
-min_lr = 1e-5 # minimum learning rate
+lr_decay_iters = 400000 # should be ~= max_iters per Chinchilla
+min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
 # DDP settings
 backend = 'nccl' # 'nccl', 'gloo', etc.
 compile = True # use PyTorch 2.0 to compile the model to be faster
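
The learning-rate changes above (learning_rate, warmup_iters, lr_decay_iters, min_lr) only make sense together: they feed a linear-warmup-then-cosine-decay schedule, with lr_decay_iters set roughly equal to max_iters and min_lr roughly learning_rate/10. Below is a minimal sketch of such a schedule, assuming the usual warmup + cosine form these knobs imply; the helper name get_lr and the standalone constants are illustrative here, not a verbatim copy of train.py.

import math

# illustrative values taken from the diff above
learning_rate = 6e-4      # max learning rate
min_lr = 6e-5             # ~= learning_rate / 10
warmup_iters = 2000       # linear warmup steps
lr_decay_iters = 400000   # ~= max_iters

def get_lr(it):
    # 1) linear warmup for the first warmup_iters steps
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # 2) after lr_decay_iters, hold at the minimum learning rate
    if it > lr_decay_iters:
        return min_lr
    # 3) in between, cosine-decay from learning_rate down to min_lr
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # goes 1 -> 0
    return min_lr + coeff * (learning_rate - min_lr)

With these settings the schedule peaks at 6e-4 after 2000 steps and decays to 6e-5 by iteration 400000, which is why lr_decay_iters was bumped to match the new max_iters.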