1
0
mirror of https://github.com/osmarks/nanogpt-experiments.git synced 2024-11-13 05:19:58 +00:00

Better hyperparameters for the GPT-2 124M model on an A100 40GB. Still uncertain about max_iters especially, and a bit about weight decay and betas.

This commit is contained in:
Andrej Karpathy 2023-01-03 17:45:49 +00:00
parent b45eec3e4b
commit 9f95aca93e

View File

@@ -27,11 +27,11 @@ from model import GPTConfig, GPT
# default config values # default config values
# I/O # I/O
out_dir = 'out' out_dir = 'out'
eval_interval = 500 eval_interval = 2000
log_interval = 1 log_interval = 1
eval_iters = 50 eval_iters = 200
eval_only = False # if True, script exits right after the first eval eval_only = False # if True, script exits right after the first eval
always_save_checkpoint = False # if True, always save a checkpoint after each eval always_save_checkpoint = True # if True, always save a checkpoint after each eval
# wandb logging # wandb logging
wandb_log = False # disabled by default wandb_log = False # disabled by default
wandb_entity = 'karpathy' wandb_entity = 'karpathy'
@@ -39,25 +39,25 @@ wandb_project = 'owt'
wandb_run_name = 'gpt2' # 'run' + str(time.time()) wandb_run_name = 'gpt2' # 'run' + str(time.time())
# data # data
dataset = 'openwebtext' dataset = 'openwebtext'
batch_size = 8 batch_size = 12
block_size = 1024 block_size = 1024
# model # model
device = 'cuda:0' device = 'cuda:0'
init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*' init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
dropout = 0.1 dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
n_layer = 12 n_layer = 12
n_head = 12 n_head = 12
n_embd = 768 n_embd = 768
# adamw optimizer # adamw optimizer
learning_rate = 2.5e-4 # max learning rate learning_rate = 6e-4 # max learning rate
max_iters = 500000 # total number of training iterations max_iters = 400000 # total number of training iterations
weight_decay = 1e-2 weight_decay = 1e-2
betas = (0.9, 0.95) betas = (0.9, 0.95)
# learning rate decay settings # learning rate decay settings
decay_lr = True # whether to decay the learning rate decay_lr = True # whether to decay the learning rate
warmup_iters = 2000 # how many steps to warm up for warmup_iters = 2000 # how many steps to warm up for
lr_decay_iters = 320000 # how many steps to decay the learning rate for lr_decay_iters = 400000 # should be ~= max_iters per Chinchilla
min_lr = 1e-5 # minimum learning rate min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
# DDP settings # DDP settings
backend = 'nccl' # 'nccl', 'gloo', etc. backend = 'nccl' # 'nccl', 'gloo', etc.
compile = True # use PyTorch 2.0 to compile the model to be faster compile = True # use PyTorch 2.0 to compile the model to be faster