mirror of
https://github.com/osmarks/nanogpt-experiments.git
synced 2024-12-18 06:00:29 +00:00
Better hyperparameters for the GPT-2 124M model on an A100 40GB. Still uncertain about max_iters especially, and somewhat about weight decay and betas.
This commit is contained in:
parent
b45eec3e4b
commit
9f95aca93e
18
train.py
18
train.py
@@ -27,11 +27,11 @@ from model import GPTConfig, GPT
|
||||
# default config values
|
||||
# I/O
|
||||
out_dir = 'out'
|
||||
eval_interval = 500
|
||||
eval_interval = 2000
|
||||
log_interval = 1
|
||||
eval_iters = 50
|
||||
eval_iters = 200
|
||||
eval_only = False # if True, script exits right after the first eval
|
||||
always_save_checkpoint = False # if True, always save a checkpoint after each eval
|
||||
always_save_checkpoint = True # if True, always save a checkpoint after each eval
|
||||
# wandb logging
|
||||
wandb_log = False # disabled by default
|
||||
wandb_entity = 'karpathy'
|
||||
@@ -39,25 +39,25 @@ wandb_project = 'owt'
|
||||
wandb_run_name = 'gpt2' # 'run' + str(time.time())
|
||||
# data
|
||||
dataset = 'openwebtext'
|
||||
batch_size = 8
|
||||
batch_size = 12
|
||||
block_size = 1024
|
||||
# model
|
||||
device = 'cuda:0'
|
||||
init_from = 'scratch' # 'scratch' or 'resume' or 'gpt2*'
|
||||
dropout = 0.1
|
||||
dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
|
||||
n_layer = 12
|
||||
n_head = 12
|
||||
n_embd = 768
|
||||
# adamw optimizer
|
||||
learning_rate = 2.5e-4 # max learning rate
|
||||
max_iters = 500000 # total number of training iterations
|
||||
learning_rate = 6e-4 # max learning rate
|
||||
max_iters = 400000 # total number of training iterations
|
||||
weight_decay = 1e-2
|
||||
betas = (0.9, 0.95)
|
||||
# learning rate decay settings
|
||||
decay_lr = True # whether to decay the learning rate
|
||||
warmup_iters = 2000 # how many steps to warm up for
|
||||
lr_decay_iters = 320000 # how many steps to decay the learning rate for
|
||||
min_lr = 1e-5 # minimum learning rate
|
||||
lr_decay_iters = 400000 # should be ~= max_iters per Chinchilla
|
||||
min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
|
||||
# DDP settings
|
||||
backend = 'nccl' # 'nccl', 'gloo', etc.
|
||||
compile = True # use PyTorch 2.0 to compile the model to be faster
|
||||
|
Loading…
Reference in New Issue
Block a user