# nanogpt-experiments/config/train_gpt2.py

# GPT-2 (124M) training configuration, targeting a loss of ~2.85 on a single
# node with 8x A100 40GB GPUs. Expect the run to take roughly 5 days.
# Launch it (for example inside a screen session) with:
# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py

# --- wandb logging ---
wandb_log = True
wandb_project = "owt"
wandb_run_name = "gpt2-124M"

# --- batch geometry ---
# Together these give a total batch of ~0.5M tokens:
# 12 (batch size) * 1024 (block size) * 5 (grad accum) * 8 (GPUs) = 491,520
batch_size = 12
block_size = 1024
gradient_accumulation_steps = 5

# --- run length ---
# 600k iterations at ~0.5M tokens each comes to ~300B tokens overall;
# lr_decay_iters is set to the same value as max_iters.
max_iters = 600_000
lr_decay_iters = 600_000

# --- evaluation and logging intervals ---
eval_interval = 1_000
eval_iters = 200
log_interval = 10

# --- weight decay ---
weight_decay = 0.1
add gpt2 training config 2023-02-03 22:14:37 +00:00			`# config for training GPT-2 (124M) down to very nice loss of ~2.85 on 1 node of 8X A100 40GB`
include launch command too. anyone should be able to do this now 2023-02-03 22:17:05 +00:00			`# launch as the following (e.g. in a screen session) and wait ~5 days:`
			`# $ torchrun --standalone --nproc_per_node=8 train.py config/train_gpt2.py`
add gpt2 training config 2023-02-03 22:14:37 +00:00
			`wandb_log = True`
			`wandb_project = 'owt'`
			`wandb_run_name='gpt2-124M'`

			`# these make the total batch size be ~0.5M`
			`# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520`
			`batch_size = 12`
			`block_size = 1024`
			`gradient_accumulation_steps = 5`

			`# this makes total number of tokens be 300B`
			`max_iters = 600000`
			`lr_decay_iters = 600000`

			`# eval stuff`
			`eval_interval = 1000`
			`eval_iters = 200`
			`log_interval = 10`

			`# weight decay`
			`weight_decay = 1e-1`