mirror of
https://github.com/osmarks/nanogpt-experiments.git
synced 2025-09-01 18:37:58 +00:00
Fix for slow training with gradient_accumulation_steps
This commit is contained in:
@@ -10,7 +10,7 @@ wandb_run_name='gpt2-124M'
|
||||
# 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520
|
||||
batch_size = 12
|
||||
block_size = 1024
|
||||
gradient_accumulation_steps = 5
|
||||
gradient_accumulation_steps = 5 * 8
|
||||
|
||||
# this makes total number of tokens be 300B
|
||||
max_iters = 600000
|
||||
|
@@ -14,6 +14,7 @@ wandb_project = 'shakespeare-char'
|
||||
wandb_run_name = 'mini-gpt'
|
||||
|
||||
dataset = 'shakespeare_char'
|
||||
gradient_accumulation_steps = 1
|
||||
batch_size = 64
|
||||
block_size = 256 # context of up to 256 previous characters
|
||||
|
||||
|
Reference in New Issue
Block a user