mirror of https://github.com/osmarks/nanogpt-experiments.git synced 2024-12-18 14:10:28 +00:00

Fix for gradient_accumulation_steps training slow

Otavio Good 2023-03-25 00:04:45 -07:00
parent a82b33b525
commit 978d4fe538
3 changed files with 5 additions and 3 deletions
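In rough terms (an interpretation of the diff, not wording from the commit itself): previously train.py multiplied gradient_accumulation_steps by 8 on every non-DDP run to simulate an 8-GPU batch, so small single-GPU configs such as shakespeare_char, which fell through to train.py's default of 5, ended up doing 40 micro-steps per iteration. This commit moves the factor of 8 into the GPT-2 configs, lets the small config opt out with an explicit 1, and divides the configured total by the GPU count under DDP, so the effective batch size for the GPT-2 runs is unchanged. A hypothetical sketch of the micro-step counts (illustrative names and arithmetic only):

    gpus = 8

    # old behaviour: config said 5, train.py multiplied non-DDP runs by 8
    old_gpt2_single_gpu  = 5 * 8   # 40 micro-steps, as intended for GPT-2 reproduction
    old_shakespeare_char = 5 * 8   # also 40 micro-steps, the unwanted slowdown

    # new behaviour: the config carries the total, DDP divides it per rank
    new_gpt2_total       = 5 * 8                   # 40, set directly in config/train_gpt2.py
    new_gpt2_per_rank    = new_gpt2_total // gpus  # 5 micro-steps per GPU under DDP
    new_shakespeare_char = 1                       # set explicitly in its own config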

config/train_gpt2.py

@@ -10,7 +10,7 @@ wandb_run_name='gpt2-124M'
 # 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520
 batch_size = 12
 block_size = 1024
-gradient_accumulation_steps = 5
+gradient_accumulation_steps = 5 * 8
 # this makes total number of tokens be 300B
 max_iters = 600000
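With the multiplier folded into the config, gradient_accumulation_steps is now the total number of micro-steps across all ranks, so the 491,520-token batch in the comment still works out. A quick check of that arithmetic (an illustrative snippet, not part of the repo):

    batch_size = 12                       # micro-batch per forward/backward pass
    block_size = 1024                     # tokens per sequence
    gradient_accumulation_steps = 5 * 8   # total across all ranks (40)

    tokens_per_iter = gradient_accumulation_steps * batch_size * block_size
    print(tokens_per_iter)                # 491520, matching the comment above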

config/train_shakespeare_char.py

@@ -14,6 +14,7 @@ wandb_project = 'shakespeare-char'
 wandb_run_name = 'mini-gpt'
 dataset = 'shakespeare_char'
+gradient_accumulation_steps = 1
 batch_size = 64
 block_size = 256 # context of up to 256 previous characters
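Pinning gradient_accumulation_steps = 1 keeps the character-level run at one micro-step per iteration instead of inheriting train.py's default (5, previously multiplied by 8 for non-DDP runs). Rough per-iteration token counts, assuming the config previously fell through to that default:

    batch_size = 64
    block_size = 256

    with_fix   = 1 * batch_size * block_size        # 16,384 tokens per iteration
    before_fix = (5 * 8) * batch_size * block_size  # 655,360 tokens, 40x the work per iteration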

train.py

@@ -45,7 +45,7 @@ wandb_project = 'owt'
 wandb_run_name = 'gpt2' # 'run' + str(time.time())
 # data
 dataset = 'openwebtext'
-gradient_accumulation_steps = 5 # used to simulate larger batch sizes
+gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
 batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
 block_size = 1024
 # model
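The comment "used to simulate larger batch sizes" refers to the usual gradient-accumulation pattern: run several micro-batches, scale each loss down by the number of accumulation steps, and call the optimizer once. A simplified sketch of that pattern (not the repo's exact training loop, which also handles AMP and DDP gradient syncing):

    def train_iter(model, optimizer, get_batch, gradient_accumulation_steps):
        # One optimizer step built from several accumulated micro-steps.
        optimizer.zero_grad(set_to_none=True)
        for _ in range(gradient_accumulation_steps):
            x, y = get_batch()                     # one micro-batch of (input, target) token tensors
            _, loss = model(x, y)                  # nanoGPT-style models return (logits, loss)
            (loss / gradient_accumulation_steps).backward()  # average gradients across micro-steps
        optimizer.step()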
@@ -88,11 +88,12 @@ if ddp:
     torch.cuda.set_device(device)
     master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
     seed_offset = ddp_rank # each process gets a different seed
+    assert gradient_accumulation_steps % torch.cuda.device_count() == 0
+    gradient_accumulation_steps //= torch.cuda.device_count()
 else:
     # if not ddp, we are running on a single gpu, and one process
     master_process = True
     seed_offset = 0
-    gradient_accumulation_steps *= 8 # simulate 8 gpus
 if master_process:
     os.makedirs(out_dir, exist_ok=True)
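Under DDP each rank now takes an equal share of the configured total (the commit splits by torch.cuda.device_count()), and the assert rejects totals that do not divide evenly. The tokens processed per optimizer step are unchanged, because every rank runs its share of micro-batches in parallel. An illustrative check of that bookkeeping, using the GPT-2 config values:

    world_size = 8                        # number of GPUs, stands in for torch.cuda.device_count()
    gradient_accumulation_steps = 5 * 8   # value from config/train_gpt2.py
    batch_size, block_size = 12, 1024

    assert gradient_accumulation_steps % world_size == 0
    per_rank_steps = gradient_accumulation_steps // world_size  # 5 micro-steps per GPU

    tokens_per_iter = world_size * per_rank_steps * batch_size * block_size
    print(per_rank_steps, tokens_per_iter)  # 5 491520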