Mirror of https://github.com/osmarks/nanogpt-experiments.git (synced 2024-11-10 20:09:58 +00:00)
Merge pull request #225 from otaviogood/grad_accum
Fix for gradient_accumulation_steps training slow

Commit: 21f9bff7e4
config/train_gpt2.py
@@ -10,7 +10,7 @@ wandb_run_name='gpt2-124M'
 # 12 batch size * 1024 block size * 5 gradaccum * 8 GPUs = 491,520
 batch_size = 12
 block_size = 1024
-gradient_accumulation_steps = 5
+gradient_accumulation_steps = 5 * 8

 # this makes total number of tokens be 300B
 max_iters = 600000
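Under the new convention, the config value is the total number of gradient-accumulation micro-steps across all GPUs, so the arithmetic in the comment can be checked directly. A minimal sketch of that calculation (plain Python, variable names are illustrative):

```python
# Tokens per optimizer step for config/train_gpt2.py, assuming an 8-GPU run
# where each rank performs 5 micro-steps (5 * 8 = 40 in total).
batch_size = 12           # micro-batch size per GPU
block_size = 1024         # sequence length in tokens
grad_accum_total = 5 * 8  # total micro-steps across all GPUs (new config value)

tokens_per_iter = batch_size * block_size * grad_accum_total
assert tokens_per_iter == 491_520  # matches the "491,520" in the comment above
print(f"{tokens_per_iter:,} tokens per iteration")
```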
config/train_shakespeare_char.py
@@ -14,6 +14,7 @@ wandb_project = 'shakespeare-char'
 wandb_run_name = 'mini-gpt'

 dataset = 'shakespeare_char'
+gradient_accumulation_steps = 1
 batch_size = 64
 block_size = 256 # context of up to 256 previous characters

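The explicit gradient_accumulation_steps = 1 matters because train.py's default is now 5 * 8 (see below); without the override, the small character-level run would do 40 micro-steps per iteration. A rough comparison of the two settings (assumed single-GPU run, not code from the repo):

```python
# Tokens per optimizer step for the shakespeare_char config with and without
# the explicit gradient_accumulation_steps = 1 override (hypothetical comparison).
batch_size, block_size = 64, 256

for grad_accum in (1, 5 * 8):
    tokens_per_iter = grad_accum * batch_size * block_size
    print(f"grad_accum={grad_accum:>2}: {tokens_per_iter:,} tokens per iteration")
# grad_accum= 1: 16,384 tokens per iteration
# grad_accum=40: 655,360 tokens per iteration
```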
train.py (10 changed lines)
@@ -45,7 +45,7 @@ wandb_project = 'owt'
 wandb_run_name = 'gpt2' # 'run' + str(time.time())
 # data
 dataset = 'openwebtext'
-gradient_accumulation_steps = 5 # used to simulate larger batch sizes
+gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
 batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
 block_size = 1024
 # model
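With the 5 * 8 default, the configured value is the total micro-step count per optimizer step; the DDP setup below divides it by the number of GPUs, so each of 8 ranks still runs 5 micro-steps, while a single-GPU run performs all 40 itself. A small sketch of that split (the helper name is made up, not part of train.py):

```python
# Per-rank micro-steps under the new convention: the config value is the total
# across all ranks and is split evenly, mirroring the assert and floor-divide
# added to train.py below. Helper name is illustrative only.
def per_rank_accum_steps(total_accum_steps: int, num_gpus: int) -> int:
    assert total_accum_steps % num_gpus == 0, "total must divide evenly across GPUs"
    return total_accum_steps // num_gpus

print(per_rank_accum_steps(5 * 8, 8))  # 5 micro-steps per rank on 8 GPUs
print(per_rank_accum_steps(5 * 8, 1))  # 40 micro-steps on a single GPU
```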
@@ -84,16 +84,20 @@ if ddp:
     init_process_group(backend=backend)
     ddp_rank = int(os.environ['RANK'])
     ddp_local_rank = int(os.environ['LOCAL_RANK'])
+    ddp_world_size = int(os.environ['WORLD_SIZE'])
     device = f'cuda:{ddp_local_rank}'
     torch.cuda.set_device(device)
     master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
     seed_offset = ddp_rank # each process gets a different seed
+    assert gradient_accumulation_steps % torch.cuda.device_count() == 0
+    gradient_accumulation_steps //= torch.cuda.device_count()
 else:
     # if not ddp, we are running on a single gpu, and one process
     master_process = True
     seed_offset = 0
-    gradient_accumulation_steps *= 8 # simulate 8 gpus
-print("total number of tokens per iteration:", batch_size * block_size * gradient_accumulation_steps)
+    ddp_world_size = 1
+tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
+print(f"tokens per iteration will be: {tokens_per_iter:,}")

 if master_process:
     os.makedirs(out_dir, exist_ok=True)
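These lines replace the old single-GPU "gradient_accumulation_steps *= 8" hack: the configured total is split across DDP ranks, and tokens_per_iter now reports the true global batch in tokens. A standalone check that the formula gives the same total with or without DDP (assumed values, no torch needed):

```python
# The new tokens_per_iter formula yields the same global token count whether
# the 5 * 8 total is split across 8 DDP ranks or run on one GPU (assumed setup).
batch_size, block_size, total_accum = 12, 1024, 5 * 8

for world_size in (8, 1):
    grad_accum = total_accum // world_size  # per-rank micro-steps
    tokens_per_iter = grad_accum * world_size * batch_size * block_size
    print(f"world_size={world_size}: {tokens_per_iter:,} tokens per iteration")
# Both cases print 491,520, matching the comment in config/train_gpt2.py.
```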