
use WORLD_SIZE instead of device_count; supports both the case where the number of GPUs we train on is smaller than the number of GPUs available, and also multinode training. May be a bugfix

Andrej Karpathy 2023-06-14 23:33:07 +00:00
parent f08abb45bd
commit 7339b904ef


@@ -89,8 +89,10 @@ if ddp:
     torch.cuda.set_device(device)
     master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
     seed_offset = ddp_rank # each process gets a different seed
-    assert gradient_accumulation_steps % torch.cuda.device_count() == 0
-    gradient_accumulation_steps //= torch.cuda.device_count()
+    # world_size number of processes will be training simultaneously, so we can scale
+    # down the desired gradient accumulation iterations per process proportionally
+    assert gradient_accumulation_steps % ddp_world_size == 0
+    gradient_accumulation_steps //= ddp_world_size
 else:
     # if not ddp, we are running on a single gpu, and one process
     master_process = True
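
Why ddp_world_size is the right divisor: when a run is launched with torchrun, WORLD_SIZE counts the processes that are actually training (possibly spanning several nodes, or fewer GPUs than are installed on the machine), whereas torch.cuda.device_count() only reports the GPUs visible on the local node. Below is a minimal sketch of the surrounding setup, assuming the same variable names as train.py; the default values and the tokens-per-iteration print are illustrative, not taken from this commit.

    import os
    import torch
    from torch.distributed import init_process_group

    # illustrative defaults (hypothetical values, not from the commit)
    gradient_accumulation_steps = 40   # micro-steps per optimizer step, summed over all processes
    batch_size = 12                    # micro-batch size per process
    block_size = 1024                  # sequence length

    ddp = int(os.environ.get('RANK', -1)) != -1  # was this launched via torchrun?
    if ddp:
        init_process_group(backend='nccl')
        ddp_rank = int(os.environ['RANK'])
        ddp_local_rank = int(os.environ['LOCAL_RANK'])
        ddp_world_size = int(os.environ['WORLD_SIZE'])  # total processes across all nodes
        device = f'cuda:{ddp_local_rank}'
        torch.cuda.set_device(device)
        # WORLD_SIZE can differ from torch.cuda.device_count():
        #   2 nodes x 8 GPUs -> world size 16, but device_count() == 8 on each node
        #   training on 4 of 8 local GPUs -> world size 4, device_count() == 8
        assert gradient_accumulation_steps % ddp_world_size == 0
        gradient_accumulation_steps //= ddp_world_size
    else:
        ddp_world_size = 1

    # the per-process scaling keeps the effective tokens per optimizer step constant
    # regardless of how many processes participate
    tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
    print(f"tokens per iteration: {tokens_per_iter:,}")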