mirror of
https://github.com/osmarks/nanogpt-experiments.git
synced 2024-12-18 14:10:28 +00:00
use WORLD_SIZE instead of device_count. This supports both the case where the number of GPUs we train on is smaller than the number of GPUs available, and also multinode training. May be a bugfix.
This commit is contained in:
parent
f08abb45bd
commit
7339b904ef
6
train.py
6
train.py
@@ -89,8 +89,10 @@ if ddp:
|
|||||||
torch.cuda.set_device(device)
|
torch.cuda.set_device(device)
|
||||||
master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
|
master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
|
||||||
seed_offset = ddp_rank # each process gets a different seed
|
seed_offset = ddp_rank # each process gets a different seed
|
||||||
assert gradient_accumulation_steps % torch.cuda.device_count() == 0
|
# world_size number of processes will be training simultaneously, so we can scale
|
||||||
gradient_accumulation_steps //= torch.cuda.device_count()
|
# down the desired gradient accumulation iterations per process proportionally
|
||||||
|
assert gradient_accumulation_steps % ddp_world_size == 0
|
||||||
|
gradient_accumulation_steps //= ddp_world_size
|
||||||
else:
|
else:
|
||||||
# if not ddp, we are running on a single gpu, and one process
|
# if not ddp, we are running on a single gpu, and one process
|
||||||
master_process = True
|
master_process = True
|
||||||
|
Loading…
Reference in New Issue
Block a user