Use WORLD_SIZE instead of device_count. This supports both the case where the number of GPUs we train on is smaller than the number of GPUs available, and multinode training. May be a bugfix.

2025-11-12 05:13:00 +00:00 · 2023-06-14 23:33:07 +00:00
parent f08abb45bd
commit 7339b904ef
1 changed files with 4 additions and 2 deletions
--- a/train.py
+++ b/train.py
@@ -89,8 +89,10 @@ if ddp:
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
    seed_offset = ddp_rank # each process gets a different seed
-    assert gradient_accumulation_steps % torch.cuda.device_count() == 0
-    gradient_accumulation_steps //= torch.cuda.device_count()
+    # world_size number of processes will be training simultaneously, so we can scale
+    # down the desired gradient accumulation iterations per process proportionally
+    assert gradient_accumulation_steps % ddp_world_size == 0
+    gradient_accumulation_steps //= ddp_world_size
 else:
    # if not ddp, we are running on a single gpu, and one process
    master_process = True