Fix slow training when using gradient_accumulation_steps

2025-11-26 03:54:53 +00:00 · 2023-03-25 00:04:45 -07:00
parent a82b33b525
commit 978d4fe538
3 changed files with 5 additions and 3 deletions
--- a/train.py
+++ b/train.py
@@ -45,7 +45,7 @@ wandb_project = 'owt'
 wandb_run_name = 'gpt2' # 'run' + str(time.time())
 # data
 dataset = 'openwebtext'
-gradient_accumulation_steps = 5 # used to simulate larger batch sizes
+gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
 batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
 block_size = 1024
 # model
@@ -88,11 +88,12 @@ if ddp:
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
    seed_offset = ddp_rank # each process gets a different seed
+    assert gradient_accumulation_steps % torch.cuda.device_count() == 0
+    gradient_accumulation_steps //= torch.cuda.device_count()
 else:
    # if not ddp, we are running on a single gpu, and one process
    master_process = True
    seed_offset = 0
-    gradient_accumulation_steps *= 8 # simulate 8 gpus

 if master_process:
    os.makedirs(out_dir, exist_ok=True)