Mirror of https://github.com/osmarks/nanogpt-experiments.git (synced 2025-11-26 03:54:53 +00:00)
Fix for gradient_accumulation_steps training slow
train.py | 5 +++--
@@ -45,7 +45,7 @@ wandb_project = 'owt'
 wandb_run_name = 'gpt2' # 'run' + str(time.time())
 # data
 dataset = 'openwebtext'
-gradient_accumulation_steps = 5 # used to simulate larger batch sizes
+gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
 batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
 block_size = 1024
 # model
@@ -88,11 +88,12 @@ if ddp:
     torch.cuda.set_device(device)
     master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
     seed_offset = ddp_rank # each process gets a different seed
+    assert gradient_accumulation_steps % torch.cuda.device_count() == 0
+    gradient_accumulation_steps //= torch.cuda.device_count()
 else:
     # if not ddp, we are running on a single gpu, and one process
     master_process = True
     seed_offset = 0
-    gradient_accumulation_steps *= 8 # simulate 8 gpus
 
 if master_process:
     os.makedirs(out_dir, exist_ok=True)
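Not part of the commit, but a minimal sketch of the batch-size arithmetic the new defaults imply: gradient_accumulation_steps now means the total number of micro-steps across all DDP processes and is divided per process, rather than being a per-process count that the single-GPU branch multiplied by 8. The names world_size, per_process_steps and tokens_per_iter below are illustrative, and the 8-GPU launch is an assumption; in train.py the divisor is torch.cuda.device_count().

# Illustrative sketch (not from train.py): the new default keeps the same
# effective batch as before. world_size = 8 is an assumed example launch.
gradient_accumulation_steps = 5 * 8   # total micro-steps, summed over processes
batch_size = 12                       # micro-batch size per forward/backward
block_size = 1024                     # tokens per sequence

world_size = 8                        # assumed: 8 GPUs under DDP
assert gradient_accumulation_steps % world_size == 0
per_process_steps = gradient_accumulation_steps // world_size   # 5, as before

tokens_per_iter = per_process_steps * world_size * batch_size * block_size
print(f"tokens per optimizer step: {tokens_per_iter}")          # 491520

With this scheme the per-process count scales with however many GPUs are actually visible instead of assuming exactly 8, and the new assert fails loudly if the total cannot be split evenly across processes.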