diff --git a/train.py b/train.py index 78cd1c1..f1b0bc1 100644 --- a/train.py +++ b/train.py @@ -9,10 +9,11 @@ To run with DDP on 4 gpus on 1 node, example: $ torchrun --standalone --nproc_per_node=4 train.py To run with DDP on 4 gpus across 2 nodes, example: -- Run on the first (master) node: +- Run on the first (master) node with example IP 123.456.123.456: $ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py - Run on the worker node: $ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py +(If your cluster does not have an InfiniBand interconnect, prepend NCCL_IB_DISABLE=1 to each command) """ import os @@ -79,11 +80,11 @@ config = {k: globals()[k] for k in config_keys} # will be useful for logging ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run? if ddp: init_process_group(backend=backend) - DDP_RANK = int(os.environ['RANK']) - DDP_LOCAL_RANK = int(os.environ['LOCAL_RANK']) - device = f'cuda:{DDP_LOCAL_RANK}' - master_process = DDP_RANK == 0 # this process will do logging, checkpointing etc. - seed_offset = DDP_RANK # each process gets a different seed + ddp_rank = int(os.environ['RANK']) + ddp_local_rank = int(os.environ['LOCAL_RANK']) + device = f'cuda:{ddp_local_rank}' + master_process = ddp_rank == 0 # this process will do logging, checkpointing etc. + seed_offset = ddp_rank # each process gets a different seed else: # if not ddp, we are running on a single gpu, and one process master_process = True @@ -181,7 +182,7 @@ if compile: # wrap model into DDP container if ddp: - model = DDP(model, device_ids=[DDP_LOCAL_RANK]) + model = DDP(model, device_ids=[ddp_local_rank]) @torch.no_grad() def estimate_loss():