diff --git a/train.py b/train.py
index e69553d..e9cdf1e 100644
--- a/train.py
+++ b/train.py
@@ -2,11 +2,15 @@
 This training script can be run both on a single gpu in debug mode,
 and also in a larger training run with distributed data parallel (ddp).
 
-To run in debug mode example:
-$ python train.py --batch_size=32 --other=args
+To run on a single GPU, example:
+$ python train.py --batch_size=32 --compile=False
 
-To run DDP on 4 gpus on one node, example:
+To run with DDP on 4 gpus on 1 node, example:
 $ torchrun --standalone --nproc_per_node=4 train.py
+
+To run with DDP on 4 gpus across 2 nodes, example:
+$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py
+$ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py
 """
 
 import os