mirror of
				https://github.com/osmarks/nanogpt-experiments.git
				synced 2025-10-31 15:23:01 +00:00 
			
		
		
		
	docs for multinode ddp
This commit is contained in:
		
							
								
								
									
										10
									
								
								train.py
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								train.py
									
									
									
									
									
								
							| @@ -2,11 +2,15 @@ | ||||
| This training script can be run both on a single gpu in debug mode, | ||||
| and also in a larger training run with distributed data parallel (ddp). | ||||
|  | ||||
| To run in debug mode example: | ||||
| $ python train.py --batch_size=32 --other=args | ||||
| To run on a single GPU, example: | ||||
| $ python train.py --batch_size=32 --compile=False | ||||
|  | ||||
| To run DDP on 4 gpus on one node, example: | ||||
| To run with DDP on 4 gpus on 1 node, example: | ||||
| $ torchrun --standalone --nproc_per_node=4 train.py | ||||
|  | ||||
| To run with DDP on 16 gpus across 2 nodes (8 gpus per node), example: | ||||
| $ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py | ||||
| $ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py | ||||
| """ | ||||
|  | ||||
| import os | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Andrej Karpathy
					Andrej Karpathy