mirror of https://github.com/osmarks/nanogpt-experiments.git
synced 2025-10-31 07:13:01 +00:00
	small tweaks to docs and variable names stylistically
1 changed file: train.py (15 lines changed: 8 additions, 7 deletions)
@@ -9,10 +9,11 @@ To run with DDP on 4 gpus on 1 node, example:
 $ torchrun --standalone --nproc_per_node=4 train.py
 
 To run with DDP on 4 gpus across 2 nodes, example:
-- Run on the first (master) node:
+- Run on the first (master) node with example IP 123.456.123.456:
 $ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=123.456.123.456 --master_port=1234 train.py
 - Run on the worker node:
 $ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr=123.456.123.456 --master_port=1234 train.py
+(If your cluster does not have Infiniband interconnect prepend NCCL_IB_DISABLE=1)
 """
 
 import os
@@ -79,11 +80,11 @@ config = {k: globals()[k] for k in config_keys} # will be useful for logging
 ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
 if ddp:
     init_process_group(backend=backend)
-    DDP_RANK = int(os.environ['RANK'])
-    DDP_LOCAL_RANK = int(os.environ['LOCAL_RANK'])
-    device = f'cuda:{DDP_LOCAL_RANK}'
-    master_process = DDP_RANK == 0 # this process will do logging, checkpointing etc.
-    seed_offset = DDP_RANK # each process gets a different seed
+    ddp_rank = int(os.environ['RANK'])
+    ddp_local_rank = int(os.environ['LOCAL_RANK'])
+    device = f'cuda:{ddp_local_rank}'
+    master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
+    seed_offset = ddp_rank # each process gets a different seed
 else:
     # if not ddp, we are running on a single gpu, and one process
     master_process = True
@@ -181,7 +182,7 @@ if compile:
 
 # wrap model into DDP container
 if ddp:
-    model = DDP(model, device_ids=[DDP_LOCAL_RANK])
+    model = DDP(model, device_ids=[ddp_local_rank])
 
 @torch.no_grad()
 def estimate_loss():
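For context on the rename above, here is a minimal standalone sketch of the same torchrun-driven DDP setup, assuming torchrun exports RANK and LOCAL_RANK and that an NCCL backend is available; the hard-coded 'nccl' string, the toy Linear model, and the filename ddp_sketch.py are illustrative stand-ins, not copied from train.py. It would be launched e.g. as NCCL_IB_DISABLE=1 torchrun --standalone --nproc_per_node=4 ddp_sketch.py on a node without Infiniband.

import os
import torch
from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP

ddp = int(os.environ.get('RANK', -1)) != -1  # torchrun sets RANK; a plain `python` run does not
if ddp:
    init_process_group(backend='nccl')               # assumes an NCCL-capable (CUDA) cluster
    ddp_rank = int(os.environ['RANK'])               # global rank across all nodes
    ddp_local_rank = int(os.environ['LOCAL_RANK'])   # rank within this node, used to pick the GPU
    device = f'cuda:{ddp_local_rank}'
    master_process = ddp_rank == 0                   # only rank 0 logs and checkpoints
    seed_offset = ddp_rank                           # each process gets a different seed
else:
    # single-GPU (or CPU) run with one process
    master_process = True
    seed_offset = 0
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

torch.manual_seed(1337 + seed_offset)
model = torch.nn.Linear(10, 10).to(device)           # hypothetical stand-in for the GPT model
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])

# ... training loop would run here ...
if master_process:
    print(f"running on {device}")
if ddp:
    destroy_process_group()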
Author: Andrej Karpathy