From 1f77d03024005295286f04862ac6f0edbc28c663 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 20 Jan 2023 21:28:20 +0000 Subject: [PATCH] make mentions of mps in docs. ty good people in issue #28 --- README.md | 2 ++ train.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9f08b56..59f7de6 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,8 @@ $ python train.py config/train_shakespeare_char.py --device=cpu --compile=False Where we decrease the context length to just 64 characters and only use a batch size of 8. +Finally, on Apple Silicon MacBooks you can use the flag `--device mps` ("Metal Performance Shaders"), which can significantly accelerate training (2-3X). You will need a specific version of PyTorch. See [Issue 28](https://github.com/karpathy/nanoGPT/issues/28). + ## benchmarking For model benchmarking `bench.py` might be useful. It's identical to what happens in the meat of the training loop of `train.py`, but omits much of the other complexities. diff --git a/train.py b/train.py index f1b0bc1..1d52f17 100644 --- a/train.py +++ b/train.py @@ -67,7 +67,7 @@ min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchi # DDP settings backend = 'nccl' # 'nccl', 'gloo', etc. # system -device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc. +device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on MacBooks dtype = 'bfloat16' # 'float32' or 'bfloat16' compile = True # use PyTorch 2.0 to compile the model to be faster # -----------------------------------------------------------------------------