From 0a2ea9533844be1ec7e289ac392553e9f050be36 Mon Sep 17 00:00:00 2001 From: Laiho Date: Mon, 2 Jan 2023 17:49:21 +0200 Subject: [PATCH 01/11] batch file write --- data/openwebtext/prepare.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/data/openwebtext/prepare.py b/data/openwebtext/prepare.py index 0aadbb8..4f31017 100644 --- a/data/openwebtext/prepare.py +++ b/data/openwebtext/prepare.py @@ -53,12 +53,16 @@ for split, dset in tokenized.items(): filename = f'{split}.bin' dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16) arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,)) + total_batches = 1024 - print(f"writing {filename}...") idx = 0 - for example in tqdm(dset): - arr[idx : idx + example['len']] = example['ids'] - idx += example['len'] + for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'): + # Batch together samples for faster write + batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy') + arr_batch = np.concatenate(batch['ids']) + # Write into mmap + arr[idx : idx + len(arr_batch)] = arr_batch + idx += len(arr_batch) arr.flush() # train.bin is ~17GB, val.bin ~8.5MB From d5ee965974b929eed1ffbc9e874f724a53234578 Mon Sep 17 00:00:00 2001 From: MicroPanda123 Date: Sun, 15 Jan 2023 20:29:15 +0000 Subject: [PATCH 02/11] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e9b045..28476bf 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Dependencies: - `pip install datasets` for huggingface datasets <3 (if you want to download + preprocess OpenWebText) - `pip install tiktoken` for OpenAI's fast BPE code <3 - `pip install wandb` for optional logging <3 -- `pip install tqdm` +- `pip install tqdm` <3 ## usage From f9d8020f48b3a670e507bfe6bb0bd26507e71276 Mon Sep 17 00:00:00 2001 From: venusatuluri Date: Sat, 21 Jan 2023 06:14:16 +0000 Subject: [PATCH 03/11] Fix decode fn in shakespeare_char/prepare.py --- data/shakespeare_char/prepare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/shakespeare_char/prepare.py b/data/shakespeare_char/prepare.py index 918d1ee..fd95329 100644 --- a/data/shakespeare_char/prepare.py +++ b/data/shakespeare_char/prepare.py @@ -31,7 +31,7 @@ itos = { i:ch for i,ch in enumerate(chars) } def encode(s): return [stoi[c] for c in s] # encoder: take a string, output a list of integers def decode(l): - ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string + return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string # create the train and test splits n = len(data) From 27a5d6f1230e45e298645626a0f65243de2ea33a Mon Sep 17 00:00:00 2001 From: Abraham Sangha Date: Tue, 7 Feb 2023 11:02:20 -0700 Subject: [PATCH 04/11] fix typos --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e27662c..0a4e4a4 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ This creates a `train.bin` and `val.bin` in that data directory. Now it is time $ python train.py config/train_shakespeare_char.py ``` -If you peak inside it, you'll see that we're training a GPT with a context size of up to 256 characters, 384 feature channels, and it is a 6-layer Transformer with 6 heads in each layer. On one A100 GPU this training run takes about 3 minutes and the best validation loss is 1.4697. 
Based on the configuration, the model checkpoints are being written into the `--out_dir` directory `out-shakespeare-char`. So once the training finishes we can sample from the best model by pointing the sampling script at this directory: +If you peek inside it, you'll see that we're training a GPT with a context size of up to 256 characters, 384 feature channels, and it is a 6-layer Transformer with 6 heads in each layer. On one A100 GPU this training run takes about 3 minutes and the best validation loss is 1.4697. Based on the configuration, the model checkpoints are being written into the `--out_dir` directory `out-shakespeare-char`. So once the training finishes we can sample from the best model by pointing the sampling script at this directory: ``` $ python sample.py --out_dir=out-shakespeare-char @@ -84,7 +84,7 @@ bot thou the sought bechive in that to doth groan you, No relving thee post mose the wear ``` -Not bad for ~3 minutes on a CPU, for a hint of the right character gestalt. If you're willing to wait longer free to tune the hyperparameters, increase the size of the network, the context length (`--block_size`), the length of training, etc. +Not bad for ~3 minutes on a CPU, for a hint of the right character gestalt. If you're willing to wait longer, feel free to tune the hyperparameters, increase the size of the network, the context length (`--block_size`), the length of training, etc. Finally, on Apple Silicon Macbooks and with a recent PyTorch version make sure to add `--device mps` (short for "Metal Performance Shaders"); PyTorch then uses the on-chip Neural Engine that can *significantly* accelerate training (2-3X) and allow you to use larger networks. See [Issue 28](https://github.com/karpathy/nanoGPT/issues/28) for more. From c2531159c7077f01c9e63134db2b1a7401ad3b05 Mon Sep 17 00:00:00 2001 From: kovkev <30991911+kovkev@users.noreply.github.com> Date: Sat, 11 Feb 2023 17:13:24 -0800 Subject: [PATCH 05/11] Fix the position of a comma --- model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model.py b/model.py index 1b32cdf..71ddadb 100644 --- a/model.py +++ b/model.py @@ -61,7 +61,7 @@ class CausalSelfAttention(nn.Module): B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd) # calculate query, key, values for all heads in batch and move head forward to be the batch dim - q, k ,v = self.c_attn(x).split(self.n_embd, dim=2) + q, k, v = self.c_attn(x).split(self.n_embd, dim=2) k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs) From 10046a2ec0c2d16d48ed729066c3db396dc5d2c0 Mon Sep 17 00:00:00 2001 From: Lutz Roeder Date: Mon, 13 Feb 2023 13:57:20 -0800 Subject: [PATCH 06/11] Add .gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..70f21ff --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.DS_Store +.ipynb_checkpoints/ +__pycache__/ +*.pyc From c3f254844d63ece0b6481e9d9777d740a66eb965 Mon Sep 17 00:00:00 2001 From: Kirill Date: Fri, 24 Mar 2023 14:51:02 +0300 Subject: [PATCH 07/11] Fix GPT.crop_block_size when flash attention is available --- model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/model.py b/model.py index 0858f80..c9a6d7b 100644 --- a/model.py +++ b/model.py @@ -207,7 +207,8 @@ class 
GPT(nn.Module): self.config.block_size = block_size self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size]) for block in self.transformer.h: - block.attn.bias = block.attn.bias[:,:,:block_size,:block_size] + if hasattr(block.attn, 'bias'): + block.attn.bias = block.attn.bias[:,:,:block_size,:block_size] @classmethod def from_pretrained(cls, model_type, override_args=None): From c58fc4605cdcdc68b26a330a8a621c02e681e7d4 Mon Sep 17 00:00:00 2001 From: Snehal Raj Date: Sat, 25 Mar 2023 20:36:46 +0100 Subject: [PATCH 08/11] fix small typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8ddef7c..243da36 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ This creates a `train.bin` and `val.bin` in that data directory. Now it is time $ python train.py config/train_shakespeare_char.py ``` -If you peak inside it, you'll see that we're training a GPT with a context size of up to 256 characters, 384 feature channels, and it is a 6-layer Transformer with 6 heads in each layer. On one A100 GPU this training run takes about 3 minutes and the best validation loss is 1.4697. Based on the configuration, the model checkpoints are being written into the `--out_dir` directory `out-shakespeare-char`. So once the training finishes we can sample from the best model by pointing the sampling script at this directory: +If you peek inside it, you'll see that we're training a GPT with a context size of up to 256 characters, 384 feature channels, and it is a 6-layer Transformer with 6 heads in each layer. On one A100 GPU this training run takes about 3 minutes and the best validation loss is 1.4697. Based on the configuration, the model checkpoints are being written into the `--out_dir` directory `out-shakespeare-char`. So once the training finishes we can sample from the best model by pointing the sampling script at this directory: ``` $ python sample.py --out_dir=out-shakespeare-char From 4ac2e8ce3a47a38789dab355355ff822672a6d45 Mon Sep 17 00:00:00 2001 From: ymurenko Date: Wed, 5 Apr 2023 17:28:55 -0400 Subject: [PATCH 09/11] fix "cuda out of memory" when resuming training --- train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/train.py b/train.py index 30d0145..3c40524 100644 --- a/train.py +++ b/train.py @@ -189,6 +189,7 @@ scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16')) optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type) if init_from == 'resume': optimizer.load_state_dict(checkpoint['optimizer']) +checkpoint = None # free up memory # compile the model if compile: From 7399dfe39d24579cc1fb391475864ad28f19f6f7 Mon Sep 17 00:00:00 2001 From: Yassine Yousfi Date: Mon, 10 Apr 2023 22:56:22 -0700 Subject: [PATCH 10/11] dont always dropout! 
--- model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/model.py b/model.py index 0858f80..287aadd 100644 --- a/model.py +++ b/model.py @@ -69,7 +69,7 @@ class CausalSelfAttention(nn.Module): # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T) if self.flash: # efficient attention using Flash Attention CUDA kernels - y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=True) + y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True) else: # manual implementation of attention att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) From 553f949f46e04a946e5ff2da6e4bb70a221ea1fc Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 13 Apr 2023 04:59:11 +0000 Subject: [PATCH 11/11] fix minor bug where we have to scale the loss to account for gradient accumulation, which sums before backprop. note that this is not a major bug because AdamW is scale invariant. however, this did affect gradient clipping --- train.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/train.py b/train.py index 30d0145..6410de4 100644 --- a/train.py +++ b/train.py @@ -93,6 +93,7 @@ else: master_process = True seed_offset = 0 gradient_accumulation_steps *= 8 # simulate 8 gpus +print("total number of tokens per iteration:", batch_size * block_size * gradient_accumulation_steps) if master_process: os.makedirs(out_dir, exist_ok=True) @@ -287,6 +288,7 @@ while True: model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1) with ctx: logits, loss = model(X, Y) + loss = loss / gradient_accumulation_steps # scale the loss to account for gradient accumulation # immediately async prefetch next batch while model is doing the forward pass on the GPU X, Y = get_batch('train') # backward pass, with gradient scaling if training in fp16 @@ -306,7 +308,9 @@ while True: dt = t1 - t0 t0 = t1 if iter_num % log_interval == 0 and master_process: - lossf = loss.item() # loss as float. note: this is a CPU-GPU sync point + # get loss as float. note: this is a CPU-GPU sync point + # scale up to undo the division above, approximating the true total loss (exact would have been a sum) + lossf = loss.item() * gradient_accumulation_steps if local_iter_num >= 5: # let the training loop settle a bit mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt) running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
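
The loss-scaling change in the last patch is subtle enough to deserve a standalone illustration. Below is a minimal sketch of gradient accumulation with the division applied inside the micro-step loop. The toy model, optimizer settings, and `get_batch` stub are made up for illustration and this is not nanoGPT's actual training loop (no autocast, GradScaler, or DDP); only the `loss / gradient_accumulation_steps` pattern, and the fact that clipping acts on the summed-then-scaled gradients, mirrors the patch.

```python
import torch
import torch.nn as nn

# Toy setup (illustrative only; not the nanoGPT model or data pipeline).
torch.manual_seed(0)
model = nn.Linear(16, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
gradient_accumulation_steps = 4
grad_clip = 1.0

def get_batch():
    # Stand-in for the real data loader.
    x = torch.randn(8, 16)
    y = torch.randn(8, 1)
    return x, y

for step in range(10):
    optimizer.zero_grad(set_to_none=True)
    for micro_step in range(gradient_accumulation_steps):
        x, y = get_batch()
        loss = nn.functional.mse_loss(model(x), y)
        # backward() sums gradients across micro-steps, so divide the loss
        # here to make the accumulated gradient an average. Without this,
        # the gradient is gradient_accumulation_steps times too large --
        # AdamW mostly shrugs that off, but gradient clipping does not.
        (loss / gradient_accumulation_steps).backward()
    # Clipping sees the accumulated (now correctly scaled) gradients.
    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    optimizer.step()
    # For logging, undo the division (as the patch does) to get a comparable number.
    print(step, loss.item() * gradient_accumulation_steps)
```

A similar mode-awareness point explains the earlier dropout patch: `scaled_dot_product_attention` is a plain function with no notion of train/eval mode, so the module has to pass `dropout_p=0` itself whenever `self.training` is False; otherwise attention dropout would also be applied at inference and sampling time.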