From 43b37fd56817e0783c6904f98ae468bb0fbe7aa0 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sat, 14 Jan 2023 02:16:10 +0000
Subject: [PATCH] reverse the order, making sure that the final layer init is
 preserved, and becomes the token embedding instead of the other way around.
 otherwise the loss can be all messed up from a bad init

---
 model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model.py b/model.py
index a301379..bf17829 100644
--- a/model.py
+++ b/model.py
@@ -115,7 +115,7 @@ class GPT(nn.Module):
             ln_f = nn.LayerNorm(config.n_embd),
         ))
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
-        self.lm_head.weight = self.transformer.wte.weight # https://paperswithcode.com/method/weight-tying
+        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying
 
         # report number of parameters
         n_params = sum(p.numel() for p in self.parameters())
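
A minimal standalone sketch of why the assignment direction matters when tying weights in PyTorch: the Parameter on the right-hand side of the assignment is the one whose initialization both layers end up sharing. The module instances and sizes below are illustrative only, not taken from model.py.

import torch.nn as nn

# illustrative sketch, not part of the patch; sizes chosen arbitrarily
vocab_size, n_embd = 50257, 768

wte = nn.Embedding(vocab_size, n_embd)               # token embedding, default init N(0, 1)
lm_head = nn.Linear(n_embd, vocab_size, bias=False)  # final projection, default uniform init scaled by fan-in

# Old order: lm_head's Parameter is replaced by wte's, so the Embedding's
# N(0, 1) init is what both layers share after tying.
#   lm_head.weight = wte.weight
# New order: wte's Parameter is replaced by lm_head's, so the final layer's
# init is preserved and becomes the token embedding as well.
wte.weight = lm_head.weight

# Either direction leaves the two modules sharing one Parameter object.
assert wte.weight is lm_head.weight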