diff --git a/model.py b/model.py
index 1b32cdf..71ddadb 100644
--- a/model.py
+++ b/model.py
@@ -61,7 +61,7 @@ class CausalSelfAttention(nn.Module):
         B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
 
         # calculate query, key, values for all heads in batch and move head forward to be the batch dim
-        q, k ,v = self.c_attn(x).split(self.n_embd, dim=2)
+        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
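
For context on the shape bookkeeping this hunk touches, here is a minimal standalone sketch of the fused QKV projection and head split. The toy dimensions (B=2, T=4, C=8, 2 heads) are assumptions for illustration, not values from the repo; `c_attn` below stands in for the module's `self.c_attn` attribute.

```python
import torch
import torch.nn as nn

# Hypothetical toy dimensions (not from the diff): batch 2, sequence 4,
# embedding 8, 2 heads -> head size hs = C // n_head = 4.
B, T, C, n_head = 2, 4, 8, 2

# One linear layer projects to 3*C so q, k, v come out of a single matmul,
# mirroring self.c_attn in CausalSelfAttention.
c_attn = nn.Linear(C, 3 * C)

x = torch.randn(B, T, C)
q, k, v = c_attn(x).split(C, dim=2)  # each (B, T, C)

# view + transpose moves the head dimension ahead of the sequence dimension,
# so attention can be batched over B * n_head independently: (B, nh, T, hs)
k = k.view(B, T, n_head, C // n_head).transpose(1, 2)
q = q.view(B, T, n_head, C // n_head).transpose(1, 2)
v = v.view(B, T, n_head, C // n_head).transpose(1, 2)

print(q.shape)  # torch.Size([2, 2, 4, 4]) -> (B, nh, T, hs)
```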