Merge pull request #240 from YassineYousfi/master

don't dropout in eval mode
2024-12-18 06:00:29 +00:00 · 2023-04-12 22:43:59 -07:00 · 2023-04-12 22:43:59 -07:00 · 01e48ec1ab
commit 01e48ec1ab
parent 7840a66859 7399dfe39d
1 changed file with 1 addition and 1 deletion
--- a/model.py
+++ b/model.py
@@ -69,7 +69,7 @@ class CausalSelfAttention(nn.Module):
        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
-            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=True)
+            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
        else:
            # manual implementation of attention
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))