mirror of https://github.com/osmarks/nanogpt-experiments.git
dont always dropout!
parent a82b33b525
commit 7399dfe39d
model.py (2 changed lines)
@@ -69,7 +69,7 @@ class CausalSelfAttention(nn.Module):
         # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
         if self.flash:
             # efficient attention using Flash Attention CUDA kernels
-            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout, is_causal=True)
+            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
         else:
             # manual implementation of attention
             att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
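Why the gate matters: torch.nn.functional.scaled_dot_product_attention takes dropout_p as a plain float and does not consult the module's train/eval state, unlike the nn.Dropout layer used in the manual attention path, which disables itself automatically under model.eval(). Below is a minimal standalone sketch (not part of this commit; the shapes and dropout rate are illustrative) showing that eval-time outputs only become deterministic once dropout_p is gated on the training flag, as the patched line does.

# Minimal sketch, not part of this commit: shapes and dropout rate are illustrative.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
q = k = v = torch.randn(1, 4, 8, 16)  # (B, nh, T, hs)

# With a bare dropout_p, dropout is applied on every call regardless of any
# train/eval state, so repeated "inference" calls disagree because the random
# dropout masks differ.
y1 = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.1, is_causal=True)
y2 = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.1, is_causal=True)
print(torch.allclose(y1, y2))  # typically False: different dropout masks

# Gating dropout_p on the training flag makes inference deterministic.
training = False
y3 = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.1 if training else 0, is_causal=True)
y4 = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.1 if training else 0, is_causal=True)
print(torch.allclose(y3, y4))  # True: no dropout at inference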