1
0
mirror of https://github.com/osmarks/nanogpt-experiments.git synced 2024-11-10 20:09:58 +00:00

small tweaks to notebook

This commit is contained in:
Andrej Karpathy 2023-01-06 02:13:04 +00:00
parent 69d1a5f1af
commit 27fc6a4112

View File

@ -5,12 +5,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Reproducing some results from [Chinchilla](https://arxiv.org/pdf/2203.15556.pdf). I can't get the numbers to match exactly, please open an Issue if you understand why the results are off by a bit. Current running hypothesis is that this is because I am using the FLOPs = 6\\*N\\*D formula, instead of taking all the Gopher sizes and calculating their FLOPs individually, and using that to interpolate?"
"Trying to reproduce results from [Chinchilla](https://arxiv.org/pdf/2203.15556.pdf):"
]
},
{
"cell_type": "code",
"execution_count": 117,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -21,7 +21,7 @@
},
{
"cell_type": "code",
"execution_count": 118,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@ -44,7 +44,7 @@
},
{
"cell_type": "code",
"execution_count": 119,
"execution_count": 3,
"metadata": {},
"outputs": [
{
@ -53,7 +53,7 @@
"123.653376"
]
},
"execution_count": 119,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@ -89,7 +89,7 @@
},
{
"cell_type": "code",
"execution_count": 123,
"execution_count": 4,
"metadata": {},
"outputs": [
{
@ -98,7 +98,7 @@
"[44000000.0, 512, 2048, 64, 8, 8]"
]
},
"execution_count": 123,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@ -112,7 +112,7 @@
},
{
"cell_type": "code",
"execution_count": 126,
"execution_count": 5,
"metadata": {},
"outputs": [
{
@ -154,7 +154,7 @@
},
{
"cell_type": "code",
"execution_count": 127,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@ -163,7 +163,7 @@
"766.006788096"
]
},
"execution_count": 127,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@ -185,7 +185,7 @@
" # key @ query logits\n",
" attlogits = 2 * seq_len * seq_len * (key_size * num_heads)\n",
" # softmax\n",
" attsoftmax = 3 * num_heads * seq_len * seq_len # TODO why?\n",
" attsoftmax = 3 * num_heads * seq_len * seq_len # 3* is for subtract (max), exp, divide (?)\n",
" # softmax @ value reductions\n",
" attvalue = 2 * seq_len * seq_len * (key_size * num_heads)\n",
" # final linear\n",
@ -208,7 +208,7 @@
},
{
"cell_type": "code",
"execution_count": 128,
"execution_count": 7,
"metadata": {},
"outputs": [
{
@ -260,16 +260,16 @@
},
{
"cell_type": "code",
"execution_count": 129,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.colorbar.Colorbar at 0x7f64c8d4b3d0>"
"<matplotlib.colorbar.Colorbar at 0x7fc09dffc730>"
]
},
"execution_count": 129,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
},
@ -330,7 +330,7 @@
},
{
"cell_type": "code",
"execution_count": 130,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@ -347,7 +347,7 @@
"Text(0, 0.5, 'loss')"
]
},
"execution_count": 130,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
},
@ -396,7 +396,7 @@
},
{
"cell_type": "code",
"execution_count": 131,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@ -405,7 +405,7 @@
"2304"
]
},
"execution_count": 131,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -432,7 +432,7 @@
},
{
"cell_type": "code",
"execution_count": 132,
"execution_count": 11,
"metadata": {},
"outputs": [
{