aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--transformer_shortest_paths.ipynb997
1 files changed, 250 insertions, 747 deletions
diff --git a/transformer_shortest_paths.ipynb b/transformer_shortest_paths.ipynb
index b2cea24..a158fcb 100644
--- a/transformer_shortest_paths.ipynb
+++ b/transformer_shortest_paths.ipynb
@@ -16,12 +16,13 @@
"Question: \n",
"\n",
"- Do the attention heads learn to attend to the same positional encodings\n",
- "- do interp -- what is it doing? can we figure out?"
+ "- do interp -- what is it doing? can we figure out?\n",
+ "- update: I think we should do interp once it's bigger. "
]
},
{
"cell_type": "code",
- "execution_count": 75,
+ "execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -31,18 +32,18 @@
},
"outputs": [],
"source": [
- "from collections import deque\n",
"# using tqdm.auto glitches out collaborative editing\n",
"from tqdm import tqdm\n",
- "\n",
"import torch\n",
"import torch.nn as nn\n",
- "from math import sqrt\n",
"import matplotlib.pyplot as plt\n",
- "torch.manual_seed(42)\n",
"\n",
+ "from math import sqrt\n",
+ "from collections import deque\n",
"import os\n",
"import random\n",
+ "\n",
+ "torch.manual_seed(42)\n",
"random.seed(42)\n",
"\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
@@ -51,18 +52,18 @@
},
{
"cell_type": "code",
- "execution_count": 91,
+ "execution_count": 2,
"metadata": {
"id": "lylOX2POPwFL"
},
"outputs": [],
"source": [
"# VTXS numbers here are inclusive\n",
- "MIN_VTXS = 3 # 3\n",
- "MAX_VTXS = 3 # 8\n",
- "MAX_TUNE_VTXS = 3 # 15\n",
+ "MIN_VTXS = 3\n",
+ "MAX_VTXS = 31\n",
+ "MAX_TUNE_VTXS = 15 # 15\n",
"AVG_DEG = 2\n",
- "SEQ_LEN = MAX_VTXS * AVG_DEG + 1 # means 32 edges, final token is the target vertex\n",
+ "SEQ_LEN = MAX_VTXS + 1 # means 32 edges, final token is the target vertex\n",
"PAD_TOKEN = 0\n",
"# vertices are labelled 1,2,...,63\n",
"# we also have a padding token which is 0."
@@ -79,7 +80,7 @@
},
{
"cell_type": "code",
- "execution_count": 76,
+ "execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -102,16 +103,15 @@
"def random_graph(n):\n",
" edge_list = []\n",
" adjacencies = [set() for _ in range(n+1)]\n",
- " indices = [random.randint(1, n) for _ in range(AVG_DEG * (n-1))]\n",
+ " indices = [random.randint(1, n) for _ in range(AVG_DEG * n)]\n",
" for i in range(0, len(indices), 2):\n",
" u = indices[i]\n",
" v = indices[i + 1]\n",
" if u != v:\n",
- " edge_list += [u,v]\n",
+ " edge_list += [min(u,v),max(u,v)]\n",
" adjacencies[u].add(v)\n",
" adjacencies[v].add(u)\n",
- "\n",
- " edge_list += [PAD_TOKEN]*(SEQ_LEN-len(edge_list))\n",
+ " edge_list += [PAD_TOKEN]*(2*SEQ_LEN-1-len(edge_list))\n",
" return edge_list, adjacencies\n",
"\n",
"\"\"\"\n",
@@ -200,27 +200,71 @@
},
{
"cell_type": "code",
- "execution_count": 86,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "(tensor([[4, 3, 2, 3, 1, 2, 0, 0, 2],\n",
- " [3, 4, 3, 1, 2, 3, 0, 0, 2],\n",
- " [4, 2, 3, 2, 0, 0, 0, 0, 2],\n",
- " [3, 4, 4, 3, 0, 0, 0, 0, 2],\n",
- " [1, 2, 2, 4, 0, 0, 0, 0, 2]], device='cuda:0'),\n",
- " tensor([1., 2., 4., 4., 1.], device='cuda:0', dtype=torch.bfloat16),\n",
- " tensor([[False, False, False, False, False, False, True, True, False],\n",
- " [False, False, False, False, False, False, True, True, False],\n",
- " [False, False, False, False, True, True, True, True, False],\n",
- " [False, False, False, False, True, True, True, True, False],\n",
- " [False, False, False, False, True, True, True, True, False]],\n",
- " device='cuda:0'))"
+ "(tensor([[ 1, 4, 8, 9, 5, 8, 4, 22, 3, 18, 14, 19, 1, 2, 3, 7, 8, 17,\n",
+ " 1, 20, 7, 18, 21, 23, 18, 23, 8, 14, 15, 19, 1, 9, 6, 23, 11, 14,\n",
+ " 5, 9, 7, 11, 3, 4, 4, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 0, 0, 0, 0, 0, 0, 0, 0, 2],\n",
+ " [ 2, 9, 15, 18, 4, 13, 3, 18, 10, 21, 12, 20, 7, 19, 2, 3, 8, 22,\n",
+ " 3, 10, 4, 8, 9, 13, 15, 21, 6, 12, 7, 22, 9, 22, 3, 21, 20, 21,\n",
+ " 6, 18, 6, 8, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 0, 0, 0, 0, 0, 0, 0, 0, 2],\n",
+ " [ 9, 11, 4, 11, 1, 6, 1, 4, 6, 7, 2, 5, 4, 10, 4, 6, 8, 11,\n",
+ " 7, 11, 3, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 0, 0, 0, 0, 0, 0, 0, 0, 2],\n",
+ " [ 3, 4, 5, 10, 7, 10, 6, 7, 3, 4, 8, 9, 1, 2, 2, 3, 3, 11,\n",
+ " 7, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 0, 0, 0, 0, 0, 0, 0, 0, 2],\n",
+ " [ 3, 13, 13, 20, 15, 17, 9, 18, 1, 22, 4, 22, 9, 18, 11, 21, 4, 10,\n",
+ " 6, 14, 1, 15, 9, 17, 6, 17, 4, 21, 10, 21, 17, 20, 5, 7, 6, 12,\n",
+ " 17, 18, 1, 20, 11, 16, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+ " 0, 0, 0, 0, 0, 0, 0, 0, 2]], device='cuda:0'),\n",
+ " tensor([ 1., 22., 11., 1., 22.], device='cuda:0', dtype=torch.bfloat16),\n",
+ " tensor([[False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, True, True, True, True, True, True,\n",
+ " True, True, True, True, True, True, True, True, True, True,\n",
+ " True, True, False],\n",
+ " [False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, True, True, True, True, True, True, True, True,\n",
+ " True, True, True, True, True, True, True, True, True, True,\n",
+ " True, True, False],\n",
+ " [False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, True, True, True, True, True, True, True, True,\n",
+ " True, True, True, True, True, True, True, True, True, True,\n",
+ " True, True, True, True, True, True, True, True, True, True,\n",
+ " True, True, True, True, True, True, True, True, True, True,\n",
+ " True, True, False],\n",
+ " [False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, False, False, False, False, False, False,\n",
+ " True, True, True, True, True, True, True, True, True, True,\n",
+ " True, True, True, True, True, True, True, True, True, True,\n",
+ " True, True, True, True, True, True, True, True, True, True,\n",
+ " True, True, True, True, True, True, True, True, True, True,\n",
+ " True, True, False],\n",
+ " [False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, False, False, False, False, False, False,\n",
+ " False, False, False, False, True, True, True, True, True, True,\n",
+ " True, True, True, True, True, True, True, True, True, True,\n",
+ " True, True, False]], device='cuda:0'))"
]
},
- "execution_count": 86,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -231,41 +275,39 @@
},
{
"cell_type": "code",
- "execution_count": 87,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "(array([ 663., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
- " 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
- " 0., 0., 0., 284., 0., 0., 0., 0., 0.,\n",
- " 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
- " 0., 0., 0., 0., 0., 0., 39., 0., 0.,\n",
- " 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
- " 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
- " 1062.]),\n",
- " array([1. , 1.046875, 1.09375 , 1.140625, 1.1875 , 1.234375,\n",
- " 1.28125 , 1.328125, 1.375 , 1.421875, 1.46875 , 1.515625,\n",
- " 1.5625 , 1.609375, 1.65625 , 1.703125, 1.75 , 1.796875,\n",
- " 1.84375 , 1.890625, 1.9375 , 1.984375, 2.03125 , 2.078125,\n",
- " 2.125 , 2.171875, 2.21875 , 2.265625, 2.3125 , 2.359375,\n",
- " 2.40625 , 2.453125, 2.5 , 2.546875, 2.59375 , 2.640625,\n",
- " 2.6875 , 2.734375, 2.78125 , 2.828125, 2.875 , 2.921875,\n",
- " 2.96875 , 3.015625, 3.0625 , 3.109375, 3.15625 , 3.203125,\n",
- " 3.25 , 3.296875, 3.34375 , 3.390625, 3.4375 , 3.484375,\n",
- " 3.53125 , 3.578125, 3.625 , 3.671875, 3.71875 , 3.765625,\n",
- " 3.8125 , 3.859375, 3.90625 , 3.953125, 4. ]),\n",
+ "(array([320., 0., 310., 0., 264., 0., 178., 0., 119., 0., 90.,\n",
+ " 0., 69., 0., 42., 0., 0., 39., 0., 30., 0., 30.,\n",
+ " 0., 31., 0., 26., 0., 28., 0., 27., 0., 0., 20.,\n",
+ " 0., 30., 0., 29., 0., 26., 0., 20., 0., 30., 0.,\n",
+ " 34., 0., 23., 0., 0., 35., 0., 29., 0., 17., 0.,\n",
+ " 26., 0., 33., 0., 35., 0., 32., 0., 26.]),\n",
+ " array([ 1. , 1.46875, 1.9375 , 2.40625, 2.875 , 3.34375,\n",
+ " 3.8125 , 4.28125, 4.75 , 5.21875, 5.6875 , 6.15625,\n",
+ " 6.625 , 7.09375, 7.5625 , 8.03125, 8.5 , 8.96875,\n",
+ " 9.4375 , 9.90625, 10.375 , 10.84375, 11.3125 , 11.78125,\n",
+ " 12.25 , 12.71875, 13.1875 , 13.65625, 14.125 , 14.59375,\n",
+ " 15.0625 , 15.53125, 16. , 16.46875, 16.9375 , 17.40625,\n",
+ " 17.875 , 18.34375, 18.8125 , 19.28125, 19.75 , 20.21875,\n",
+ " 20.6875 , 21.15625, 21.625 , 22.09375, 22.5625 , 23.03125,\n",
+ " 23.5 , 23.96875, 24.4375 , 24.90625, 25.375 , 25.84375,\n",
+ " 26.3125 , 26.78125, 27.25 , 27.71875, 28.1875 , 28.65625,\n",
+ " 29.125 , 29.59375, 30.0625 , 30.53125, 31. ]),\n",
" <BarContainer object of 64 artists>)"
]
},
- "execution_count": 87,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
- "image/png": "",
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAigAAAGdCAYAAAA44ojeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAi0ElEQVR4nO3df1BVdeL/8RegXH9eCBUurIioJZJiLSneqVxXWH7IurrSjJZb1Do6utCsUqY0pmU7i2tNP4d0drbVmpEsd1JHK0sxcVtRk3L8VYw6tNjKhVZHrmLgD873jx3v93MTlavofd/r8zFzZrj3vO+973M8E8/OPfcSYlmWJQAAAIOE+nsCAAAAP0WgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADBOJ39P4Hq0trbq+PHj6tmzp0JCQvw9HQAA0A6WZen06dOKi4tTaOjVz5EEZKAcP35c8fHx/p4GAAC4DseOHVPfvn2vOiYgA6Vnz56S/reBdrvdz7MBAADt4Xa7FR8f7/k9fjUBGSiX3tax2+0ECgAAAaY9l2dwkSwAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjEOgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADAOgQIAAIzTyd8TMFH/+R9dcd13S3Jv4UwAALg9cQYFAAAYh0ABAADGIVAAAIBxCBQAAGAcAgUAABiHQAEAAMYhUAAAgHEIFAAAYBwCBQAAGIdAAQAAxuGr7jsQX5EPAEDH4AwKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADAOgQIAAIzjU6AsW7ZMKSkpstvtstvtcjqd+uSTTzzrm5ubVVBQoF69eqlHjx7Ky8tTfX2913PU1tYqNzdX3bp1U3R0tObOnasLFy50zNYAAICg4FOg9O3bV0uWLFFVVZX27NmjsWPHasKECTp48KAkac6cOdqwYYPWrFmjiooKHT9+XJMmTfI8/uLFi8rNzdW5c+e0Y8cOvfPOO1q5cqUWLlzYsVsFAAACWohlWdaNPEFUVJReeuklPfTQQ+rTp4/Kysr00EMPSZK+/fZbDRkyRJWVlRo1apQ++eQT/frXv9bx48cVExMjSVq+fLnmzZunH374QeHh4e16TbfbrYiICDU2Nsput9/I9Nt0vV+4xhe1AQBwZb78/r7ua1AuXryo1atXq6mpSU6nU1VVVTp//rwyMjI8Y5KSktSvXz9VVlZKkiorKzVs2DBPnEhSVlaW3G635yxMW1paWuR2u70WAAAQvHwOlP3796tHjx6y2WyaOXOm1q5dq+TkZLlcLoWHhysyMtJrfExMjFwulyTJ5XJ5xcml9ZfWXUlJSYkiIiI8S3x8vK/TBgAAAcTnQBk8eLD27t2rXbt2adasWcrPz9ehQ4duxtw8iouL1djY6FmOHTt2U18PAAD4l89/LDA8PFyDBg2SJKWmpurLL7/U66+/rsmTJ+vcuXM6deqU11mU+vp6ORwOSZLD4dDu3bu9nu/Sp3wujWmLzWaTzWbzdaoAACBA3fD3oLS2tqqlpUWpqanq3LmzysvLPeuqq6tVW1srp9MpSXI6ndq/f78aGho8YzZv3iy73a7k5OQbnQoAAAgSPp1BKS4uVk5Ojvr166fTp0+rrKxM27Zt06effqqIiAhNmzZNRUVFioqKkt1u15NPPimn06lRo0ZJkjIzM5WcnKxHH31US5culcvl0oIFC1RQUMAZEgAA4OFToDQ0NOixxx5TXV2dIiIilJKSok8//VS/+tWvJEmvvvqqQkNDlZeXp5aWFmVlZemtt97yPD4sLEwbN27UrFmz5HQ61b17d+Xn52vx4sUdu1UAACCg3fD3oPgD34MCAEDguSXfgwIAAHCzECgAAMA4BAoAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjEOgAAAA4xAoAADAOAQK
AAAwDoECAACMQ6AAAADjECgAAMA4BAoAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjEOgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjEOgAAAA43Ty9wQg9Z//0RXXfbck9xbOBAAAM3AGBQAAGIdAAQAAxiFQAACAcQgUAABgHAIFAAAYh0ABAADGIVAAAIBxCBQAAGAcAgUAABiHQAEAAMbxKVBKSko0YsQI9ezZU9HR0Zo4caKqq6u9xowZM0YhISFey8yZM73G1NbWKjc3V926dVN0dLTmzp2rCxcu3PjWAACAoODT3+KpqKhQQUGBRowYoQsXLujZZ59VZmamDh06pO7du3vGTZ8+XYsXL/bc7tatm+fnixcvKjc3Vw6HQzt27FBdXZ0ee+wxde7cWX/+8587YJMAAECg8ylQNm3a5HV75cqVio6OVlVVlUaPHu25v1u3bnI4HG0+x2effaZDhw5py5YtiomJ0T333KMXX3xR8+bN0/PPP6/w8PDr2AwAABBMbugalMbGRklSVFSU1/2rVq1S7969NXToUBUXF+vs2bOedZWVlRo2bJhiYmI892VlZcntduvgwYNtvk5LS4vcbrfXAgAAgpdPZ1D+r9bWVs2ePVv333+/hg4d6rn/kUceUUJCguLi4rRv3z7NmzdP1dXV+vDDDyVJLpfLK04keW67XK42X6ukpEQvvPDC9U4VAAAEmOsOlIKCAh04cEBffPGF1/0zZszw/Dxs2DDFxsYqPT1dR48e1cCBA6/rtYqLi1VUVOS57Xa7FR8ff30TBwAAxruut3gKCwu1ceNGff755+rbt+9Vx6alpUmSjhw5IklyOByqr6/3GnPp9pWuW7HZbLLb7V4LAAAIXj4FimVZKiws1Nq1a7V161YlJiZe8zF79+6VJMXGxkqSnE6n9u/fr4aGBs+YzZs3y263Kzk52ZfpAACAIOXTWzwFBQUqKyvT+vXr1bNnT881IxEREeratauOHj2qsrIyjRs3Tr169dK+ffs0Z84cjR49WikpKZKkzMxMJScn69FHH9XSpUvlcrm0YMECFRQUyGazdfwWAgCAgOPTGZRly5apsbFRY8aMUWxsrGd5//33JUnh4eHasmWLMjMzlZSUpKeeekp5eXnasGGD5znCwsK0ceNGhYWFyel06ne/+50ee+wxr+9NAQAAtzefzqBYlnXV9fHx8aqoqLjm8yQkJOjjjz/25aUBAMBthL/FAwAAjEOgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjEOgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjEOgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjEOgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADCOT4FSUlKiESNGqGfPnoqOjtbEiRNVXV3tNaa5uVkFBQXq1auXevTooby8PNXX13uNqa2tVW5urrp166bo6GjNnTtXFy5cuPGtAQAAQcGnQKmoqFBBQYF27typzZs36/z588rMzFRTU5NnzJw5c7RhwwatWbNGFRUVOn78uCZNmuRZf/HiReXm5urcuXPasWOH3nnnHa1cuVILFy7suK0CAAABrZMvgzdt2uR1e+XKlYqOjlZVVZVGjx6txsZGvf322yorK9PYsWMlSStWrNCQIUO0c+dOjRo1Sp999pkOHTqkLVu2KCYmRvfcc49efPFFzZs3T88//7zCw8M7busAAEBAuqFrUBobGyVJUVFRkqSq
qiqdP39eGRkZnjFJSUnq16+fKisrJUmVlZUaNmyYYmJiPGOysrLkdrt18ODBNl+npaVFbrfbawEAAMHrugOltbVVs2fP1v3336+hQ4dKklwul8LDwxUZGek1NiYmRi6XyzPm/8bJpfWX1rWlpKREERERniU+Pv56pw0AAALAdQdKQUGBDhw4oNWrV3fkfNpUXFysxsZGz3Ls2LGb/poAAMB/fLoG5ZLCwkJt3LhR27dvV9++fT33OxwOnTt3TqdOnfI6i1JfXy+Hw+EZs3v3bq/nu/Qpn0tjfspms8lms13PVAEAQADy6QyKZVkqLCzU2rVrtXXrViUmJnqtT01NVefOnVVeXu65r7q6WrW1tXI6nZIkp9Op/fv3q6GhwTNm8+bNstvtSk5OvpFtAQAAQcKnMygFBQUqKyvT+vXr1bNnT881IxEREeratasiIiI0bdo0FRUVKSoqSna7XU8++aScTqdGjRolScrMzFRycrIeffRRLV26VC6XSwsWLFBBQQFnSQAAgCQfA2XZsmWSpDFjxnjdv2LFCj3++OOSpFdffVWhoaHKy8tTS0uLsrKy9NZbb3nGhoWFaePGjZo1a5acTqe6d++u/Px8LV68+Ma2BAAABA2fAsWyrGuO6dKli0pLS1VaWnrFMQkJCfr44499eWkAAHAb4W/xAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjEOgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjdPL3BHD9+s//6IrrvluSewtnAgBAx+IMCgAAMA6BAgAAjEOgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjEOgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjONzoGzfvl3jx49XXFycQkJCtG7dOq/1jz/+uEJCQryW7OxsrzEnT57U1KlTZbfbFRkZqWnTpunMmTM3tCEAACB4+BwoTU1NGj58uEpLS684Jjs7W3V1dZ7lvffe81o/depUHTx4UJs3b9bGjRu1fft2zZgxw/fZAwCAoNTJ1wfk5OQoJyfnqmNsNpscDkeb67755htt2rRJX375pe677z5J0ptvvqlx48bp5ZdfVlxcnK9TAgAAQeamXIOybds2RUdHa/DgwZo1a5ZOnDjhWVdZWanIyEhPnEhSRkaGQkNDtWvXrjafr6WlRW6322sBAADBq8MDJTs7W++++67Ky8v1l7/8RRUVFcrJydHFixclSS6XS9HR0V6P6dSpk6KiouRyudp8zpKSEkVERHiW+Pj4jp42AAAwiM9v8VzLlClTPD8PGzZMKSkpGjhwoLZt26b09PTres7i4mIVFRV5brvdbiIFAIAgdtM/ZjxgwAD17t1bR44ckSQ5HA41NDR4jblw4YJOnjx5xetWbDab7Ha71wIAAILXTQ+U77//XidOnFBsbKwkyel06tSpU6qqqvKM2bp1q1pbW5WWlnazpwMAAAKAz2/xnDlzxnM2RJJqamq0d+9eRUVFKSoqSi+88ILy8vLkcDh09OhRPfPMMxo0aJCysrIkSUOGDFF2dramT5+u5cuX6/z58yosLNSUKVP4BA8AAJB0HWdQ9uzZo3vvvVf33nuvJKmoqEj33nuvFi5cqLCwMO3bt0+/+c1vdNddd2natGlKTU3VP//5T9lsNs9zrFq1SklJSUpPT9e4ceP0wAMP6K9//WvHbRUAAAhoPp9BGTNmjCzLuuL6Tz/99JrPERUVpbKyMl9fGgAA3Cb4WzwAAMA4BAoAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjEOgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AA
AADjECgAAMA4BAoAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjEOgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADAOgQIAAIzTyd8TwK3Xf/5HV1z33ZLcWzgTAADaxhkUAABgHAIFAAAYh0ABAADGIVAAAIBxCBQAAGAcAgUAABiHQAEAAMYhUAAAgHEIFAAAYBwCBQAAGIdAAQAAxiFQAACAcQgUAABgHAIFAAAYh0ABAADGIVAAAIBxfA6U7du3a/z48YqLi1NISIjWrVvntd6yLC1cuFCxsbHq2rWrMjIydPjwYa8xJ0+e1NSpU2W32xUZGalp06bpzJkzN7QhAAAgePgcKE1NTRo+fLhKS0vbXL906VK98cYbWr58uXbt2qXu3bsrKytLzc3NnjFTp07VwYMHtXnzZm3cuFHbt2/XjBkzrn8rAABAUOnk6wNycnKUk5PT5jrLsvTaa69pwYIFmjBhgiTp3XffVUxMjNatW6cpU6bom2++0aZNm/Tll1/qvvvukyS9+eabGjdunF5++WXFxcXdwOYAAIBg0KHXoNTU1MjlcikjI8NzX0REhNLS0lRZWSlJqqysVGRkpCdOJCkjI0OhoaHatWtXm8/b0tIit9vttQAAgODVoYHicrkkSTExMV73x8TEeNa5XC5FR0d7re/UqZOioqI8Y36qpKREERERniU+Pr4jpw0AAAwTEJ/iKS4uVmNjo2c5duyYv6cEAABuog4NFIfDIUmqr6/3ur++vt6zzuFwqKGhwWv9hQsXdPLkSc+Yn7LZbLLb7V4LAAAIXh0aKImJiXI4HCovL/fc53a7tWvXLjmdTkmS0+nUqVOnVFVV5RmzdetWtba2Ki0trSOnAwAAApTPn+I5c+aMjhw54rldU1OjvXv3KioqSv369dPs2bP1pz/9SXfeeacSExP13HPPKS4uThMnTpQkDRkyRNnZ2Zo+fbqWL1+u8+fPq7CwUFOmTOETPAAAQNJ1BMqePXv0y1/+0nO7qKhIkpSfn6+VK1fqmWeeUVNTk2bMmKFTp07pgQce0KZNm9SlSxfPY1atWqXCwkKlp6crNDRUeXl5euONNzpgcwAAQDDwOVDGjBkjy7KuuD4kJESLFy/W4sWLrzgmKipKZWVlvr40AAC4TQTEp3gAAMDthUABAADGIVAAAIBxCBQAAGAcny+Sxe2r//yPrrjuuyW5t3AmAIBgxxkUAABgHAIFAAAYh0ABAADGIVAAAIBxCBQAAGAcAgUAABiHQAEAAMYhUAAAgHEIFAAAYBwCBQAAGIdAAQAAxiFQAACAcQgUAABgHAIFAAAYh0ABAADGIVAAAIBxCBQAAGAcAgUAABiHQAEAAMYhUAAAgHEIFAAAYBwCBQAAGKeTvyeA4Nd//kdXXPfdktxbOBMAQKDgDAoAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjEOgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4BAoAADAOgQIAAIxDoAAAAOMQKAAAwDgdHijPP/+8QkJCvJakpCTP+ubmZhUUFKhXr17q0aOH8vLyVF9f39HTAAAAAeymnEG5++67VVdX51m++OILz7o5c+Zow4YNWrNmjSoqKnT8+HFNmjTpZkwDAAAEqE435Uk7dZLD4bjs/sbGRr399tsqKyvT2LFjJUkrVqzQkCFDtHPnTo0aNepmTAcAAASYm3IG5fDhw4qLi9OAAQM0depU1dbWSpKqqqp0/vx5ZWRkeMYmJSWpX79+qqysvOLztbS0yO12ey0AACB4dXigpKWlaeXKldq0aZOWLVummpoaPfjggzp9+rRcLpfCw8MVGRnp9ZiYmBi5XK4rPmdJSYkiIiI8S3x8fEdPGwAAGKTD3+LJycnx/JySkqK0tDQlJCTogw8+UNeuXa/rOYuLi1VUVOS57Xa7iRQAAILYTf+YcWRkpO666y4dOXJE
DodD586d06lTp7zG1NfXt3nNyiU2m012u91rAQAAweumB8qZM2d09OhRxcbGKjU1VZ07d1Z5eblnfXV1tWpra+V0Om/2VAAAQIDo8Ld4nn76aY0fP14JCQk6fvy4Fi1apLCwMD388MOKiIjQtGnTVFRUpKioKNntdj355JNyOp18ggcAAHh0eKB8//33evjhh3XixAn16dNHDzzwgHbu3Kk+ffpIkl599VWFhoYqLy9PLS0tysrK0ltvvdXR0wAAAAGswwNl9erVV13fpUsXlZaWqrS0tKNfGgAABAn+Fg8AADAOgQIAAIxzU77qHugI/ed/dMV13y3JvYUzAQDcapxBAQAAxiFQAACAcXiLB7cV3jYCgMDAGRQAAGAcAgUAABiHQAEAAMYhUAAAgHG4SBYAcEtxsXrb2C/eCBQAuA3wyw+BhkABAKADEYMdg0ABACCABWsQESiAoYL1PzoA0B4ECtBO1xsMhMatczvs69thGwGJQAEAfukHCP6dbi8ECgBJ/McfgFn4ojYAAGAczqAAuCGceQFwMxAoQJAhGG4d9nVwux3+fU3eRt7iAQAAxiFQAACAcXiLBwBwRSa/BYDgxhkUAABgHAIFAAAYh7d4ABgn2N9WuNr2ScGxjcCN4gwKAAAwDoECAACMQ6AAAADjcA0KACCoBfs1TcGKMygAAMA4BAoAADAOgQIAAIxDoAAAAOMQKAAAwDgECgAAMA6BAgAAjEOgAAAA4xAoAADAOAQKAAAwDoECAACMQ6AAAADjECgAAMA4fg2U0tJS9e/fX126dFFaWpp2797tz+kAAABD+C1Q3n//fRUVFWnRokX66quvNHz4cGVlZamhocFfUwIAAIbwW6C88sormj59up544gklJydr+fLl6tatm/7+97/7a0oAAMAQnfzxoufOnVNVVZWKi4s994WGhiojI0OVlZWXjW9paVFLS4vndmNjoyTJ7XbflPm1tpy94rqrvSaPM+NxV3Mjzxko2x8oj7uaQNmGm/E4f7wm/7635+Nu9LHX49JzWpZ17cGWH/znP/+xJFk7duzwun/u3LnWyJEjLxu/aNEiSxILCwsLCwtLECzHjh27Ziv45QyKr4qLi1VUVOS53draqpMnT6pXr14KCQm5bLzb7VZ8fLyOHTsmu91+K6cacNhX7ce+aj/2Vfuxr9qPfeUbE/eXZVk6ffq04uLirjnWL4HSu3dvhYWFqb6+3uv++vp6ORyOy8bbbDbZbDav+yIjI6/5Ona73Zh/FNOxr9qPfdV+7Kv2Y1+1H/vKN6btr4iIiHaN88tFsuHh4UpNTVV5ebnnvtbWVpWXl8vpdPpjSgAAwCB+e4unqKhI+fn5uu+++zRy5Ei99tprampq0hNPPOGvKQEAAEP4LVAmT56sH374QQsXLpTL5dI999yjTZs2KSYm5oaf22azadGiRZe9LYTLsa/aj33Vfuyr9mNftR/7yjeBvr9CLKs9n/UBAAC4dfhbPAAAwDgECgAAMA6BAgAAjEOgAAAA4wRdoJSWlqp///7q0qWL0tLStHv3bn9PyTjPP/+8QkJCvJakpCR/T8sY27dv1/jx4xUXF6eQkBCtW7fOa71lWVq4cKFiY2PVtWtXZWRk6PDhw/6ZrJ9da189/vjjlx1r2dnZ/pmsH5WUlGjEiBHq2bOnoqOjNXHiRFVXV3uNaW5uVkFBgXr16qUePXooLy/vsi+zvF20Z3+NGTPmsmNr5syZfpqx/yxbtkwpKSmeL2NzOp365JNPPOsD+bgKqkB5//33VVRUpEWLFumrr77S8OHDlZWVpYaGBn9PzTh333236urqPMsXX3zh7ykZo6mpScOHD1dpaWmb65cuXao33nhDy5cv165du9S9e3dlZWWpubn5Fs/U/661ryQpOzvb61h77733buEMzVBRUaGCggLt3LlTmzdv1vnz55WZmammpibPmDlz5mjDhg1as2aNKioqdPz4cU2aNMmPs/af9uwvSZo+fbrX
sbV06VI/zdh/+vbtqyVLlqiqqkp79uzR2LFjNWHCBB08eFBSgB9XHfLX/wwxcuRIq6CgwHP74sWLVlxcnFVSUuLHWZln0aJF1vDhw/09jYAgyVq7dq3ndmtrq+VwOKyXXnrJc9+pU6csm81mvffee36YoTl+uq8sy7Ly8/OtCRMm+GU+JmtoaLAkWRUVFZZl/e8Y6ty5s7VmzRrPmG+++caSZFVWVvprmsb46f6yLMv6xS9+Yf3xj3/036QMdscdd1h/+9vfAv64CpozKOfOnVNVVZUyMjI894WGhiojI0OVlZV+nJmZDh8+rLi4OA0YMEBTp05VbW2tv6cUEGpqauRyubyOs4iICKWlpXGcXcG2bdsUHR2twYMHa9asWTpx4oS/p+R3jY2NkqSoqChJUlVVlc6fP+91XCUlJalfv34cV7p8f12yatUq9e7dW0OHDlVxcbHOnj3rj+kZ4+LFi1q9erWamprkdDoD/rgKiL9m3B7//e9/dfHixcu+iTYmJkbffvutn2ZlprS0NK1cuVKDBw9WXV2dXnjhBT344IM6cOCAevbs6e/pGc3lcklSm8fZpXX4/7KzszVp0iQlJibq6NGjevbZZ5WTk6PKykqFhYX5e3p+0draqtmzZ+v+++/X0KFDJf3vuAoPD7/sj6ByXLW9vyTpkUceUUJCguLi4rRv3z7NmzdP1dXV+vDDD/04W//Yv3+/nE6nmpub1aNHD61du1bJycnau3dvQB9XQRMoaL+cnBzPzykpKUpLS1NCQoI++OADTZs2zY8zQ7CZMmWK5+dhw4YpJSVFAwcO1LZt25Senu7HmflPQUGBDhw4wHVf7XSl/TVjxgzPz8OGDVNsbKzS09N19OhRDRw48FZP068GDx6svXv3qrGxUf/4xz+Un5+viooKf0/rhgXNWzy9e/dWWFjYZVcn19fXy+Fw+GlWgSEyMlJ33XWXjhw54u+pGO/SscRxdn0GDBig3r1737bHWmFhoTZu3KjPP/9cffv29dzvcDh07tw5nTp1ymv87X5cXWl/tSUtLU2SbstjKzw8XIMGDVJqaqpKSko0fPhwvf766wF/XAVNoISHhys1NVXl5eWe+1pbW1VeXi6n0+nHmZnvzJkzOnr0qGJjY/09FeMlJibK4XB4HWdut1u7du3iOGuH77//XidOnLjtjjXLslRYWKi1a9dq69atSkxM9Fqfmpqqzp07ex1X1dXVqq2tvS2Pq2vtr7bs3btXkm67Y6stra2tamlpCfzjyt9X6Xak1atXWzabzVq5cqV16NAha8aMGVZkZKTlcrn8PTWjPPXUU9a2bdusmpoa61//+peVkZFh9e7d22poaPD31Ixw+vRp6+uvv7a+/vprS5L1yiuvWF9//bX173//27Isy1qyZIkVGRlprV+/3tq3b581YcIEKzEx0frxxx/9PPNb72r76vTp09bTTz9tVVZWWjU1NdaWLVusn//859add95pNTc3+3vqt9SsWbOsiIgIa9u2bVZdXZ1nOXv2rGfMzJkzrX79+llbt2619uzZYzmdTsvpdPpx1v5zrf115MgRa/HixdaePXusmpoaa/369daAAQOs0aNH+3nmt978+fOtiooKq6amxtq3b581f/58KyQkxPrss88sywrs4yqoAsWyLOvNN9+0+vXrZ4WHh1sjR460du7c6e8pGWfy5MlWbGysFR4ebv3sZz+zJk+ebB05csTf0zLG559/bkm6bMnPz7cs638fNX7uueesmJgYy2azWenp6VZ1dbV/J+0nV9tXZ8+etTIzM60+ffpYnTt3thISEqzp06fflv/D0NY+kmStWLHCM+bHH3+0/vCHP1h33HGH1a1bN+u3v/2tVVdX579J+9G19ldtba01evRoKyoqyrLZbNagQYOsuXPnWo2Njf6duB/8/ve/txISEqzw8HCrT58+Vnp6uidOLCuwj6sQy7KsW3e+BgAA4NqC5hoUAAAQPAgUAABgHAIFAAAYh0AB
AADGIVAAAIBxCBQAAGAcAgUAABiHQAEAAMYhUAAAgHEIFAAAYBwCBQAAGIdAAQAAxvl/F3uXZdNo9IkAAAAASUVORK5CYII=",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
@@ -280,40 +322,40 @@
},
{
"cell_type": "code",
- "execution_count": 90,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "(array([1714., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
- " 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
- " 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
- " 0., 0., 0., 0., 0., 305., 0., 0., 0.,\n",
- " 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
- " 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+ "(array([1289., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
+ " 0., 477., 0., 0., 0., 0., 0., 0., 0.,\n",
+ " 0., 0., 0., 192., 0., 0., 0., 0., 0.,\n",
+ " 0., 0., 0., 0., 0., 55., 0., 0., 0.,\n",
+ " 0., 0., 0., 0., 0., 0., 28., 0., 0.,\n",
+ " 0., 0., 0., 0., 0., 0., 0., 0., 5.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
- " 29.]),\n",
- " array([1. , 1.03125, 1.0625 , 1.09375, 1.125 , 1.15625, 1.1875 ,\n",
- " 1.21875, 1.25 , 1.28125, 1.3125 , 1.34375, 1.375 , 1.40625,\n",
- " 1.4375 , 1.46875, 1.5 , 1.53125, 1.5625 , 1.59375, 1.625 ,\n",
- " 1.65625, 1.6875 , 1.71875, 1.75 , 1.78125, 1.8125 , 1.84375,\n",
- " 1.875 , 1.90625, 1.9375 , 1.96875, 2. , 2.03125, 2.0625 ,\n",
- " 2.09375, 2.125 , 2.15625, 2.1875 , 2.21875, 2.25 , 2.28125,\n",
- " 2.3125 , 2.34375, 2.375 , 2.40625, 2.4375 , 2.46875, 2.5 ,\n",
- " 2.53125, 2.5625 , 2.59375, 2.625 , 2.65625, 2.6875 , 2.71875,\n",
- " 2.75 , 2.78125, 2.8125 , 2.84375, 2.875 , 2.90625, 2.9375 ,\n",
- " 2.96875, 3. ]),\n",
+ " 2.]),\n",
+ " array([1. , 1.09375, 1.1875 , 1.28125, 1.375 , 1.46875, 1.5625 ,\n",
+ " 1.65625, 1.75 , 1.84375, 1.9375 , 2.03125, 2.125 , 2.21875,\n",
+ " 2.3125 , 2.40625, 2.5 , 2.59375, 2.6875 , 2.78125, 2.875 ,\n",
+ " 2.96875, 3.0625 , 3.15625, 3.25 , 3.34375, 3.4375 , 3.53125,\n",
+ " 3.625 , 3.71875, 3.8125 , 3.90625, 4. , 4.09375, 4.1875 ,\n",
+ " 4.28125, 4.375 , 4.46875, 4.5625 , 4.65625, 4.75 , 4.84375,\n",
+ " 4.9375 , 5.03125, 5.125 , 5.21875, 5.3125 , 5.40625, 5.5 ,\n",
+ " 5.59375, 5.6875 , 5.78125, 5.875 , 5.96875, 6.0625 , 6.15625,\n",
+ " 6.25 , 6.34375, 6.4375 , 6.53125, 6.625 , 6.71875, 6.8125 ,\n",
+ " 6.90625, 7. ]),\n",
" <BarContainer object of 64 artists>)"
]
},
- "execution_count": 90,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
- "image/png": "",
+ "image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
@@ -328,21 +370,21 @@
},
{
"cell_type": "code",
- "execution_count": 97,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
- "import pickle\n",
+ "# import pickle\n",
"\n",
- "graphs, labels, _ = mkbatch(3*10**5)\n",
+ "# graphs, labels, _ = mkbatch(3*10**5)\n",
"\n",
- "data = {\n",
- " \"data\": graphs,\n",
- " \"labels\": labels\n",
- "}\n",
+ "# data = {\n",
+ "# \"data\": graphs,\n",
+ "# \"labels\": labels\n",
+ "# }\n",
"\n",
- "with open('data.pkl', 'wb') as file:\n",
- " pickle.dump(data, file)"
+ "# with open('data.pkl', 'wb') as file:\n",
+ "# pickle.dump(data, file)"
]
},
{
@@ -356,43 +398,34 @@
},
{
"cell_type": "code",
- "execution_count": 135,
+ "execution_count": 10,
"metadata": {
"id": "tLOWhg_CeWzH"
},
"outputs": [],
"source": [
"class TransformerModel(nn.Module):\n",
- " def __init__(self, input_dim, model_dim, output_dim, num_heads, num_layers, seq_len, device, dropout):\n",
+ " def __init__(self, input_dim, model_dim, output_dim, num_heads, num_layers, seq_len, dropout):\n",
" super().__init__()\n",
- " self.embedding = nn.Embedding(input_dim, model_dim//2, dtype=torch.bfloat16)\n",
- " # seq_len is odd\n",
- " self.fancy_encoding = torch.repeat_interleave(torch.rand((1, seq_len // 2 + 1, model_dim // 2), device=device, dtype=torch.bfloat16), 2, dim=1)\n",
- " # cut off last element since the target vertex is not repeated\n",
- " self.fancy_encoding = self.fancy_encoding[:, :seq_len, :]\n",
- " \n",
" self.model_dim = model_dim\n",
- " self.seq_len = seq_len\n",
- " self.device = device\n",
- "\n",
+ " self.embedding = nn.Embedding(input_dim, model_dim // 2, dtype=torch.bfloat16)\n",
+ " # # seq_len is odd\n",
+ " # self.fancy_encoding = torch.repeat_interleave(torch.rand((1, seq_len // 2 + 1, model_dim // 2), device=device, dtype=torch.bfloat16), 2, dim=1)\n",
+ " # # cut off last element since the target vertex is not repeated\n",
+ " # self.fancy_encoding = self.fancy_encoding[:, :seq_len, :]\n",
" encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=num_heads,\n",
" dim_feedforward=model_dim*4,\n",
" dropout=dropout, batch_first=True, dtype=torch.bfloat16)\n",
" self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)\n",
+ " self.fc_out = nn.Linear(model_dim, output_dim, dtype=torch.bfloat16)\n",
"\n",
- " self.fc_out = nn.Linear(model_dim*seq_len, output_dim, dtype=torch.bfloat16)\n",
- "\n",
- " def full_embedding(self, src):\n",
- " batch_size, src_len = src.size(0), src.size(1)\n",
- " return torch.cat((self.embedding(src) * sqrt(self.model_dim), self.fancy_encoding.repeat((batch_size, 1, 1))), dim=2)\n",
- " \n",
" def forward(self, src, key_padding_mask):\n",
- " embed = self.full_embedding(src)\n",
- " output = self.transformer_encoder(embed, src_key_padding_mask=key_padding_mask)\n",
- " output[key_padding_mask] = 0 # Hack to stop no_grad problem\n",
- " flat_output = torch.flatten(output, start_dim=1, end_dim=2)\n",
- " output = self.fc_out(flat_output)\n",
- " return output"
+ " batch_sz = src.size(0)\n",
+ " embed = torch.cat((self.embedding(src[:,:-1:2]), self.embedding(src[:,1::2])), dim=2)\n",
+ " last_dude = torch.cat((self.embedding(src[:,-1:]), torch.ones((batch_sz, 1, self.model_dim // 2), dtype=torch.bfloat16, device=device)), dim=2)\n",
+ " final_embed = torch.cat((embed, last_dude), dim=1)\n",
+ " output = self.transformer_encoder(final_embed, src_key_padding_mask=key_padding_mask[:, ::2])\n",
+ " return self.fc_out(output[:, -1, :])"
]
},
{
@@ -406,7 +439,7 @@
},
{
"cell_type": "code",
- "execution_count": 136,
+ "execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
@@ -419,8 +452,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Training data: 1049M\n",
- "Trainable parameters in the model: 50K\n"
+ "Training data: 524M\n",
+ "Trainable parameters in the model: 800K\n"
]
}
],
@@ -429,17 +462,17 @@
"VOCAB_SIZE = 1 + MAX_VTXS # one more than the max number of vertices\n",
"MODEL_DIM = 64 # Dimension of model (embedding and transformer)\n",
"NEPOCHS = 1000\n",
- "BSZ = 8196 * 4 # Batch size\n",
+ "BSZ = 2**14 # Batch size\n",
"BPE = 32 # Batches per epoch\n",
- "LR = 5e-3\n",
- "WD = 2e-3\n",
- "NHEADS = 1 #4\n",
- "NLAYERS = 1 #16\n",
- "DROPOUT = 0.2\n",
+ "LR = 5e-5\n",
+ "WD = 1e-5\n",
+ "NHEADS = 4\n",
+ "NLAYERS = 16\n",
+ "DROPOUT = 0 # 0.2\n",
"model = TransformerModel(input_dim=VOCAB_SIZE, model_dim=MODEL_DIM,\n",
" output_dim=1, num_heads=NHEADS,\n",
" num_layers=NLAYERS, seq_len=SEQ_LEN,\n",
- " dropout=DROPOUT, device=device).to(device)\n",
+ " dropout=DROPOUT).to(device)\n",
"# model = torch.compile(model)\n",
"\n",
"criterion = nn.MSELoss()\n",
@@ -452,21 +485,38 @@
},
{
"cell_type": "code",
- "execution_count": 137,
+ "execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
- "from torch.utils.data import DataLoader, TensorDataset\n",
+ "# from torch.utils.data import DataLoader, TensorDataset\n",
"\n",
- "with open(\"data.pkl\", \"rb\") as f:\n",
- " pickled_stuff = pickle.load(f)\n",
+ "# with open(\"data.pkl\", \"rb\") as f:\n",
+ "# pickled_stuff = pickle.load(f)\n",
"\n",
- "data = pickled_stuff[\"data\"].to(device)\n",
- "label = pickled_stuff[\"labels\"].to(device)\n",
- "padding_mask = (data == PAD_TOKEN).bool().to(device)\n",
- "dataset = TensorDataset(data, label, padding_mask)\n",
- "# train_dataset, test_dataset = torch.utils.data.random_split(dataset, [.9, .1])\n",
- "train_loader = DataLoader(dataset, batch_size=BSZ, shuffle=True)"
+ "# data = pickled_stuff[\"data\"].to(device)\n",
+ "# label = pickled_stuff[\"labels\"].to(device)\n",
+ "# padding_mask = (data == PAD_TOKEN).bool().to(device)\n",
+ "# dataset = TensorDataset(data, label, padding_mask)\n",
+ "# # train_dataset, test_dataset = torch.utils.data.random_split(dataset, [.9, .1])\n",
+ "# train_loader = DataLoader(dataset, batch_size=BSZ, shuffle=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# figure out if it's doing better on short paths\n",
+ "def evaluate_short():\n",
+ " model.eval()\n",
+ " test_loss = 0\n",
+ " with torch.no_grad():\n",
+ " batch_src, batch_labels, batch_padding_mask = mkbatch(BSZ)\n",
+ " output = model(batch_src, batch_padding_mask)\n",
+ " loss = criterion(output[batch_labels == 1].squeeze(1), batch_labels[batch_labels==1])\n",
+ " return loss.item()"
]
},
{
@@ -480,7 +530,7 @@
},
{
"cell_type": "code",
- "execution_count": 138,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -496,7 +546,7 @@
},
{
"cell_type": "code",
- "execution_count": 139,
+ "execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
@@ -510,580 +560,119 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.01it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 1/1000 \t Train Err: 0.5381 \t Test Err: 0.1865\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.75it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 2/1000 \t Train Err: 0.1227 \t Test Err: 0.1128\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.03it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 3/1000 \t Train Err: 0.1071 \t Test Err: 0.1118\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.01it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 4/1000 \t Train Err: 0.1008 \t Test Err: 0.1035\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.75it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 5/1000 \t Train Err: 0.0972 \t Test Err: 0.1021\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.03it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 6/1000 \t Train Err: 0.0949 \t Test Err: 0.0981\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.76it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 7/1000 \t Train Err: 0.0929 \t Test Err: 0.1021\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.00it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 8/1000 \t Train Err: 0.0908 \t Test Err: 0.0977\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.01it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 9/1000 \t Train Err: 0.0886 \t Test Err: 0.0952\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.72it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 10/1000 \t Train Err: 0.0910 \t Test Err: 0.0962\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.02it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 11/1000 \t Train Err: 0.0851 \t Test Err: 0.0898\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.03it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 12/1000 \t Train Err: 0.0849 \t Test Err: 0.0864\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.74it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 13/1000 \t Train Err: 0.0795 \t Test Err: 0.0684\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.02it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 14/1000 \t Train Err: 0.0691 \t Test Err: 0.0293\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.74it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 15/1000 \t Train Err: 0.0455 \t Test Err: 0.0271\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 6.99it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 16/1000 \t Train Err: 0.0421 \t Test Err: 0.0210\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 6.99it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 17/1000 \t Train Err: 0.0817 \t Test Err: 0.0505\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.75it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 18/1000 \t Train Err: 0.0456 \t Test Err: 0.0176\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.03it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 19/1000 \t Train Err: 0.0370 \t Test Err: 0.0165\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.03it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 20/1000 \t Train Err: 0.0374 \t Test Err: 0.0205\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.73it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 21/1000 \t Train Err: 0.0372 \t Test Err: 0.0142\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.01it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 22/1000 \t Train Err: 0.0343 \t Test Err: 0.0132\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.75it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 23/1000 \t Train Err: 0.0337 \t Test Err: 0.0119\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.03it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 24/1000 \t Train Err: 0.0713 \t Test Err: 0.0259\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.04it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 25/1000 \t Train Err: 0.0522 \t Test Err: 0.0143\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.75it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 26/1000 \t Train Err: 0.0342 \t Test Err: 0.0117\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.02it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 27/1000 \t Train Err: 0.0864 \t Test Err: 0.0728\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.04it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 28/1000 \t Train Err: 0.0701 \t Test Err: 0.0510\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.72it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 29/1000 \t Train Err: 0.0598 \t Test Err: 0.0369\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 6.99it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 30/1000 \t Train Err: 0.0462 \t Test Err: 0.0231\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.73it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 31/1000 \t Train Err: 0.0387 \t Test Err: 0.0181\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.02it/s]\n"
+ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:13<00:00, 2.30it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Epoch 32/1000 \t Train Err: 0.0351 \t Test Err: 0.0142\n"
+ "Epoch 1/1000 \t Train Err: 149.6562 \t Test Err: 137.0000, Test short loss: 0.3164\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.04it/s]\n"
+ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:13<00:00, 2.41it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Epoch 33/1000 \t Train Err: 0.0337 \t Test Err: 0.0123\n"
+ "Epoch 2/1000 \t Train Err: 129.4688 \t Test Err: 120.0000, Test short loss: 0.4512\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.74it/s]\n"
+ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:13<00:00, 2.39it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Epoch 34/1000 \t Train Err: 0.0331 \t Test Err: 0.0117\n"
+ "Epoch 3/1000 \t Train Err: 116.1719 \t Test Err: 110.5000, Test short loss: 2.2500\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.02it/s]\n"
+ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:13<00:00, 2.38it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Epoch 35/1000 \t Train Err: 0.0329 \t Test Err: 0.0119\n"
+ "Epoch 4/1000 \t Train Err: 107.9375 \t Test Err: 104.0000, Test short loss: 4.0625\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 6.98it/s]\n"
+ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:13<00:00, 2.37it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Epoch 36/1000 \t Train Err: 0.0327 \t Test Err: 0.0109\n"
+ "Epoch 5/1000 \t Train Err: 103.0938 \t Test Err: 102.0000, Test short loss: 5.5625\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.74it/s]\n"
+ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:13<00:00, 2.38it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Epoch 37/1000 \t Train Err: 0.0319 \t Test Err: 0.0101\n"
+ "Epoch 6/1000 \t Train Err: 99.9531 \t Test Err: 99.0000, Test short loss: 6.6250\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.02it/s]\n"
+ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:13<00:00, 2.41it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Epoch 38/1000 \t Train Err: 0.0317 \t Test Err: 0.0099\n"
+ "Epoch 7/1000 \t Train Err: 99.0469 \t Test Err: 101.0000, Test short loss: 7.2188\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.71it/s]\n"
+ "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:13<00:00, 2.39it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Epoch 39/1000 \t Train Err: 0.0313 \t Test Err: 0.0095\n"
+ "Epoch 8/1000 \t Train Err: 97.8594 \t Test Err: 97.5000, Test short loss: 7.6250\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:04<00:00, 7.01it/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Epoch 40/1000 \t Train Err: 0.0310 \t Test Err: 0.0097\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- " 28%|██████████████████████████████████████████▊ | 9/32 [00:01<00:03, 6.50it/s]\n"
- ]
- },
- {
- "ename": "KeyboardInterrupt",
- "evalue": "",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[139], line 11\u001b[0m\n\u001b[1;32m 9\u001b[0m train_loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m tqdm(\u001b[38;5;28mrange\u001b[39m(BPE)):\n\u001b[0;32m---> 11\u001b[0m batch_src, batch_labels, batch_padding_mask \u001b[38;5;241m=\u001b[39m \u001b[43mmkbatch\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBSZ\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# for batch_src, batch_labels, batch_padding_mask in tqdm(train_loader):\u001b[39;00m\n\u001b[1;32m 13\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mzero_grad()\n",
- "Cell \u001b[0;32mIn[76], line 55\u001b[0m, in \u001b[0;36mmkbatch\u001b[0;34m(size)\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(size):\n\u001b[1;32m 54\u001b[0m n \u001b[38;5;241m=\u001b[39m random\u001b[38;5;241m.\u001b[39mrandint(MIN_VTXS, MAX_VTXS)\n\u001b[0;32m---> 55\u001b[0m edge_list, adj_list \u001b[38;5;241m=\u001b[39m \u001b[43mrandom_graph\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 56\u001b[0m dist \u001b[38;5;241m=\u001b[39m SSSP(n, adj_list)\n\u001b[1;32m 57\u001b[0m edge_list[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;66;03m# target token\u001b[39;00m\n",
- "Cell \u001b[0;32mIn[76], line 15\u001b[0m, in \u001b[0;36mrandom_graph\u001b[0;34m(n)\u001b[0m\n\u001b[1;32m 13\u001b[0m adjacencies \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28mset\u001b[39m() \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(n\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m1\u001b[39m)]\n\u001b[1;32m 14\u001b[0m indices \u001b[38;5;241m=\u001b[39m [random\u001b[38;5;241m.\u001b[39mrandint(\u001b[38;5;241m1\u001b[39m, n) \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(AVG_DEG \u001b[38;5;241m*\u001b[39m (n\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m))]\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;43mrange\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mindices\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 16\u001b[0m u \u001b[38;5;241m=\u001b[39m indices[i]\n\u001b[1;32m 17\u001b[0m v \u001b[38;5;241m=\u001b[39m indices[i \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m1\u001b[39m]\n",
- "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+ " 44%|██████████████████████████████████████████████████████████████████ | 14/32 [00:05<00:07, 2.41it/s]"
]
}
],
@@ -1108,12 +697,13 @@
" optimizer.step()\n",
"\n",
" test_loss = evaluate()\n",
+ " test_short_loss = evaluate_short()\n",
" \n",
" test_err.append(test_loss)\n",
" train_err.append(train_loss)\n",
" with open('loss', 'a') as f:\n",
" f.write(f\"{train_loss} {test_loss}\\n\")\n",
- " print(f\"Epoch {epoch + 1}/{NEPOCHS} \\t Train Err: {train_loss:.4f} \\t Test Err: {test_loss:.4f}\")\n",
+ " print(f\"Epoch {epoch + 1}/{NEPOCHS} \\t Train Err: {train_loss:.4f} \\t Test Err: {test_loss:.4f}, Test short loss: {test_short_loss:.4f}\")\n",
" \n",
" if epoch % 100 == 99:\n",
" torch.save(model.state_dict(), f\"model_weights_{epoch}.pth\")"
@@ -1121,55 +711,38 @@
},
{
"cell_type": "code",
- "execution_count": 163,
+ "execution_count": 125,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(torch.Size([1, 7, 64]), torch.Size([64, 64]))"
- ]
- },
- "execution_count": 163,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "\"\"\"\n",
- "Now let's figure out what it's doing. \n",
+ "# \"\"\"\n",
+ "# Now let's figure out what it's doing. \n",
"\n",
- "step 1: figure out what people are attending to \n",
- "\"\"\"\n",
+ "# step 1: figure out what people are attending to \n",
+ "# \"\"\"\n",
+ "\n",
+ "# example_graph, answer, padding = mkbatch(1)\n",
+ "# sentance_embeddings = model.full_embedding(example_graph)[0,:,:][example_graph.flatten() != 0]\n",
+ "# WQ,WK,WV = torch.split(model.transformer_encoder.layers[0].self_attn.in_proj_weight, (MODEL_DIM, MODEL_DIM, MODEL_DIM))\n",
"\n",
- "example_graph, answer, padding = mkbatch(1)\n",
- "sentance_embeddings = model.full_embedding(example_graph)\n",
- "Q,K,V = torch.split(model.transformer_encoder.layers[0].self_attn.in_proj_weight, (MODEL_DIM, MODEL_DIM, MODEL_DIM))\n",
+ "# Q = sentance_embeddings@WQ\n",
+ "# K = sentance_embeddings@WK\n",
"\n",
- "sentance_embeddings.shape, Q.shape\n",
- "sentance_embeddings@Q.T\n",
+ "# raw_scores = Q @ K.T / sqrt(MODEL_DIM)\n",
+ "# soft = torch.softmax(raw_scores, dim=-1).detach().cpu().to(float).numpy()\n",
+ "# plt.imshow(soft)\n",
+ "# plt.show()\n",
"\n",
- "# (sentance_embeddings @ Q).shape\n",
- "# sentance_embeddings.shape\n",
- "# K @ sentance_embeddings"
+ "# print(example_graph)\n",
+ "\n",
+ "# print(Q)"
]
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "<matplotlib.legend.Legend at 0x702d2d2eed20>"
- ]
- },
- "execution_count": 34,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"plt.suptitle('MSE vs Epochs')\n",
"plt.plot(train_err, label='Train', color='blue')\n",
@@ -1182,83 +755,17 @@
},
{
"cell_type": "code",
- "execution_count": 35,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "24.625"
- ]
- },
- "execution_count": 35,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "evaluate()"
- ]
- },
- {
- "cell_type": "code",
"execution_count": null,
"metadata": {
"id": "LoGEmM5lH7_A"
},
"outputs": [],
"source": [
- "batch_src, batch_labels, batch_padding_mask = next(iter(train_loader))\n",
+ "batch_src, batch_labels, batch_padding_mask = mkbatch(4096)\n",
"output = model(batch_src, batch_padding_mask)\n",
- "batch_src[0], batch_labels[0], output[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.hist(output.detach().cpu().numpy().flatten(),bins=32)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.hist(label.detach().cpu().numpy().flatten(),bins=32)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.scatter(batch_labels.detach().cpu().numpy().flatten(),output.detach().cpu().numpy().flatten())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "batch_src2, batch_labels2, batch_padding_mask2 = next(iter(test_loader))\n",
- "output2 = model(batch_src2, batch_padding_mask2)\n",
- "loss = criterion(output2.squeeze(1), batch_labels2)\n",
- "batch_src2[0], batch_labels2[0], output2[0], loss"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.scatter(batch_labels2.detach().cpu().numpy().flatten(),output2.detach().cpu().numpy().flatten())"
+ "batch_src[0], batch_labels[0], output[0]\n",
+ "plt.scatter(batch_labels.detach().to(torch.float16).cpu().numpy().flatten(), output.detach().to(torch.float16).cpu().numpy().flatten())\n",
+ "plt.show()"
]
},
{
@@ -1277,8 +784,8 @@
"outputs": [],
"source": [
"N_TUNE_EPOCHS = 100\n",
- "TUNE_LR = 0.003\n",
- "TUNE_WD = 0.002\n",
+ "TUNE_LR = 1e-5\n",
+ "TUNE_WD = 1e-5\n",
"\n",
"tune_criterion = nn.MSELoss()\n",
"tune_optimizer = torch.optim.Adam(model.parameters(), lr=TUNE_LR, weight_decay=TUNE_WD)"
@@ -1306,32 +813,6 @@
"metadata": {},
"outputs": [],
"source": [
- "# This has to be in a separate cell for some weird event loop reasons\n",
- "%matplotlib widget\n",
- "fig,ax = plt.subplots()\n",
- "fig.suptitle('MSE vs Epochs')\n",
- "plt.show()\n",
- "%matplotlib inline"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "ename": "NameError",
- "evalue": "name 'N_TUNE_EPOCHS' is not defined",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[13], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m tune_train_err \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 2\u001b[0m tune_test_err \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m epoch \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[43mN_TUNE_EPOCHS\u001b[49m):\n\u001b[1;32m 5\u001b[0m model\u001b[38;5;241m.\u001b[39mtrain()\n\u001b[1;32m 6\u001b[0m train_loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
- "\u001b[0;31mNameError\u001b[0m: name 'N_TUNE_EPOCHS' is not defined"
- ]
- }
- ],
- "source": [
"tune_train_err = []\n",
"tune_test_err = []\n",
"\n",
@@ -1346,7 +827,7 @@
" optimizer.zero_grad()\n",
" output = model(batch_src, batch_padding_mask)\n",
" loss = criterion(output.squeeze(1), batch_labels)\n",
- " train_loss += loss.item()/BPE\n",
+ " train_loss += loss.item() / BPE\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
@@ -1356,11 +837,6 @@
" tune_train_err.append(train_loss)\n",
" with open('tune_loss', 'a') as f:\n",
" f.write(f\"{train_loss} {test_loss}\\n\")\n",
- " ax.plot(tune_train_err, label='Train', color='blue')\n",
- " ax.plot(tune_test_err, label='Test', color='red')\n",
- " ax.set_xlabel('Epochs')\n",
- " ax.set_ylabel('MSE')\n",
- " fig.canvas.draw()\n",
" print(f\"Epoch {epoch + 1}/{NEPOCHS} \\t Train Err: {train_loss:.4f} \\t Test Err: {test_loss:.4f}\")\n",
"\n",
" if epoch % 10 == 9:\n",
@@ -1368,6 +844,33 @@
]
},
{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fig, ax = plt.subplots()  # explicit figure/axes for the tune-phase loss curves\n",
+ "fig.suptitle('MSE vs Epochs')\n",
+ "ax.plot(tune_train_err, label='Train', color='blue')\n",
+ "ax.plot(tune_test_err, label='Test', color='red')\n",
+ "ax.set(xlabel='Epochs', ylabel='MSE')\n",
+ "ax.legend()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "batch_src, batch_labels, batch_padding_mask = mktunebatch(2048)\n",
+ "with torch.no_grad():  # plotting only, so skip building an autograd graph for the 2048-sample forward pass\n",
+ "    output = model(batch_src, batch_padding_mask)\n",
+ "plt.scatter(batch_labels.detach().to(torch.float16).cpu().numpy().flatten(), output.detach().to(torch.float16).cpu().numpy().flatten());"
+ ]
+ },
+ {
"cell_type": "markdown",
"metadata": {
"id": "JtTLXn4zC1z_"