From 24a41520c094a15c56bdc1f0320db52bb3c0a7f7 Mon Sep 17 00:00:00 2001
From: Kevin Zhao
Date: Wed, 6 Nov 2024 18:41:24 -0500
Subject: Reorganize notebooks into notebooks/, add finetune_bert.py, official_run_clm.py, utils.py, and requirements.txt

---
 Right_to_Left_NLP.ipynb | 373 -----------
 causal_v2.ipynb | 1223 -------------------------------------
 finetune_bert.py | 663 ++++++++++++++++++++
 notebooks/Right_to_Left_NLP.ipynb | 373 +++++++++++
 notebooks/causal_v2.ipynb | 1223 +++++++++++++++++++++++++++++++++++++
 notebooks/rtl.ipynb | 191 ++++++
 official_run_clm.py | 657 ++++++++++++++++++++
 requirements.txt | 4 +
 rtl.ipynb | 191 ------
 utils.py | 45 ++
 10 files changed, 3156 insertions(+), 1787 deletions(-)
 delete mode 100644 Right_to_Left_NLP.ipynb
 delete mode 100644 causal_v2.ipynb
 create mode 100644 finetune_bert.py
 create mode 100644 notebooks/Right_to_Left_NLP.ipynb
 create mode 100644 notebooks/causal_v2.ipynb
 create mode 100644 notebooks/rtl.ipynb
 create mode 100644 official_run_clm.py
 create mode 100644 requirements.txt
 delete mode 100644 rtl.ipynb
 create mode 100644 utils.py

diff --git a/Right_to_Left_NLP.ipynb b/Right_to_Left_NLP.ipynb
deleted file mode 100644
index bcbc57a..0000000
--- a/Right_to_Left_NLP.ipynb
+++ /dev/null
@@ -1,373 +0,0 @@
-{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "provenance": []
- },
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3"
- },
- "language_info": {
- "name": "python"
- }
- },
- "cells": [
- {
- "cell_type": "code",
- "source": [
- "%pip install datasets torch transformers"
- ],
- "metadata": {
- "collapsed": true,
- "id": "M29-oTOBIiMr"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "2iJJyERxHWSO",
- "outputId": "04e9bc9d-5ee9-48d5-f370-6fd66ec7b7c1"
- },
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "cpu\n"
- ]
- }
- ],
- "source": [
- "import torch\n",
- "import torch.nn as nn\n",
- "import transformers\n",
- "from datasets import load_dataset\n",
- "\n",
- "transformers.set_seed(42)\n",
- "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
- "print(device)"
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "model_name_or_path = \"bert-base-uncased\"\n",
- "model = transformers.AutoModelForMaskedLM.from_pretrained(model_name_or_path, torch_dtype=torch.bfloat16, attn_implementation=\"sdpa\")\n",
- "model.eval()\n",
- "\n",
- "tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "C5PdaHGWHuXG",
- "outputId": "d15272a5-1ce1-4c7e-9004-fc686a3de6b9"
- },
- "execution_count": null,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stderr",
- "text": [
- "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n",
- "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
- "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
- "You will be able to reuse this secret in all of your notebooks.\n",
- "Please note that authentication is recommended but still optional to access public models or datasets.\n",
- " warnings.warn(\n",
- "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 
'cls.seq_relationship.weight']\n", - "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", - " warnings.warn(\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "def ltr_mask(seq_len: int) -> torch.Tensor:\n", - " mask = torch.ones((seq_len, seq_len), dtype=bool)\n", - " return torch.tril(mask, diagonal=-1)\n", - "\n", - "def rtl_mask(seq_len: int) -> torch.Tensor:\n", - " return ltr_mask(seq_len).T" - ], - "metadata": { - "id": "H_AUjBRoJHXU" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "model.register_buffer(\"attn_mask\", rtl_mask(model.config.max_position_embeddings).to(model.device))\n", - "\n", - "def attn_hook(attn_module: nn.Module, args: tuple, kwargs: dict):\n", - " \"\"\"\n", - " Assuming https://github.com/huggingface/transformers/blob/33868a057c02f0368ba63bd1edb746be38fe3d90/src/transformers/models/bert/modeling_bert.py#L515\n", - " so no `kwargs` and `attention_mask` is second positional arg.\n", - "\n", - " Uses global `model.attn_mask` to save memory.\n", - " \"\"\"\n", - " assert not kwargs\n", - "\n", - " args = list(args)\n", - " assert args[1].size()[-2:] == model.attn_mask.size(), f\"{args[1].size()=} {model.attn_mask.size()=}\"\n", - " args[1] = model.attn_mask\n", - " return tuple(args), kwargs\n", - "\n", - "def debug_inputs_hook(attn_module: nn.Module, args: tuple, output):\n", - " print(f\"Post-forward checks\")\n", - " assert torch.equal(args[1], model.attn_mask), (args[1], model.attn_mask)" - ], - "metadata": { - "id": "Oy27MZcLLLsD" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# attn_mask = rtl_mask(model.config.max_position_embeddings)\n", - "for name, module in model.named_modules():\n", - " if isinstance(module, transformers.models.bert.modeling_bert.BertSelfAttention):\n", - " module._forward_hooks.clear() # running multiple times right now during testing\n", - " module.register_forward_pre_hook(attn_hook, with_kwargs=True)\n", - " module.register_forward_hook(debug_inputs_hook)\n", - " # module.register_buffer(\"attn_mask\", attn_mask)\n", - "\n", - "model = model.to(device)" - ], - "metadata": { - "id": "anEdwKj_OWWy" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "ds = load_dataset(\"Salesforce/wikitext\", \"wikitext-103-v1\")" - ], - "metadata": { - "id": "P1BEQFsLIRfX" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "train_ds = ds[\"train\"]\n", - "inputs = tokenizer(train_ds[5][\"text\"], return_tensors=\"pt\", padding='max_length', truncation=True)\n", - "inputs = {key: val.to(device) for key, val in inputs.items()}\n", - "\n", - "with 
torch.no_grad():\n", - " outputs = model(**inputs)\n", - "\n", - "outputs.logits" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "BHE26Mr2NXhH", - "outputId": "24569931-61d7-4752-8b08-4daef58f9798" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "tensor([[[-5.6250, -5.5938, -5.5938, ..., -5.4688, -4.9688, -2.4844],\n", - " [-9.2500, -8.9375, -9.3750, ..., -8.5000, -7.5000, -4.0312],\n", - " [-4.9062, -4.8750, -5.2812, ..., -5.0625, -4.4375, -1.8281],\n", - " ...,\n", - " [-5.5938, -5.7500, -5.7812, ..., -6.1562, -3.9688, -2.2812],\n", - " [-4.7188, -4.8750, -4.8750, ..., -5.0625, -3.4531, -2.4375],\n", - " [-4.1875, -3.9375, -3.9062, ..., -3.3438, -3.2344, -3.2031]]],\n", - " device='cuda:0', dtype=torch.bfloat16)" - ] - }, - "metadata": {}, - "execution_count": 9 - } - ] - }, - { - "cell_type": "code", - "source": [ - "with torch.inference_mode():\n", - " model.register_buffer(\"attn_mask\", ltr_mask(model.config.max_position_embeddings).to(model.device))\n", - " outputs = model(**inputs)\n", - "\n", - "outputs.logits" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ZtEm7eQQNi4e", - "outputId": "c0eb3925-6d48-480e-a853-5057f35dbcd2" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "tensor([[[-7.9062, -7.7812, -7.9062, ..., -7.1250, -7.8438, -4.8438],\n", - " [-7.1562, -7.1250, -7.2812, ..., -7.3750, -7.3750, -7.2500],\n", - " [-5.4062, -5.2188, -5.4375, ..., -5.3438, -4.3750, -5.0312],\n", - " ...,\n", - " [ 3.9844, 3.6406, 3.6406, ..., 3.8281, 2.9062, 5.2812],\n", - " [ 4.0938, 3.7812, 3.8281, ..., 4.0000, 2.9844, 5.5000],\n", - " [ 3.8281, 3.5312, 3.5156, ..., 4.1562, 2.8438, 4.7188]]],\n", - " device='cuda:0', dtype=torch.bfloat16)" - ] - }, - "metadata": {}, - "execution_count": 10 - } - ] - }, - { - "cell_type": "code", - "source": [ - "with torch.inference_mode():\n", - " model.register_buffer(\"attn_mask\", rtl_mask(model.config.max_position_embeddings).to(model.device))\n", - " outputs = model(**inputs)\n", - "\n", - "outputs.logits" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "nz0j7V3oNkZu", - "outputId": "939b1d6d-5dca-41ef-eb17-9e0f4d09629e" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - "Post-forward checks\n", - 
"Post-forward checks\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "tensor([[[-5.6250, -5.5938, -5.5938, ..., -5.4688, -4.9688, -2.4844],\n", - " [-9.2500, -8.9375, -9.3750, ..., -8.5000, -7.5000, -4.0312],\n", - " [-4.9062, -4.8750, -5.2812, ..., -5.0625, -4.4375, -1.8281],\n", - " ...,\n", - " [-5.5938, -5.7500, -5.7812, ..., -6.1562, -3.9688, -2.2812],\n", - " [-4.7188, -4.8750, -4.8750, ..., -5.0625, -3.4531, -2.4375],\n", - " [-4.1875, -3.9375, -3.9062, ..., -3.3438, -3.2344, -3.2031]]],\n", - " device='cuda:0', dtype=torch.bfloat16)" - ] - }, - "metadata": {}, - "execution_count": 11 - } - ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "82PpSWnrdMgu" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# Wikipedia test\n", - "from datasets import load_dataset\n", - "\n", - "ds = load_dataset(\"wikimedia/wikipedia\", \"20231101.en\")\n", - "print(ds[\"train\"][1000])" - ], - "metadata": { - "id": "DHftDnPKdMjV" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "FTMkfLyKdMqu" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/causal_v2.ipynb b/causal_v2.ipynb deleted file mode 100644 index 2d91a12..0000000 --- a/causal_v2.ipynb +++ /dev/null @@ -1,1223 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "8ddb479e-9d7e-4392-8fc0-fd1c66a07a2b", - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import transformers\n", - "transformers.set_seed(42)\n", - "device = \"cuda\"" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "fef43d6f-5164-405e-bdc6-8484283c134b", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.\n", - " - If you're using `trust_remote_code=True`, you can get rid of this warning by loading the model with an auto class. See https://huggingface.co/docs/transformers/en/model_doc/auto#auto-classes\n", - " - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).\n", - " - If you are not the owner of the model architecture class, please contact the model code owner to update it.\n", - "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n", - "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" - ] - } - ], - "source": [ - "from transformers import AutoModelForMaskedLM\n", - "model = AutoModelForMaskedLM.from_pretrained(\"bert-base-uncased\", torch_dtype=torch.float16, attn_implementation=\"sdpa\").to(device)\n", - "# model = BertForMaskedLM.from_pretrained(\"bert-base-uncased\", torch_dtype=torch.float16, attn_implementation=\"sdpa\").to(device)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "39475a5f-63a3-4957-92b0-caf75bfe40bf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "12" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.config.num_attention_heads" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "b6eb1d9c-519f-4e02-890e-3acb8dfffd08", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "model.config.is_decoder = True # this was super important" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "43acd054-4351-409d-a1a0-62b1c101f00f", - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import AutoTokenizer\n", - "\n", - "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "0a132496-6c2d-4494-9c37-a60c632a00d1", - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "\n", - "ds = load_dataset(\"Salesforce/wikitext\", \"wikitext-103-v1\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "0785e905-cb48-4d0f-878c-42276dce31c6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([[0, 0, 0, 0, 0],\n", - " [1, 0, 0, 0, 0],\n", - " [1, 1, 0, 0, 0],\n", - " [1, 1, 1, 0, 0],\n", - " [1, 1, 1, 1, 0]])\n", - "tensor([[0, 1, 1, 1, 1, 1],\n", - " [0, 0, 1, 1, 1, 1],\n", - " [0, 0, 0, 1, 1, 1],\n", - " [0, 0, 0, 0, 1, 1],\n", - " [0, 0, 0, 0, 0, 1],\n", - " [0, 0, 0, 0, 0, 0]])\n" - ] - } - ], - "source": [ - "def ltrattn(shape):\n", - " mask = torch.full(shape,1)\n", - " return torch.tril(mask, diagonal=-1)\n", - "\n", - "def rtlattn(shape):\n", - " mask = torch.full(shape,1)\n", - " return torch.triu(mask, diagonal=1)\n", - "\n", - "print(ltrattn((5,5)))\n", - "print(rtlattn((6,6)))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "0b886fde-2f43-4112-bb5a-b5038502902d", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([1, 512])" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_ds = ds[\"train\"]\n", - "inputs = tokenizer(train_ds[10][\"text\"], return_tensors=\"pt\", padding='max_length', truncation=True)\n", - "\n", - "inputs[\"input_ids\"].size()" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "id": "e906cacf-cf4f-41c3-9c29-99ab895e171a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\" It met with positive sales in Japan , and was praised by both Japanese and western critics . 
After release , it received downloadable content , along with an expanded edition in November of that year . It was also adapted into manga and an original video animation series . Due to low sales of Valkyria Chronicles II , Valkyria Chronicles III was not localized , but a fan translation compatible with the game 's expanded edition was released in 2014 . Media.Vision would return to the franchise with the development of Valkyria : Azure Revolution for the PlayStation 4 . \\n\"" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_ds[5][\"text\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "65b45df5-1d56-4042-af24-82bf68ae3fe1", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "' The game \\'s battle system , the system , is carried over directly from Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \\' turns . Each character has a field and distance of movement limited by their Action Gauge . Up to nine characters can be assigned to a single mission . During gameplay , characters will call out if something happens to them , such as their health points ( HP ) getting low or being knocked out by enemy attacks . Each character has specific \" Potentials \" , skills unique to each character . They are divided into \" Personal Potential \" , which are innate skills that remain unaltered unless otherwise dictated by the story and can either help or impede a character , and \" Battle Potentials \" , which are grown throughout the game and always grant boons to a character . To learn Battle Potentials , each character has a unique \" Masters Table \" , a grid @-@ based skill table that can be used to acquire and link different skills . Characters also have Special Abilities that grant them temporary boosts on the battlefield : Kurt can activate \" Direct Command \" and move around the battlefield without depleting his Action Point gauge , the character can shift into her \" Valkyria Form \" and become invincible , while Imca can target multiple enemy units with her heavy weapon . 
\\n'" - ] - }, - "execution_count": 55, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_ds[10][\"text\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "30472f6c-31d6-4768-8dca-d3535be28501", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1469)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1468 \u001b[0;31m \u001b[0mipdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1469 \u001b[0;31m outputs = self.bert(\n", - "\u001b[0m\u001b[0;32m 1470 \u001b[0;31m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> attention_mask.size()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([1, 512])\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> p encoder_attention_mask.size()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([1, 512, 512])\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> l\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;32m 1464 \u001b[0m \"\"\"\n", - "\u001b[1;32m 1465 \u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1466 \u001b[0m \u001b[0mreturn_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreturn_dict\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreturn_dict\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_return_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1467 \u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1468 \u001b[0m \u001b[0mipdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m-> 1469 \u001b[0;31m outputs = self.bert(\n", - "\u001b[0m\u001b[1;32m 1470 \u001b[0m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1471 \u001b[0m \u001b[0mattention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mattention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1472 \u001b[0m \u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1473 \u001b[0m \u001b[0mposition_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mposition_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1474 \u001b[0m \u001b[0mhead_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mhead_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 
\u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1470)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1469 \u001b[0;31m outputs = self.bert(\n", - "\u001b[0m\u001b[0;32m-> 1470 \u001b[0;31m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1471 \u001b[0;31m \u001b[0mattention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mattention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1471)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1470 \u001b[0;31m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1471 \u001b[0;31m \u001b[0mattention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mattention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1472 \u001b[0;31m \u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1472)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1471 \u001b[0;31m \u001b[0mattention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mattention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1472 \u001b[0;31m \u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1473 \u001b[0;31m \u001b[0mposition_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mposition_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1473)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1472 \u001b[0;31m \u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1473 \u001b[0;31m \u001b[0mposition_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mposition_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1474 \u001b[0;31m \u001b[0mhead_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mhead_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> \n" - ] - }, - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1474)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1473 \u001b[0;31m \u001b[0mposition_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mposition_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1474 \u001b[0;31m \u001b[0mhead_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mhead_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1475 \u001b[0;31m \u001b[0minputs_embeds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs_embeds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1475)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1474 \u001b[0;31m \u001b[0mhead_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mhead_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1475 \u001b[0;31m \u001b[0minputs_embeds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs_embeds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1476 \u001b[0;31m \u001b[0mencoder_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoder_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1476)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1475 \u001b[0;31m \u001b[0minputs_embeds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs_embeds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1476 \u001b[0;31m \u001b[0mencoder_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoder_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1477 \u001b[0;31m \u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1477)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1476 \u001b[0;31m \u001b[0mencoder_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoder_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1477 \u001b[0;31m 
\u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1478 \u001b[0;31m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1478)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1477 \u001b[0;31m \u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1478 \u001b[0;31m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1479 \u001b[0;31m \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1479)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1478 \u001b[0;31m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1479 \u001b[0;31m \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1480 \u001b[0;31m \u001b[0mreturn_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1480)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1479 \u001b[0;31m \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1480 \u001b[0;31m \u001b[0mreturn_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1481 \u001b[0;31m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 
\u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1469)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1468 \u001b[0;31m \u001b[0mipdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1469 \u001b[0;31m outputs = self.bert(\n", - "\u001b[0m\u001b[0;32m 1470 \u001b[0;31m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> s\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--Call--\n", - "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1549)\u001b[0;36m_wrapped_call_impl\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1548 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1549 \u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0m_wrapped_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1550 \u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1550)\u001b[0;36m_wrapped_call_impl\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1549 \u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0m_wrapped_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1550 \u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1551 \u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1553)\u001b[0;36m_wrapped_call_impl\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1552 \u001b[0;31m 
\u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1553 \u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1554 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> s\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--Call--\n", - "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1555)\u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1554 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1555 \u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1556 \u001b[0;31m \u001b[0mforward_call\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_tracing_state\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1556)\u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1555 \u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1556 \u001b[0;31m \u001b[0mforward_call\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_tracing_state\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1557 \u001b[0;31m \u001b[0;31m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> 
\u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1559)\u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1558 \u001b[0;31m \u001b[0;31m# this function, and just call forward.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1559 \u001b[0;31m if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks\n", - "\u001b[0m\u001b[0;32m 1560 \u001b[0;31m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1560)\u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1559 \u001b[0;31m if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks\n", - "\u001b[0m\u001b[0;32m-> 1560 \u001b[0;31m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1561 \u001b[0;31m \u001b[0;32mor\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_forward_pre_hooks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1561)\u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1560 \u001b[0;31m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1561 \u001b[0;31m \u001b[0;32mor\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_forward_pre_hooks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1562 \u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1562)\u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1561 \u001b[0;31m \u001b[0;32mor\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_forward_pre_hooks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1562 \u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1563 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> s\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--Call--\n", - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1005)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1004 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1005 \u001b[0;31m \u001b[0;34m@\u001b[0m\u001b[0madd_start_docstrings_to_model_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mBERT_INPUTS_DOCSTRING\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"batch_size, sequence_length\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1006 \u001b[0;31m @add_code_sample_docstrings(\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> l\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;32m 1000 \u001b[0m \u001b[0;32mclass\u001b[0m \u001b[0mPreTrainedModel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1001 \u001b[0m \"\"\"\n", - "\u001b[1;32m 1002 \u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mlayer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheads\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mheads_to_prune\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1003 \u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mlayer\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprune_heads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mheads\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1004 \u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m-> 1005 \u001b[0;31m \u001b[0;34m@\u001b[0m\u001b[0madd_start_docstrings_to_model_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mBERT_INPUTS_DOCSTRING\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"batch_size, sequence_length\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[1;32m 1006 \u001b[0m @add_code_sample_docstrings(\n", - "\u001b[1;32m 1007 \u001b[0m \u001b[0mcheckpoint\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_CHECKPOINT_FOR_DOC\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1008 \u001b[0m \u001b[0moutput_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mBaseModelOutputWithPoolingAndCrossAttentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1009 \u001b[0m \u001b[0mconfig_class\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_CONFIG_FOR_DOC\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1010 
\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1047)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1046 \u001b[0;31m \"\"\"\n", - "\u001b[0m\u001b[0;32m-> 1047 \u001b[0;31m \u001b[0moutput_attentions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1048 \u001b[0;31m output_hidden_states = (\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> self\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "BertModel(\n", - " (embeddings): BertEmbeddings(\n", - " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", - " (position_embeddings): Embedding(512, 768)\n", - " (token_type_embeddings): Embedding(2, 768)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (encoder): BertEncoder(\n", - " (layer): ModuleList(\n", - " (0-11): 12 x BertLayer(\n", - " (attention): BertAttention(\n", - " (self): BertSdpaSelfAttention(\n", - " (query): Linear(in_features=768, out_features=768, bias=True)\n", - " (key): Linear(in_features=768, out_features=768, bias=True)\n", - " (value): Linear(in_features=768, out_features=768, bias=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (output): BertSelfOutput(\n", - " (dense): Linear(in_features=768, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " (intermediate): BertIntermediate(\n", - " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", - " (intermediate_act_fn): GELUActivation()\n", - " )\n", - " (output): BertOutput(\n", - " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", - " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", - " (dropout): Dropout(p=0.1, inplace=False)\n", - " )\n", - " )\n", - " )\n", - " )\n", - ")\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> attention_mask\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", - " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> l\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;32m 1042 \u001b[0m \u001b[0;34m`\u001b[0m\u001b[0mdecoder_input_ids\u001b[0m\u001b[0;34m`\u001b[0m \u001b[0mof\u001b[0m \u001b[0mshape\u001b[0m \u001b[0;34m`\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msequence_length\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m`\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1043 \u001b[0m \u001b[0muse_cache\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m`\u001b[0m\u001b[0mbool\u001b[0m\u001b[0;34m`\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0moptional\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1044 \u001b[0m If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n", - "\u001b[1;32m 1045 \u001b[0m \u001b[0;34m`\u001b[0m\u001b[0mpast_key_values\u001b[0m\u001b[0;34m`\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1046 \u001b[0m \"\"\"\n", - "\u001b[0;32m-> 1047 \u001b[0;31m \u001b[0moutput_attentions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[1;32m 1048 \u001b[0m output_hidden_states = (\n", - "\u001b[1;32m 1049 \u001b[0m \u001b[0moutput_hidden_states\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_hidden_states\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1050 \u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1051 \u001b[0m \u001b[0mreturn_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreturn_dict\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mreturn_dict\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_return_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 1052 \u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1047)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1046 \u001b[0;31m \"\"\"\n", - "\u001b[0m\u001b[0;32m-> 1047 \u001b[0;31m \u001b[0moutput_attentions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1048 \u001b[0;31m output_hidden_states = (\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1049)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1048 \u001b[0;31m output_hidden_states = (\n", - "\u001b[0m\u001b[0;32m-> 1049 \u001b[0;31m \u001b[0moutput_hidden_states\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_hidden_states\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 1050 \u001b[0;31m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1048)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1047 \u001b[0;31m \u001b[0moutput_attentions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1048 \u001b[0;31m output_hidden_states = (\n", - "\u001b[0m\u001b[0;32m 1049 \u001b[0;31m \u001b[0moutput_hidden_states\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_hidden_states\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> q\n" - ] - } - ], - "source": [ - "output = model(**{k: v.to(device) for k, v in inputs.items()}, encoder_attention_mask=rtlattn(inputs[\"input_ids\"].size() + (inputs[\"input_ids\"].size(1),)))" - ] - }, - { - "cell_type": "code", - "execution_count": 140, - "id": "43a301aa-3113-46bb-a65d-9ed12bae9437", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[-6.3281, -6.3555, -6.4531, ..., -5.5234, -4.1797, -5.7891],\n", - " [-6.7891, -6.6914, -6.7812, ..., -6.1680, -5.1094, -5.5273],\n", - " [-7.1641, -7.1055, -7.0625, ..., -6.2383, -5.3711, -5.5273],\n", - " ...,\n", - " [-8.3516, -8.4375, -8.3516, ..., -7.6289, -7.0078, -5.6016],\n", - " [-7.7617, -7.8789, -7.7695, ..., -7.0938, -6.7461, -5.0430],\n", - " [-7.6602, -7.7500, -7.6953, ..., -6.9492, -6.4766, -4.9531]]],\n", - " device='cuda:0', dtype=torch.float16, grad_fn=)" - ] - }, - "execution_count": 140, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output.logits" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "954bc5bd-16af-44d5-9739-24cc89ed3ce0", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1469)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 1468 \u001b[0;31m \u001b[0mipdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m-> 1469 \u001b[0;31m outputs = self.bert(\n", - "\u001b[0m\u001b[0;32m 1470 \u001b[0;31m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> c\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(444)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 443 \u001b[0;31m \u001b[0mipdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m--> 444 \u001b[0;31m attn_output = torch.nn.functional.scaled_dot_product_attention(\n", - "\u001b[0m\u001b[0;32m 445 \u001b[0;31m \u001b[0mquery_layer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> l\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1;32m 439 \u001b[0m is_causal = (\n", - "\u001b[1;32m 440 \u001b[0m \u001b[0;32mTrue\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_decoder\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_cross_attention\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mattention_mask\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m 
\u001b[0;32mand\u001b[0m \u001b[0mtgt_len\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 441 \u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 442 \u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 443 \u001b[0m \u001b[0mipdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m--> 444 \u001b[0;31m attn_output = torch.nn.functional.scaled_dot_product_attention(\n", - "\u001b[0m\u001b[1;32m 445 \u001b[0m \u001b[0mquery_layer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 446 \u001b[0m \u001b[0mkey_layer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 447 \u001b[0m \u001b[0mvalue_layer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 448 \u001b[0m \u001b[0mattn_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mattention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[1;32m 449 \u001b[0m \u001b[0mdropout_p\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdropout_prob\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtraining\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0.0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> n\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(445)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 444 \u001b[0;31m attn_output = torch.nn.functional.scaled_dot_product_attention(\n", - "\u001b[0m\u001b[0;32m--> 445 \u001b[0;31m \u001b[0mquery_layer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 446 \u001b[0;31m \u001b[0mkey_layer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(446)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 445 \u001b[0;31m \u001b[0mquery_layer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m--> 446 \u001b[0;31m \u001b[0mkey_layer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m 447 \u001b[0;31m \u001b[0mvalue_layer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> attention_mask\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([[[[ 0., -65504., -65504., ..., -65504., -65504., -65504.],\n", - " [ 0., 0., -65504., ..., -65504., -65504., -65504.],\n", - " [ 0., 0., 0., ..., -65504., 
-65504., -65504.],\n", - " ...,\n", - " [ 0., 0., 0., ..., -65504., -65504., -65504.],\n", - " [ 0., 0., 0., ..., -65504., -65504., -65504.],\n", - " [ 0., 0., 0., ..., -65504., -65504., -65504.]]]],\n", - " device='cuda:0', dtype=torch.float16)\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> attention_mask.size()\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([1, 1, 512, 512])\n", - "--KeyboardInterrupt--\n", - "\n", - "KeyboardInterrupt: Interrupted by user\n", - "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(444)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m 443 \u001b[0;31m \u001b[0mipdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\u001b[0;32m--> 444 \u001b[0;31m attn_output = torch.nn.functional.scaled_dot_product_attention(\n", - "\u001b[0m\u001b[0;32m 445 \u001b[0;31m \u001b[0mquery_layer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0m\n" - ] - }, - { - "name": "stdin", - "output_type": "stream", - "text": [ - "ipdb> q\n", - "ipdb> q\n" - ] - } - ], - "source": [ - "output2 = model(**{k: v.to(device) for k, v in inputs.items()}, encoder_attention_mask=ltrattn(inputs[\"input_ids\"].size() + (inputs[\"input_ids\"].size(1),)))" - ] - }, - { - "cell_type": "code", - "execution_count": 138, - "id": "327929a1-04da-45fa-846e-4998ac87cc26", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[-6.3281, -6.3555, -6.4531, ..., -5.5234, -4.1797, -5.7891],\n", - " [-6.7891, -6.6914, -6.7812, ..., -6.1680, -5.1094, -5.5273],\n", - " [-7.1641, -7.1055, -7.0625, ..., -6.2383, -5.3711, -5.5273],\n", - " ...,\n", - " [-8.3516, -8.4375, -8.3516, ..., -7.6289, -7.0078, -5.6016],\n", - " [-7.7617, -7.8789, -7.7695, ..., -7.0938, -6.7461, -5.0430],\n", - " [-7.6602, -7.7500, -7.6953, ..., -6.9492, -6.4766, -4.9531]]],\n", - " device='cuda:0', dtype=torch.float16, grad_fn=)" - ] - }, - "execution_count": 138, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output2.logits" - ] - }, - { - "cell_type": "code", - "execution_count": 135, - "id": "420d0bed-923c-452d-ab20-21a9440d4c8f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 135, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "torch.equal(output.logits, output2.logits)" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "id": "a33632fb-ad41-49e3-acee-91d1dda974b8", - "metadata": {}, - "outputs": [], - "source": [ - "output2 = model(**{k: v.to(device) for k, v in inputs.items()}, encoder_attention_mask=torch.zeros(1, 512, 512))" - ] - }, - { - "cell_type": "code", - "execution_count": 145, - "id": "5042c85c-c98b-45ed-8437-8a98f63507d2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[ -8.8438, -8.8750, -8.7812, ..., -8.3672, -8.1484, -4.5195],\n", - " [-12.5547, -12.2734, -12.4609, ..., -11.4141, -10.2969, -7.3320],\n", - " [-12.8125, -12.8125, -12.7891, ..., -12.1328, -10.5781, -5.9453],\n", - " ...,\n", - " [ -7.9531, -8.1797, -8.2266, ..., -7.2188, -6.5000, -6.4688],\n", - " [ -7.5234, -7.7344, -7.7305, ..., -6.7344, -6.3359, -6.0195],\n", - " [ -7.8711, -7.9453, -8.0156, ..., -7.3555, -7.1523, -5.6680]]],\n", - " 
device='cuda:0', dtype=torch.float16, grad_fn=)" - ] - }, - "execution_count": 145, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output2.logits" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1f8144a4-c496-4b45-99f8-95c8fe41ce16", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9932c9d4-ae85-4f95-bf84-78be2000131d", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/finetune_bert.py b/finetune_bert.py new file mode 100644 index 0000000..59c8090 --- /dev/null +++ b/finetune_bert.py @@ -0,0 +1,663 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. + +Here is the full list of checkpoints on the hub that can be fine-tuned by this script: +https://huggingface.co/models?filter=text-generation +""" +# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. + +""" +From https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py +""" + +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from itertools import chain +from typing import Optional + +import datasets +import evaluate +import torch +from datasets import load_dataset + +import transformers +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + Trainer, + TrainingArguments, + default_data_collator, + is_torch_xla_available, + set_seed, +) +from transformers.testing_utils import CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
+check_min_version("4.47.0.dev0") + +require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + """ + # text_direction: str = field( + # + # ) + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": ( + "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch." + ) + }, + ) + # model_type: Optional[str] = field( + # default=None, + # metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + # ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + ) + }, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." + ) + }, + ) + trust_remote_code: bool = field( + default=False, + metadata={ + "help": ( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." + ) + }, + ) + torch_dtype: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " + "dtype will be automatically derived from the model's weights." + ), + "choices": ["auto", "bfloat16", "float16", "float32"], + }, + ) + low_cpu_mem_usage: bool = field( + default=False, + metadata={ + "help": ( + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " + "set True will benefit LLM loading time and RAM consumption." 
+ ) + }, + ) + + def __post_init__(self): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + ) + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + ) + }, + ) + streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) + block_size: Optional[int] = field( + default=None, + metadata={ + "help": ( + "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." + ) + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + validation_split_percentage: Optional[int] = field( + default=5, + metadata={ + "help": "The percentage of the train set used as validation set in case there's no validation split" + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + keep_linebreaks: bool = field( + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) + + def __post_init__(self): + if self.streaming: + require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") + + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. 
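+    # Illustrative CLI sketch (these flags come from the dataclasses above and
+    # TrainingArguments; the dataset and output path are examples, not a
+    # prescribed setup):
+    #   python finetune_bert.py --model_name_or_path bert-base-uncased \
+    #     --dataset_name Salesforce/wikitext --dataset_config_name wikitext-103-v1 \
+    #     --do_train --do_eval --output_dir ./clm-out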
+ + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. + send_example_telemetry("run_clm", model_args, data_args) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. 
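+        # Split-behavior sketch: when the hub dataset has no "validation" split,
+        # the code below carves one out of "train" with slice syntax, e.g. with
+        # validation_split_percentage=5: validation = "train[:5%]", train = "train[5%:]".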
+ raw_datasets = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, + token=model_args.token, + streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, + ) + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + token=model_args.token, + streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, + ) + raw_datasets["train"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + token=model_args.token, + streaming=data_args.streaming, + trust_remote_code=model_args.trust_remote_code, + ) + else: + data_files = {} + dataset_args = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) + if extension == "txt": + extension = "text" + dataset_args["keep_linebreaks"] = data_args.keep_linebreaks + raw_datasets = load_dataset( + extension, + data_files=data_files, + cache_dir=model_args.cache_dir, + token=model_args.token, + **dataset_args, + ) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + token=model_args.token, + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + token=model_args.token, + **dataset_args, + ) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
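+    # Config resolution below: --config_name wins, else --model_name_or_path, else a
+    # from-scratch config via CONFIG_MAPPING[model_args.model_type]. Note that
+    # `model_type` is commented out in ModelArguments in this version, so the
+    # from-scratch branch would fail here; pass one of the first two options.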
+ + config_kwargs = { + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, + } + if model_args.config_name: + config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) + elif model_args.model_name_or_path: + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[model_args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + if model_args.config_overrides is not None: + logger.info(f"Overriding config: {model_args.config_overrides}") + config.update_from_string(model_args.config_overrides) + logger.info(f"New config: {config}") + + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, + } + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script. " + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + + if model_args.model_name_or_path: + torch_dtype = ( + model_args.torch_dtype + if model_args.torch_dtype in ["auto", None] + else getattr(torch, model_args.torch_dtype) + ) + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, + torch_dtype=torch_dtype, + low_cpu_mem_usage=model_args.low_cpu_mem_usage, + ) + else: + model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) + n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) + logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) + + # Preprocessing the datasets. + # First we tokenize all the texts. 
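+    # Pipeline intuition (token ids below are illustrative, from bert-base-uncased):
+    #   tokenize_function({"text": ["hello world"]})
+    #     -> {"input_ids": [[101, 7592, 2088, 102]], "attention_mask": [[1, 1, 1, 1]]}
+    #   group_texts later concatenates all sequences and re-slices the stream into
+    #   block_size chunks, setting labels = input_ids (the causal-LM shift by one
+    #   position happens inside the model's forward pass).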
+ if training_args.do_train: + column_names = list(raw_datasets["train"].features) + else: + column_names = list(raw_datasets["validation"].features) + text_column_name = "text" if "text" in column_names else column_names[0] + + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + def tokenize_function(examples): + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits" + " before being passed to the model." + ) + return output + + with training_args.main_process_first(desc="dataset map tokenization"): + if not data_args.streaming: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + remove_columns=column_names, + ) + if hasattr(config, "max_position_embeddings"): + max_pos_embeddings = config.max_position_embeddings + else: + # Define a default value if the attribute is missing in the config. + max_pos_embeddings = 1024 + + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > max_pos_embeddings: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + f"Using block_size={min(1024, max_pos_embeddings)} instead. You can change that default value by passing --block_size xxx." + ) + if max_pos_embeddings > 0: + block_size = min(1024, max_pos_embeddings) + else: + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model " + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. + # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. 
See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/process#map + + with training_args.main_process_first(desc="grouping texts together"): + if not data_args.streaming: + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + else: + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + ) + + if training_args.do_train: + if "train" not in tokenized_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = lm_datasets["train"] + if data_args.max_train_samples is not None: + max_train_samples = min(len(train_dataset), data_args.max_train_samples) + train_dataset = train_dataset.select(range(max_train_samples)) + + if training_args.do_eval: + if "validation" not in tokenized_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = lm_datasets["validation"] + if data_args.max_eval_samples is not None: + max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) + eval_dataset = eval_dataset.select(range(max_eval_samples)) + + def preprocess_logits_for_metrics(logits, labels): + if isinstance(logits, tuple): + # Depending on the model and config, logits may contain extra tensors, + # like past_key_values, but logits always come first + logits = logits[0] + return logits.argmax(dim=-1) + + metric = evaluate.load("accuracy", cache_dir=model_args.cache_dir) + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # preds have the same shape as the labels, after the argmax(-1) has been calculated + # by preprocess_logits_for_metrics but we need to shift the labels + labels = labels[:, 1:].reshape(-1) + preds = preds[:, :-1].reshape(-1) + return metric.compute(predictions=preds, references=labels) + + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + processing_class=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. 
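+        # default_data_collator simply stacks the equal-length blocks emitted by
+        # group_texts into batch tensors; no padding or extra masking is added.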
+ data_collator=default_data_collator, + compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None, + preprocess_logits_for_metrics=preprocess_logits_for_metrics + if training_args.do_eval and not is_torch_xla_available() + else None, + ) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = float("inf") + metrics["perplexity"] = perplexity + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + trainer.push_to_hub(**kwargs) + else: + trainer.create_model_card(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/notebooks/Right_to_Left_NLP.ipynb b/notebooks/Right_to_Left_NLP.ipynb new file mode 100644 index 0000000..bcbc57a --- /dev/null +++ b/notebooks/Right_to_Left_NLP.ipynb @@ -0,0 +1,373 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "source": [ + "%pip install datasets torch transformers" + ], + "metadata": { + "collapsed": true, + "id": "M29-oTOBIiMr" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2iJJyERxHWSO", + "outputId": "04e9bc9d-5ee9-48d5-f370-6fd66ec7b7c1" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "cpu\n" + ] + } + ], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import transformers\n", + "from datasets import load_dataset\n", + "\n", + "transformers.set_seed(42)\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "print(device)" + ] + }, + { + "cell_type": "code", + "source": [ + "model_name_or_path = \"bert-base-uncased\"\n", + "model = transformers.AutoModelForMaskedLM.from_pretrained(model_name_or_path, 
torch_dtype=torch.bfloat16, attn_implementation=\"sdpa\")\n", + "model.eval()\n", + "\n", + "tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "C5PdaHGWHuXG", + "outputId": "d15272a5-1ce1-4c7e-9004-fc686a3de6b9" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:89: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n", + "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n", + "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. 
For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", + " warnings.warn(\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "def ltr_mask(seq_len: int) -> torch.Tensor:\n", + " mask = torch.ones((seq_len, seq_len), dtype=bool)\n", + " return torch.tril(mask, diagonal=-1)\n", + "\n", + "def rtl_mask(seq_len: int) -> torch.Tensor:\n", + " return ltr_mask(seq_len).T" + ], + "metadata": { + "id": "H_AUjBRoJHXU" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model.register_buffer(\"attn_mask\", rtl_mask(model.config.max_position_embeddings).to(model.device))\n", + "\n", + "def attn_hook(attn_module: nn.Module, args: tuple, kwargs: dict):\n", + " \"\"\"\n", + " Assuming https://github.com/huggingface/transformers/blob/33868a057c02f0368ba63bd1edb746be38fe3d90/src/transformers/models/bert/modeling_bert.py#L515\n", + " so no `kwargs` and `attention_mask` is second positional arg.\n", + "\n", + " Uses global `model.attn_mask` to save memory.\n", + " \"\"\"\n", + " assert not kwargs\n", + "\n", + " args = list(args)\n", + " assert args[1].size()[-2:] == model.attn_mask.size(), f\"{args[1].size()=} {model.attn_mask.size()=}\"\n", + " args[1] = model.attn_mask\n", + " return tuple(args), kwargs\n", + "\n", + "def debug_inputs_hook(attn_module: nn.Module, args: tuple, output):\n", + " print(f\"Post-forward checks\")\n", + " assert torch.equal(args[1], model.attn_mask), (args[1], model.attn_mask)" + ], + "metadata": { + "id": "Oy27MZcLLLsD" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# attn_mask = rtl_mask(model.config.max_position_embeddings)\n", + "for name, module in model.named_modules():\n", + " if isinstance(module, transformers.models.bert.modeling_bert.BertSelfAttention):\n", + " module._forward_hooks.clear() # running multiple times right now during testing\n", + " module.register_forward_pre_hook(attn_hook, with_kwargs=True)\n", + " module.register_forward_hook(debug_inputs_hook)\n", + " # module.register_buffer(\"attn_mask\", attn_mask)\n", + "\n", + "model = model.to(device)" + ], + "metadata": { + "id": "anEdwKj_OWWy" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "ds = load_dataset(\"Salesforce/wikitext\", \"wikitext-103-v1\")" + ], + "metadata": { + "id": "P1BEQFsLIRfX" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "train_ds = ds[\"train\"]\n", + "inputs = tokenizer(train_ds[5][\"text\"], return_tensors=\"pt\", padding='max_length', truncation=True)\n", + "inputs = {key: val.to(device) for key, val in inputs.items()}\n", + "\n", + "with torch.no_grad():\n", + " outputs = model(**inputs)\n", + "\n", + "outputs.logits" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BHE26Mr2NXhH", + "outputId": "24569931-61d7-4752-8b08-4daef58f9798" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "tensor([[[-5.6250, -5.5938, -5.5938, ..., -5.4688, -4.9688, 
-2.4844],\n", + " [-9.2500, -8.9375, -9.3750, ..., -8.5000, -7.5000, -4.0312],\n", + " [-4.9062, -4.8750, -5.2812, ..., -5.0625, -4.4375, -1.8281],\n", + " ...,\n", + " [-5.5938, -5.7500, -5.7812, ..., -6.1562, -3.9688, -2.2812],\n", + " [-4.7188, -4.8750, -4.8750, ..., -5.0625, -3.4531, -2.4375],\n", + " [-4.1875, -3.9375, -3.9062, ..., -3.3438, -3.2344, -3.2031]]],\n", + " device='cuda:0', dtype=torch.bfloat16)" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ] + }, + { + "cell_type": "code", + "source": [ + "with torch.inference_mode():\n", + " model.register_buffer(\"attn_mask\", ltr_mask(model.config.max_position_embeddings).to(model.device))\n", + " outputs = model(**inputs)\n", + "\n", + "outputs.logits" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZtEm7eQQNi4e", + "outputId": "c0eb3925-6d48-480e-a853-5057f35dbcd2" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "tensor([[[-7.9062, -7.7812, -7.9062, ..., -7.1250, -7.8438, -4.8438],\n", + " [-7.1562, -7.1250, -7.2812, ..., -7.3750, -7.3750, -7.2500],\n", + " [-5.4062, -5.2188, -5.4375, ..., -5.3438, -4.3750, -5.0312],\n", + " ...,\n", + " [ 3.9844, 3.6406, 3.6406, ..., 3.8281, 2.9062, 5.2812],\n", + " [ 4.0938, 3.7812, 3.8281, ..., 4.0000, 2.9844, 5.5000],\n", + " [ 3.8281, 3.5312, 3.5156, ..., 4.1562, 2.8438, 4.7188]]],\n", + " device='cuda:0', dtype=torch.bfloat16)" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "with torch.inference_mode():\n", + " model.register_buffer(\"attn_mask\", rtl_mask(model.config.max_position_embeddings).to(model.device))\n", + " outputs = model(**inputs)\n", + "\n", + "outputs.logits" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nz0j7V3oNkZu", + "outputId": "939b1d6d-5dca-41ef-eb17-9e0f4d09629e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n", + "Post-forward checks\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "tensor([[[-5.6250, -5.5938, -5.5938, ..., -5.4688, -4.9688, -2.4844],\n", + " [-9.2500, -8.9375, -9.3750, ..., -8.5000, -7.5000, -4.0312],\n", + " [-4.9062, -4.8750, -5.2812, ..., -5.0625, -4.4375, -1.8281],\n", + " ...,\n", + " [-5.5938, -5.7500, -5.7812, ..., -6.1562, -3.9688, -2.2812],\n", + " [-4.7188, -4.8750, -4.8750, ..., -5.0625, -3.4531, -2.4375],\n", + " [-4.1875, -3.9375, -3.9062, ..., -3.3438, -3.2344, -3.2031]]],\n", + " device='cuda:0', dtype=torch.bfloat16)" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "82PpSWnrdMgu" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + 
"source": [ + "# Wikipedia test\n", + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset(\"wikimedia/wikipedia\", \"20231101.en\")\n", + "print(ds[\"train\"][1000])" + ], + "metadata": { + "id": "DHftDnPKdMjV" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "FTMkfLyKdMqu" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/causal_v2.ipynb b/notebooks/causal_v2.ipynb new file mode 100644 index 0000000..2d91a12 --- /dev/null +++ b/notebooks/causal_v2.ipynb @@ -0,0 +1,1223 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "8ddb479e-9d7e-4392-8fc0-fd1c66a07a2b", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import transformers\n", + "transformers.set_seed(42)\n", + "device = \"cuda\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fef43d6f-5164-405e-bdc6-8484283c134b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.\n", + " - If you're using `trust_remote_code=True`, you can get rid of this warning by loading the model with an auto class. See https://huggingface.co/docs/transformers/en/model_doc/auto#auto-classes\n", + " - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).\n", + " - If you are not the owner of the model architecture class, please contact the model code owner to update it.\n", + "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n", + "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" + ] + } + ], + "source": [ + "from transformers import AutoModelForMaskedLM\n", + "model = AutoModelForMaskedLM.from_pretrained(\"bert-base-uncased\", torch_dtype=torch.float16, attn_implementation=\"sdpa\").to(device)\n", + "# model = BertForMaskedLM.from_pretrained(\"bert-base-uncased\", torch_dtype=torch.float16, attn_implementation=\"sdpa\").to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "39475a5f-63a3-4957-92b0-caf75bfe40bf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.config.num_attention_heads" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b6eb1d9c-519f-4e02-890e-3acb8dfffd08", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "model.config.is_decoder = True # this was super important" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "43acd054-4351-409d-a1a0-62b1c101f00f", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoTokenizer\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "0a132496-6c2d-4494-9c37-a60c632a00d1", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset(\"Salesforce/wikitext\", \"wikitext-103-v1\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0785e905-cb48-4d0f-878c-42276dce31c6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[0, 0, 0, 0, 0],\n", + " [1, 0, 0, 0, 0],\n", + " [1, 1, 0, 0, 0],\n", + " [1, 1, 1, 0, 0],\n", + " [1, 1, 1, 1, 0]])\n", + "tensor([[0, 1, 1, 1, 1, 1],\n", + " [0, 0, 1, 1, 1, 1],\n", + " [0, 0, 0, 1, 1, 1],\n", + " [0, 0, 0, 0, 1, 1],\n", + " [0, 0, 0, 0, 0, 1],\n", + " [0, 0, 0, 0, 0, 0]])\n" + ] + } + ], + "source": [ + "def ltrattn(shape):\n", + " mask = torch.full(shape,1)\n", + " return torch.tril(mask, diagonal=-1)\n", + "\n", + "def rtlattn(shape):\n", + " mask = torch.full(shape,1)\n", + " return torch.triu(mask, diagonal=1)\n", + "\n", + "print(ltrattn((5,5)))\n", + "print(rtlattn((6,6)))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0b886fde-2f43-4112-bb5a-b5038502902d", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "torch.Size([1, 512])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_ds = ds[\"train\"]\n", + "inputs = tokenizer(train_ds[10][\"text\"], return_tensors=\"pt\", padding='max_length', truncation=True)\n", + "\n", + "inputs[\"input_ids\"].size()" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "e906cacf-cf4f-41c3-9c29-99ab895e171a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\" It met with positive sales in Japan , and was praised by both Japanese and western critics . 
After release , it received downloadable content , along with an expanded edition in November of that year . It was also adapted into manga and an original video animation series . Due to low sales of Valkyria Chronicles II , Valkyria Chronicles III was not localized , but a fan translation compatible with the game 's expanded edition was released in 2014 . Media.Vision would return to the franchise with the development of Valkyria : Azure Revolution for the PlayStation 4 . \\n\"" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_ds[5][\"text\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "65b45df5-1d56-4042-af24-82bf68ae3fe1", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "' The game \\'s battle system , the system , is carried over directly from Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \\' turns . Each character has a field and distance of movement limited by their Action Gauge . Up to nine characters can be assigned to a single mission . During gameplay , characters will call out if something happens to them , such as their health points ( HP ) getting low or being knocked out by enemy attacks . Each character has specific \" Potentials \" , skills unique to each character . They are divided into \" Personal Potential \" , which are innate skills that remain unaltered unless otherwise dictated by the story and can either help or impede a character , and \" Battle Potentials \" , which are grown throughout the game and always grant boons to a character . To learn Battle Potentials , each character has a unique \" Masters Table \" , a grid @-@ based skill table that can be used to acquire and link different skills . Characters also have Special Abilities that grant them temporary boosts on the battlefield : Kurt can activate \" Direct Command \" and move around the battlefield without depleting his Action Point gauge , the character can shift into her \" Valkyria Form \" and become invincible , while Imca can target multiple enemy units with her heavy weapon . 
\\n'" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_ds[10][\"text\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "30472f6c-31d6-4768-8dca-d3535be28501", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1469)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1468 \u001b[0;31m \u001b[0mipdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1469 \u001b[0;31m outputs = self.bert(\n", + "\u001b[0m\u001b[0;32m 1470 \u001b[0;31m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> attention_mask.size()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 512])\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> p encoder_attention_mask.size()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 512, 512])\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> l\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;32m 1464 \u001b[0m \"\"\"\n", + "\u001b[1;32m 1465 \u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1466 \u001b[0m \u001b[0mreturn_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreturn_dict\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreturn_dict\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_return_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1467 \u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1468 \u001b[0m \u001b[0mipdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m-> 1469 \u001b[0;31m outputs = self.bert(\n", + "\u001b[0m\u001b[1;32m 1470 \u001b[0m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1471 \u001b[0m \u001b[0mattention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mattention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1472 \u001b[0m \u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1473 \u001b[0m \u001b[0mposition_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mposition_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1474 \u001b[0m \u001b[0mhead_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mhead_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 
\u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1470)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1469 \u001b[0;31m outputs = self.bert(\n", + "\u001b[0m\u001b[0;32m-> 1470 \u001b[0;31m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1471 \u001b[0;31m \u001b[0mattention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mattention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1471)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1470 \u001b[0;31m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1471 \u001b[0;31m \u001b[0mattention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mattention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1472 \u001b[0;31m \u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1472)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1471 \u001b[0;31m \u001b[0mattention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mattention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1472 \u001b[0;31m \u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1473 \u001b[0;31m \u001b[0mposition_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mposition_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1473)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1472 \u001b[0;31m \u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1473 \u001b[0;31m \u001b[0mposition_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mposition_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1474 \u001b[0;31m \u001b[0mhead_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mhead_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> \n" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1474)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1473 \u001b[0;31m \u001b[0mposition_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mposition_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1474 \u001b[0;31m \u001b[0mhead_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mhead_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1475 \u001b[0;31m \u001b[0minputs_embeds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs_embeds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1475)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1474 \u001b[0;31m \u001b[0mhead_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mhead_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1475 \u001b[0;31m \u001b[0minputs_embeds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs_embeds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1476 \u001b[0;31m \u001b[0mencoder_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoder_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1476)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1475 \u001b[0;31m \u001b[0minputs_embeds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs_embeds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1476 \u001b[0;31m \u001b[0mencoder_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoder_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1477 \u001b[0;31m \u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1477)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1476 \u001b[0;31m \u001b[0mencoder_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoder_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1477 \u001b[0;31m 
\u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1478 \u001b[0;31m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1478)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1477 \u001b[0;31m \u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1478 \u001b[0;31m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1479 \u001b[0;31m \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1479)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1478 \u001b[0;31m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1479 \u001b[0;31m \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1480 \u001b[0;31m \u001b[0mreturn_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1480)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1479 \u001b[0;31m \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1480 \u001b[0;31m \u001b[0mreturn_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1481 \u001b[0;31m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 
\u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1469)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1468 \u001b[0;31m \u001b[0mipdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1469 \u001b[0;31m outputs = self.bert(\n", + "\u001b[0m\u001b[0;32m 1470 \u001b[0;31m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--Call--\n", + "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1549)\u001b[0;36m_wrapped_call_impl\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1548 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1549 \u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0m_wrapped_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1550 \u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1550)\u001b[0;36m_wrapped_call_impl\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1549 \u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0m_wrapped_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1550 \u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1551 \u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1553)\u001b[0;36m_wrapped_call_impl\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1552 \u001b[0;31m 
\u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1553 \u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1554 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--Call--\n", + "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1555)\u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1554 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1555 \u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1556 \u001b[0;31m \u001b[0mforward_call\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_tracing_state\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1556)\u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1555 \u001b[0;31m \u001b[0;32mdef\u001b[0m \u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1556 \u001b[0;31m \u001b[0mforward_call\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_tracing_state\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1557 \u001b[0;31m \u001b[0;31m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 
\u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1559)\u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1558 \u001b[0;31m \u001b[0;31m# this function, and just call forward.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1559 \u001b[0;31m if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks\n", + "\u001b[0m\u001b[0;32m 1560 \u001b[0;31m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1560)\u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1559 \u001b[0;31m if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks\n", + "\u001b[0m\u001b[0;32m-> 1560 \u001b[0;31m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1561 \u001b[0;31m \u001b[0;32mor\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_forward_pre_hooks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1561)\u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1560 \u001b[0;31m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1561 \u001b[0;31m \u001b[0;32mor\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_forward_pre_hooks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1562 \u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/6.861/.venv/lib64/python3.12/site-packages/torch/nn/modules/module.py\u001b[0m(1562)\u001b[0;36m_call_impl\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1561 \u001b[0;31m \u001b[0;32mor\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_forward_pre_hooks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1562 \u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1563 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--Call--\n", + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1005)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1004 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1005 \u001b[0;31m \u001b[0;34m@\u001b[0m\u001b[0madd_start_docstrings_to_model_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mBERT_INPUTS_DOCSTRING\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"batch_size, sequence_length\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1006 \u001b[0;31m @add_code_sample_docstrings(\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> l\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;32m 1000 \u001b[0m \u001b[0;32mclass\u001b[0m \u001b[0mPreTrainedModel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1001 \u001b[0m \"\"\"\n", + "\u001b[1;32m 1002 \u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mlayer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mheads\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mheads_to_prune\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1003 \u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mlayer\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattention\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprune_heads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mheads\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1004 \u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m-> 1005 \u001b[0;31m \u001b[0;34m@\u001b[0m\u001b[0madd_start_docstrings_to_model_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mBERT_INPUTS_DOCSTRING\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"batch_size, sequence_length\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[1;32m 1006 \u001b[0m @add_code_sample_docstrings(\n", + "\u001b[1;32m 1007 \u001b[0m \u001b[0mcheckpoint\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_CHECKPOINT_FOR_DOC\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1008 \u001b[0m \u001b[0moutput_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mBaseModelOutputWithPoolingAndCrossAttentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1009 \u001b[0m \u001b[0mconfig_class\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_CONFIG_FOR_DOC\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1010 
\u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1047)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1046 \u001b[0;31m \"\"\"\n", + "\u001b[0m\u001b[0;32m-> 1047 \u001b[0;31m \u001b[0moutput_attentions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1048 \u001b[0;31m output_hidden_states = (\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> self\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BertModel(\n", + " (embeddings): BertEmbeddings(\n", + " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", + " (position_embeddings): Embedding(512, 768)\n", + " (token_type_embeddings): Embedding(2, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): BertEncoder(\n", + " (layer): ModuleList(\n", + " (0-11): 12 x BertLayer(\n", + " (attention): BertAttention(\n", + " (self): BertSdpaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): BertSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): BertIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " (intermediate_act_fn): GELUActivation()\n", + " )\n", + " (output): BertOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + ")\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> attention_mask\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> l\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;32m 1042 \u001b[0m \u001b[0;34m`\u001b[0m\u001b[0mdecoder_input_ids\u001b[0m\u001b[0;34m`\u001b[0m \u001b[0mof\u001b[0m \u001b[0mshape\u001b[0m \u001b[0;34m`\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msequence_length\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m`\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1043 \u001b[0m \u001b[0muse_cache\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m`\u001b[0m\u001b[0mbool\u001b[0m\u001b[0;34m`\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0moptional\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1044 \u001b[0m If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see\n", + "\u001b[1;32m 1045 \u001b[0m \u001b[0;34m`\u001b[0m\u001b[0mpast_key_values\u001b[0m\u001b[0;34m`\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1046 \u001b[0m \"\"\"\n", + "\u001b[0;32m-> 1047 \u001b[0;31m \u001b[0moutput_attentions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[1;32m 1048 \u001b[0m output_hidden_states = (\n", + "\u001b[1;32m 1049 \u001b[0m \u001b[0moutput_hidden_states\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_hidden_states\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1050 \u001b[0m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1051 \u001b[0m \u001b[0mreturn_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mreturn_dict\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mreturn_dict\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_return_dict\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[1;32m 1052 \u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1047)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1046 \u001b[0;31m \"\"\"\n", + "\u001b[0m\u001b[0;32m-> 1047 \u001b[0;31m \u001b[0moutput_attentions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1048 \u001b[0;31m output_hidden_states = (\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1049)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1048 \u001b[0;31m output_hidden_states = (\n", + "\u001b[0m\u001b[0;32m-> 1049 \u001b[0;31m \u001b[0moutput_hidden_states\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_hidden_states\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 1050 \u001b[0;31m \u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> n\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1048)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1047 \u001b[0;31m \u001b[0moutput_attentions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_attentions\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1048 \u001b[0;31m output_hidden_states = (\n", + "\u001b[0m\u001b[0;32m 1049 \u001b[0;31m \u001b[0moutput_hidden_states\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0moutput_hidden_states\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> q\n" + ] + } + ], + "source": [ + "output = model(**{k: v.to(device) for k, v in inputs.items()}, encoder_attention_mask=rtlattn(inputs[\"input_ids\"].size() + (inputs[\"input_ids\"].size(1),)))" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "id": "43a301aa-3113-46bb-a65d-9ed12bae9437", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[-6.3281, -6.3555, -6.4531, ..., -5.5234, -4.1797, -5.7891],\n", + " [-6.7891, -6.6914, -6.7812, ..., -6.1680, -5.1094, -5.5273],\n", + " [-7.1641, -7.1055, -7.0625, ..., -6.2383, -5.3711, -5.5273],\n", + " ...,\n", + " [-8.3516, -8.4375, -8.3516, ..., -7.6289, -7.0078, -5.6016],\n", + " [-7.7617, -7.8789, -7.7695, ..., -7.0938, -6.7461, -5.0430],\n", + " [-7.6602, -7.7500, -7.6953, ..., -6.9492, -6.4766, -4.9531]]],\n", + " device='cuda:0', dtype=torch.float16, grad_fn=)" + ] + }, + "execution_count": 140, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output.logits" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "954bc5bd-16af-44d5-9739-24cc89ed3ce0", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(1469)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 1468 \u001b[0;31m \u001b[0mipdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 1469 \u001b[0;31m outputs = self.bert(\n", + "\u001b[0m\u001b[0;32m 1470 \u001b[0;31m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> c\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/home/sipb/transformer-shortest-paths/NLP/transformers/src/transformers/models/bert/modeling_bert.py\u001b[0m(444)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 443 \u001b[0;31m \u001b[0mipdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m--> 444 \u001b[0;31m attn_output = torch.nn.functional.scaled_dot_product_attention(\n", + "\u001b[0m\u001b[0;32m 445 \u001b[0;31m \u001b[0mquery_layer\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> l\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;32m 439 \u001b[0m is_causal = (\n", + "\u001b[1;32m 440 \u001b[0m \u001b[0;32mTrue\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_decoder\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_cross_attention\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mattention_mask\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m 
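+  {
+   "cell_type": "markdown",
+   "id": "rtlattn-sketch-md",
+   "metadata": {},
+   "source": [
+    "`rtlattn` / `ltrattn` are not defined in this notebook. A minimal sketch of what they are assumed to compute, based on the additive `[1, 1, 512, 512]` mask of `0` / `-65504` (the fp16 minimum) observed at the `scaled_dot_product_attention` breakpoint in the next cell: an additive float mask over `(batch, tgt_len, src_len)` with `0` where attention is allowed and the dtype minimum where it is blocked."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "rtlattn-sketch-code",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical reimplementation (an assumption, not the original helpers).\n",
+    "def ltrattn(size):\n",
+    "    # size == (batch, tgt_len, src_len); block strictly-upper entries -> causal / left-to-right\n",
+    "    mask = torch.full(size, torch.finfo(torch.float16).min, dtype=torch.float16, device=device)\n",
+    "    return torch.triu(mask, diagonal=1)\n",
+    "\n",
+    "def rtlattn(size):\n",
+    "    # block strictly-lower entries -> right-to-left: each token sees itself and everything after it\n",
+    "    mask = torch.full(size, torch.finfo(torch.float16).min, dtype=torch.float16, device=device)\n",
+    "    return torch.tril(mask, diagonal=-1)"
+   ]
+  },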
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "954bc5bd-16af-44d5-9739-24cc89ed3ce0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "> transformers/src/transformers/models/bert/modeling_bert.py(1469) forward(): outputs = self.bert(\n",
+      "ipdb> c\n",
+      "> transformers/src/transformers/models/bert/modeling_bert.py(444) forward(): attn_output = torch.nn.functional.scaled_dot_product_attention(\n",
+      "ipdb> l\n",
+      "    is_causal = ( True if self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1 else False )\n",
+      "    attn_output = torch.nn.functional.scaled_dot_product_attention( query_layer, key_layer, value_layer, attn_mask=attention_mask, dropout_p=self.dropout_prob if self.training else 0.0, ...\n",
+      "ipdb> attention_mask\n",
+      "tensor([[[[ 0., -65504., -65504.,  ..., -65504., -65504., -65504.],\n",
+      "          [ 0.,      0., -65504.,  ..., -65504., -65504., -65504.],\n",
+      "          ...,\n",
+      "          [ 0.,      0.,      0.,  ..., -65504., -65504., -65504.]]]],\n",
+      "       device='cuda:0', dtype=torch.float16)\n",
+      "ipdb> attention_mask.size()\n",
+      "torch.Size([1, 1, 512, 512])\n",
+      "--KeyboardInterrupt--\n",
+      "ipdb> q\n"
+     ]
+    }
+   ],
+   "source": [
+    "output2 = model(**{k: v.to(device) for k, v in inputs.items()}, encoder_attention_mask=ltrattn(inputs[\"input_ids\"].size() + (inputs[\"input_ids\"].size(1),)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 138,
+   "id": "327929a1-04da-45fa-846e-4998ac87cc26",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[[-6.3281, -6.3555, -6.4531,  ..., -5.5234, -4.1797, -5.7891],\n",
+       "         [-6.7891, -6.6914, -6.7812,  ..., -6.1680, -5.1094, -5.5273],\n",
+       "         [-7.1641, -7.1055, -7.0625,  ..., -6.2383, -5.3711, -5.5273],\n",
+       "         ...,\n",
+       "         [-8.3516, -8.4375, -8.3516,  ..., -7.6289, -7.0078, -5.6016],\n",
+       "         [-7.7617, -7.8789, -7.7695,  ..., -7.0938, -6.7461, -5.0430],\n",
+       "         [-7.6602, -7.7500, -7.6953,  ..., -6.9492, -6.4766, -4.9531]]],\n",
+       "        device='cuda:0', dtype=torch.float16, grad_fn=)"
+      ]
+     },
+     "execution_count": 138,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "output2.logits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 135,
+   "id": "420d0bed-923c-452d-ab20-21a9440d4c8f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 135,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "torch.equal(output.logits, output2.logits)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 144,
+   "id": "a33632fb-ad41-49e3-acee-91d1dda974b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "output2 = model(**{k: v.to(device) for k, v in inputs.items()}, encoder_attention_mask=torch.zeros(1, 512, 512))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 145,
+   "id": "5042c85c-c98b-45ed-8437-8a98f63507d2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[[ -8.8438,  -8.8750,  -8.7812,  ...,  -8.3672,  -8.1484,  -4.5195],\n",
+       "         [-12.5547, -12.2734, -12.4609,  ..., -11.4141, -10.2969,  -7.3320],\n",
+       "         [-12.8125, -12.8125, -12.7891,  ..., -12.1328, -10.5781,  -5.9453],\n",
+       "         ...,\n",
+       "         [ -7.9531,  -8.1797,  -8.2266,  ...,  -7.2188,  -6.5000,  -6.4688],\n",
+       "         [ -7.5234,  -7.7344,  -7.7305,  ...,  -6.7344,  -6.3359,  -6.0195],\n",
+       "         [ -7.8711,  -7.9453,  -8.0156,  ...,  -7.3555,  -7.1523,  -5.6680]]],\n",
+       "        
device='cuda:0', dtype=torch.float16, grad_fn=)" + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output2.logits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f8144a4-c496-4b45-99f8-95c8fe41ce16", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9932c9d4-ae85-4f95-bf84-78be2000131d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/rtl.ipynb b/notebooks/rtl.ipynb new file mode 100644 index 0000000..039764c --- /dev/null +++ b/notebooks/rtl.ipynb @@ -0,0 +1,191 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "id": "8ddb479e-9d7e-4392-8fc0-fd1c66a07a2b", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n", + "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 512])\n" + ] + } + ], + "source": [ + "import torch\n", + "import transformers\n", + "transformers.set_seed(42)\n", + "device = \"cuda\"\n", + "# import sys\n", + "\n", + "# for key in list(sys.modules):\n", + "# if key.startswith(\"transformers.\"):\n", + "# sys.modules.pop(key)\n", + "\n", + "from transformers import AutoModelForMaskedLM\n", + "model = AutoModelForMaskedLM.from_pretrained(\"bert-base-uncased\", torch_dtype=torch.float16, attn_implementation=\"sdpa\").to(device)\n", + "\n", + "from transformers import AutoTokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n", + "\n", + "model.config.alek_says_ltr = True\n", + "model.config.alek_says_rtl = False\n", + "from datasets import load_dataset\n", + "\n", + "ds = load_dataset(\"Salesforce/wikitext\", \"wikitext-103-v1\")\n", + "train_ds = ds[\"train\"]\n", + "inputs = tokenizer(train_ds[10][\"text\"], return_tensors=\"pt\", padding='max_length', truncation=True)\n", + "\n", + "print(inputs[\"input_ids\"].size())" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "30472f6c-31d6-4768-8dca-d3535be28501", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "output = model(**{k: v.to(device) for k, v in inputs.items()})" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + 
"id": "ac46bf41-8cd3-4190-aa4b-6142d6d4d986", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[ -7.1094, -7.1445, -7.2148, ..., -6.6484, -7.0703, -3.6758],\n", + " [-14.2969, -14.2656, -14.3828, ..., -11.9766, -11.3281, -9.4922],\n", + " [-10.7344, -10.6250, -10.7266, ..., -8.6641, -8.2188, -5.0859],\n", + " ...,\n", + " [ -3.4277, -3.5664, -3.9434, ..., -2.0000, -4.4727, -3.7148],\n", + " [ -3.7227, -3.8770, -4.2383, ..., -2.1367, -4.5977, -3.9336],\n", + " [ -4.2070, -4.3672, -4.7578, ..., -2.4941, -4.7734, -4.7227]]],\n", + " device='cuda:0', dtype=torch.float16, grad_fn=)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output.logits" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "baa757ff-3ba2-4a72-b819-a2283b729c18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[-6.3281, -6.3555, -6.4531, ..., -5.5234, -4.1797, -5.7891],\n", + " [-6.7891, -6.6914, -6.7812, ..., -6.1680, -5.1094, -5.5273],\n", + " [-7.1641, -7.1055, -7.0625, ..., -6.2383, -5.3711, -5.5273],\n", + " ...,\n", + " [ 9.4844, 8.9219, 9.2422, ..., 7.6133, 7.2578, 9.9062],\n", + " [10.3672, 9.8516, 10.1797, ..., 8.5547, 8.0781, 10.5938],\n", + " [ 8.3828, 8.0781, 8.1641, ..., 7.2422, 6.7734, 7.9961]]],\n", + " device='cuda:0', dtype=torch.float16, grad_fn=)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output.logits" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a33632fb-ad41-49e3-acee-91d1dda974b8", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# output1 = model(**{k: v.to(device) for k, v in inputs.items()})\n", + "# print(output1.logits)\n", + "# output2 = model(**{k: v.to(device) for k, v in inputs.items()}, encoder_attention_mask=torch.zeros(1, 512, 512))\n", + "# print(output2.logits)" + ] + }, + { + "cell_type": "markdown", + "id": "ad432f29-f77a-4b84-b6b4-347b74c82f5b", + "metadata": {}, + "source": [ + "## plan for finishing phase 1\n", + "\n", + "- fix the tokenizer\n", + "- pretrain on RTL + LTR\n", + "- check perplexities\n", + "\n", + "## plan for phase 2\n", + "- AQ\n", + "\n", + "## plan for phase 1.5\n", + "- addition" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/official_run_clm.py b/official_run_clm.py new file mode 100644 index 0000000..d3f8ad8 --- /dev/null +++ b/official_run_clm.py @@ -0,0 +1,657 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. + +Here is the full list of checkpoints on the hub that can be fine-tuned by this script: +https://huggingface.co/models?filter=text-generation +""" +# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. + +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from itertools import chain +from typing import Optional + +import datasets +import evaluate +import torch +from datasets import load_dataset + +import transformers +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + Trainer, + TrainingArguments, + default_data_collator, + is_torch_xla_available, + set_seed, +) +from transformers.testing_utils import CaptureLogger +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version, send_example_telemetry +from transformers.utils.versions import require_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.47.0.dev0") + +require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": ( + "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch." + ) + }, + ) + model_type: Optional[str] = field( + default=None, + metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, + ) + config_overrides: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Override some existing default config settings when a model is trained from scratch. Example: " + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + ) + }, + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + token: str = field( + default=None, + metadata={ + "help": ( + "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`)." 
+ ) + }, + ) + trust_remote_code: bool = field( + default=False, + metadata={ + "help": ( + "Whether to trust the execution of code from datasets/models defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." + ) + }, + ) + torch_dtype: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " + "dtype will be automatically derived from the model's weights." + ), + "choices": ["auto", "bfloat16", "float16", "float32"], + }, + ) + low_cpu_mem_usage: bool = field( + default=False, + metadata={ + "help": ( + "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded. " + "set True will benefit LLM loading time and RAM consumption." + ) + }, + ) + + def __post_init__(self): + if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): + raise ValueError( + "--config_overrides can't be used in combination with --config_name or --model_name_or_path" + ) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + """ + + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."}) + validation_file: Optional[str] = field( + default=None, + metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + ) + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": ( + "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + ) + }, + ) + streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) + block_size: Optional[int] = field( + default=None, + metadata={ + "help": ( + "Optional input sequence length after tokenization. " + "The training dataset will be truncated in block of this size for training. " + "Default to the model max input length for single sentence inputs (take into account special tokens)." 
+ ) + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} + ) + validation_split_percentage: Optional[int] = field( + default=5, + metadata={ + "help": "The percentage of the train set used as validation set in case there's no validation split" + }, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + keep_linebreaks: bool = field( + default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} + ) + + def __post_init__(self): + if self.streaming: + require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") + + if self.dataset_name is None and self.train_file is None and self.validation_file is None: + raise ValueError("Need either a dataset name or a training/validation file.") + else: + if self.train_file is not None: + extension = self.train_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." + if self.validation_file is not None: + extension = self.validation_file.split(".")[-1] + assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file." + + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + else: + model_args, data_args, training_args = parser.parse_args_into_dataclasses() + + # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The + # information sent is the one passed as arguments along with your Python/PyTorch versions. + send_example_telemetry("run_clm", model_args, data_args) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + if training_args.should_log: + # The default of training_args.log_level is passive, so we set log level at info here to have that default. + transformers.utils.logging.set_verbosity_info() + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. 
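+    # If `output_dir` already holds a checkpoint and `--overwrite_output_dir` is not set,
+    # training resumes from the most recent checkpoint rather than starting over.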
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome this."
+            )
+        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    #
+    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
+    # download the dataset.
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        raw_datasets = load_dataset(
+            data_args.dataset_name,
+            data_args.dataset_config_name,
+            cache_dir=model_args.cache_dir,
+            token=model_args.token,
+            streaming=data_args.streaming,
+            trust_remote_code=model_args.trust_remote_code,
+        )
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+                token=model_args.token,
+                streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
+            )
+            raw_datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+                token=model_args.token,
+                streaming=data_args.streaming,
+                trust_remote_code=model_args.trust_remote_code,
+            )
+    else:
+        data_files = {}
+        dataset_args = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = (
+            data_args.train_file.split(".")[-1]
+            if data_args.train_file is not None
+            else data_args.validation_file.split(".")[-1]
+        )
+        if extension == "txt":
+            extension = "text"
+            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
+        raw_datasets = load_dataset(
+            extension,
+            data_files=data_files,
+            cache_dir=model_args.cache_dir,
+            token=model_args.token,
+            **dataset_args,
+        )
+        # If no validation data is present, validation_split_percentage will be used to divide the dataset.
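+        # For example, with the default `validation_split_percentage` of 5, `train[:5%]`
+        # becomes the validation split and `train[5%:]` the training split.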
+ if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + token=model_args.token, + **dataset_args, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + token=model_args.token, + **dataset_args, + ) + + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. + + config_kwargs = { + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, + } + if model_args.config_name: + config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) + elif model_args.model_name_or_path: + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[model_args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + if model_args.config_overrides is not None: + logger.info(f"Overriding config: {model_args.config_overrides}") + config.update_from_string(model_args.config_overrides) + logger.info(f"New config: {config}") + + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "token": model_args.token, + "trust_remote_code": model_args.trust_remote_code, + } + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script. " + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + + if model_args.model_name_or_path: + torch_dtype = ( + model_args.torch_dtype + if model_args.torch_dtype in ["auto", None] + else getattr(torch, model_args.torch_dtype) + ) + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, + torch_dtype=torch_dtype, + low_cpu_mem_usage=model_args.low_cpu_mem_usage, + ) + else: + model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) + n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) + logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + + # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch + # on a small vocab and want a smaller embedding size, remove this test. + embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) + + # Preprocessing the datasets. 
+ # First we tokenize all the texts. + if training_args.do_train: + column_names = list(raw_datasets["train"].features) + else: + column_names = list(raw_datasets["validation"].features) + text_column_name = "text" if "text" in column_names else column_names[0] + + # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") + + def tokenize_function(examples): + with CaptureLogger(tok_logger) as cl: + output = tokenizer(examples[text_column_name]) + # clm input could be much much longer than block_size + if "Token indices sequence length is longer than the" in cl.out: + tok_logger.warning( + "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits" + " before being passed to the model." + ) + return output + + with training_args.main_process_first(desc="dataset map tokenization"): + if not data_args.streaming: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + desc="Running tokenizer on dataset", + ) + else: + tokenized_datasets = raw_datasets.map( + tokenize_function, + batched=True, + remove_columns=column_names, + ) + if hasattr(config, "max_position_embeddings"): + max_pos_embeddings = config.max_position_embeddings + else: + # Define a default value if the attribute is missing in the config. + max_pos_embeddings = 1024 + + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > max_pos_embeddings: + logger.warning( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + f"Using block_size={min(1024, max_pos_embeddings)} instead. You can change that default value by passing --block_size xxx." + ) + if max_pos_embeddings > 0: + block_size = min(1024, max_pos_embeddings) + else: + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model " + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, and if the total_length < block_size we exclude this batch and return an empty dict. + # We could add padding if the model supported it instead of this drop, you can customize this part to your needs. + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. 
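+    #
+    # For example, with block_size=4, input_ids [[1, 2, 3], [4, 5, 6, 7, 8, 9, 10]]
+    # are concatenated into [1, ..., 10] and split into [1, 2, 3, 4] and [5, 6, 7, 8],
+    # dropping the remainder [9, 10]; labels are a copy of input_ids because the
+    # model shifts them internally when computing the causal LM loss.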
+ # + # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/process#map + + with training_args.main_process_first(desc="grouping texts together"): + if not data_args.streaming: + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + desc=f"Grouping texts in chunks of {block_size}", + ) + else: + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + ) + + if training_args.do_train: + if "train" not in tokenized_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = lm_datasets["train"] + if data_args.max_train_samples is not None: + max_train_samples = min(len(train_dataset), data_args.max_train_samples) + train_dataset = train_dataset.select(range(max_train_samples)) + + if training_args.do_eval: + if "validation" not in tokenized_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = lm_datasets["validation"] + if data_args.max_eval_samples is not None: + max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) + eval_dataset = eval_dataset.select(range(max_eval_samples)) + + def preprocess_logits_for_metrics(logits, labels): + if isinstance(logits, tuple): + # Depending on the model and config, logits may contain extra tensors, + # like past_key_values, but logits always come first + logits = logits[0] + return logits.argmax(dim=-1) + + metric = evaluate.load("accuracy", cache_dir=model_args.cache_dir) + + def compute_metrics(eval_preds): + preds, labels = eval_preds + # preds have the same shape as the labels, after the argmax(-1) has been calculated + # by preprocess_logits_for_metrics but we need to shift the labels + labels = labels[:, 1:].reshape(-1) + preds = preds[:, :-1].reshape(-1) + return metric.compute(predictions=preds, references=labels) + + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + processing_class=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. 
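+        # Since `group_texts` already produced fixed-length blocks, no padding is needed and
+        # `default_data_collator` simply stacks the examples into batches.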
+ data_collator=default_data_collator, + compute_metrics=compute_metrics if training_args.do_eval and not is_torch_xla_available() else None, + preprocess_logits_for_metrics=preprocess_logits_for_metrics + if training_args.do_eval and not is_torch_xla_available() + else None, + ) + + # Training + if training_args.do_train: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + trainer.save_model() # Saves the tokenizer too for easy upload + + metrics = train_result.metrics + + max_train_samples = ( + data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) + ) + metrics["train_samples"] = min(max_train_samples, len(train_dataset)) + + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + # Evaluation + if training_args.do_eval: + logger.info("*** Evaluate ***") + + metrics = trainer.evaluate() + + max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) + metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) + try: + perplexity = math.exp(metrics["eval_loss"]) + except OverflowError: + perplexity = float("inf") + metrics["perplexity"] = perplexity + + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} + if data_args.dataset_name is not None: + kwargs["dataset_tags"] = data_args.dataset_name + if data_args.dataset_config_name is not None: + kwargs["dataset_args"] = data_args.dataset_config_name + kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" + else: + kwargs["dataset"] = data_args.dataset_name + + if training_args.push_to_hub: + trainer.push_to_hub(**kwargs) + else: + trainer.create_model_card(**kwargs) + + +def _mp_fn(index): + # For xla_spawn (TPUs) + main() + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..29f3cbd --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +datasets +evaluate +torch +transformers \ No newline at end of file diff --git a/rtl.ipynb b/rtl.ipynb deleted file mode 100644 index 039764c..0000000 --- a/rtl.ipynb +++ /dev/null @@ -1,191 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 4, - "id": "8ddb479e-9d7e-4392-8fc0-fd1c66a07a2b", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n", - "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([1, 512])\n" - ] - } - ], - "source": [ - "import torch\n", - "import transformers\n", - "transformers.set_seed(42)\n", - "device = \"cuda\"\n", - "# import sys\n", - "\n", - "# for key in list(sys.modules):\n", - "# if key.startswith(\"transformers.\"):\n", - "# sys.modules.pop(key)\n", - "\n", - "from transformers import AutoModelForMaskedLM\n", - "model = AutoModelForMaskedLM.from_pretrained(\"bert-base-uncased\", torch_dtype=torch.float16, attn_implementation=\"sdpa\").to(device)\n", - "\n", - "from transformers import AutoTokenizer\n", - "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n", - "\n", - "model.config.alek_says_ltr = True\n", - "model.config.alek_says_rtl = False\n", - "from datasets import load_dataset\n", - "\n", - "ds = load_dataset(\"Salesforce/wikitext\", \"wikitext-103-v1\")\n", - "train_ds = ds[\"train\"]\n", - "inputs = tokenizer(train_ds[10][\"text\"], return_tensors=\"pt\", padding='max_length', truncation=True)\n", - "\n", - "print(inputs[\"input_ids\"].size())" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "30472f6c-31d6-4768-8dca-d3535be28501", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "output = model(**{k: v.to(device) for k, v in inputs.items()})" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "ac46bf41-8cd3-4190-aa4b-6142d6d4d986", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[ -7.1094, -7.1445, -7.2148, ..., -6.6484, -7.0703, -3.6758],\n", - " [-14.2969, -14.2656, -14.3828, ..., -11.9766, -11.3281, -9.4922],\n", - " [-10.7344, -10.6250, -10.7266, ..., -8.6641, -8.2188, -5.0859],\n", - " ...,\n", - " [ -3.4277, -3.5664, -3.9434, ..., -2.0000, -4.4727, -3.7148],\n", - " [ -3.7227, -3.8770, -4.2383, ..., -2.1367, -4.5977, -3.9336],\n", - " [ -4.2070, -4.3672, -4.7578, ..., -2.4941, -4.7734, -4.7227]]],\n", - " device='cuda:0', dtype=torch.float16, grad_fn=)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output.logits" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "baa757ff-3ba2-4a72-b819-a2283b729c18", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[-6.3281, -6.3555, -6.4531, ..., -5.5234, -4.1797, -5.7891],\n", - " [-6.7891, -6.6914, -6.7812, ..., -6.1680, -5.1094, -5.5273],\n", - " [-7.1641, -7.1055, -7.0625, ..., -6.2383, -5.3711, -5.5273],\n", - " ...,\n", - " [ 9.4844, 8.9219, 9.2422, ..., 7.6133, 7.2578, 9.9062],\n", - " [10.3672, 9.8516, 10.1797, ..., 8.5547, 8.0781, 10.5938],\n", - " [ 8.3828, 8.0781, 8.1641, ..., 7.2422, 6.7734, 7.9961]]],\n", - " device='cuda:0', dtype=torch.float16, grad_fn=)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output.logits" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "a33632fb-ad41-49e3-acee-91d1dda974b8", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "outputs": [], - "source": [ - "# 
output1 = model(**{k: v.to(device) for k, v in inputs.items()})\n",
-    "# print(output1.logits)\n",
-    "# output2 = model(**{k: v.to(device) for k, v in inputs.items()}, encoder_attention_mask=torch.zeros(1, 512, 512))\n",
-    "# print(output2.logits)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "ad432f29-f77a-4b84-b6b4-347b74c82f5b",
-   "metadata": {},
-   "source": [
-    "## plan for finishing phase 1\n",
-    "\n",
-    "- fix the tokenizer\n",
-    "- pretrain on RTL + LTR\n",
-    "- check perplexities\n",
-    "\n",
-    "## plan for phase 2\n",
-    "- AQ\n",
-    "\n",
-    "## plan for phase 1.5\n",
-    "- addition"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..7f922fe
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,45 @@
+import torch
+import torch.nn as nn
+import transformers
+
+
+def ltr_mask(seq_len: int) -> torch.Tensor:
+    mask = torch.ones((seq_len, seq_len), dtype=torch.bool)
+    return torch.tril(mask, diagonal=-1)
+
+
+def rtl_mask(seq_len: int) -> torch.Tensor:
+    return ltr_mask(seq_len).T
+
+
+def add_attn_hooks(model: transformers.BertModel, text_direction: str) -> None:
+    """
+    Forces bidirectional `model` into a unidirectional one based on `text_direction`.
+    Adds hooks to `model`'s self-attention blocks, in-place.
+
+    Args:
+        model: only implemented for BERT models right now
+        text_direction: one of "ltr" or "rtl"
+    """
+    assert text_direction.lower() in ("ltr", "rtl")
+    mask_func = ltr_mask if text_direction.lower() == "ltr" else rtl_mask
+    model.register_buffer("attn_mask", mask_func(model.config.max_position_embeddings).to(model.device))
+
+    def attn_hook(attn_module: nn.Module, args: tuple, kwargs: dict):
+        """
+        Assuming https://github.com/huggingface/transformers/blob/33868a057c02f0368ba63bd1edb746be38fe3d90/src/transformers/models/bert/modeling_bert.py#L515
+        so no `kwargs` and `attention_mask` is the second positional arg.
+
+        Reads the shared `model.attn_mask` buffer via closure to save memory.
+        """
+        assert not kwargs
+
+        args = list(args)
+        assert args[1].size()[-2:] == model.attn_mask.size(), f"{args[1].size()=} {model.attn_mask.size()=}"
+        args[1] = model.attn_mask
+        return tuple(args), kwargs
+
+    for name, module in model.named_modules():
+        if isinstance(module, transformers.models.bert.modeling_bert.BertSelfAttention):
+            module._forward_pre_hooks.clear()  # clear previously registered pre-hooks, in case we run multiple times
+            module.register_forward_pre_hook(attn_hook, with_kwargs=True)
-- cgit v1.2.3-70-g09d2
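A minimal usage sketch for the new `utils.add_attn_hooks` (illustrative only: the model and
tokenizer names mirror the deleted notebooks, and the hook layout assumes the pinned
`transformers` BERT internals referenced in the hook's docstring):

    import torch
    import transformers

    from utils import add_attn_hooks

    # Load a bidirectional BERT and force it into a right-to-left unidirectional model.
    model = transformers.AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
    tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
    add_attn_hooks(model, "rtl")
    model.eval()

    # Pad to max_position_embeddings so the hook's mask-shape assertion holds.
    inputs = tokenizer("hello world", return_tensors="pt", padding="max_length", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits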