From 3467ea9ea53960ec68a49ce1dc3da44248304c4e Mon Sep 17 00:00:00 2001 From: jayeshthk Date: Sun, 29 Jun 2025 17:17:57 +0530 Subject: [PATCH 1/2] ModernBERT For Named Entity Recognition. Finetuned on Conll2003 dataset. --- examples/ModernBERT_NER.ipynb | 4913 +++++++++++++++++++++++++++++++++ 1 file changed, 4913 insertions(+) create mode 100644 examples/ModernBERT_NER.ipynb diff --git a/examples/ModernBERT_NER.ipynb b/examples/ModernBERT_NER.ipynb new file mode 100644 index 00000000..6b01e999 --- /dev/null +++ b/examples/ModernBERT_NER.ipynb @@ -0,0 +1,4913 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "N1_GgA-_uQRb" + }, + "outputs": [], + "source": [ + "# !pip install transformers datasets torch accelerate seqeval\n", + "# For GPU training\n", + "# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Vs4d2KX8xOtd", + "outputId": "f40c57b5-9cac-4ab3-85d2-63d279605c3f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting seqeval\n", + " Downloading seqeval-1.2.2.tar.gz (43 kB)\n", + "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/43.6 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: numpy>=1.14.0 in /usr/local/lib/python3.11/dist-packages (from seqeval) (2.0.2)\n", + "Requirement already satisfied: scikit-learn>=0.21.3 in /usr/local/lib/python3.11/dist-packages (from seqeval) (1.6.1)\n", + "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.15.3)\n", + "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.5.1)\n", + "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=0.21.3->seqeval) (3.6.0)\n", + "Building wheels for collected packages: seqeval\n", + " Building wheel for seqeval (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=b6d61b8ca1a1a12910469b2e683be4cae17bcb2d9febe655ae1ad4faacab144a\n", + " Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead\n", + "Successfully built seqeval\n", + "Installing collected packages: seqeval\n", + "Successfully installed seqeval-1.2.2\n" + ] + } + ], + "source": [ + "!pip install seqeval" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "IPJ3CXlIxlDC", + "outputId": "8bdd48dc-8c83-4182-bf53-c37f72ec4c6c" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: datasets in /usr/local/lib/python3.11/dist-packages (2.14.4)\n", + "Collecting datasets\n", + " Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from datasets) (3.18.0)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from datasets) (2.0.2)\n", + "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (18.1.0)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.3.7)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from datasets) (2.2.2)\n", + "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.11/dist-packages (from datasets) (2.32.3)\n", + "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.11/dist-packages (from datasets) (4.67.1)\n", + "Requirement already satisfied: xxhash in /usr/local/lib/python3.11/dist-packages (from datasets) (3.5.0)\n", + "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.70.15)\n", + "Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)\n", + " Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)\n", + "Requirement already satisfied: huggingface-hub>=0.24.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.33.0)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from datasets) (24.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from datasets) (6.0.2)\n", + "Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/lib/python3.11/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (3.11.15)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.24.0->datasets) (4.14.0)\n", + "Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.24.0->datasets) (1.1.5)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (3.4.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (2.4.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in 
/usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (2025.6.15)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2025.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2025.2)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.3.2)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (25.3.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.7.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (6.4.4)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (0.3.2)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.20.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n", + "Downloading datasets-3.6.0-py3-none-any.whl (491 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m491.5/491.5 kB\u001b[0m \u001b[31m18.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m193.6/193.6 kB\u001b[0m \u001b[31m18.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: fsspec, datasets\n", + " Attempting uninstall: fsspec\n", + " Found existing installation: fsspec 2025.3.2\n", + " Uninstalling fsspec-2025.3.2:\n", + " Successfully uninstalled fsspec-2025.3.2\n", + " Attempting uninstall: datasets\n", + " Found existing installation: datasets 2.14.4\n", + " Uninstalling datasets-2.14.4:\n", + " Successfully uninstalled datasets-2.14.4\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.\n", + "torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.\n", + "torch 2.6.0+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cuda-nvrtc-cu12 12.5.82 which is incompatible.\n", + "torch 2.6.0+cu124 requires nvidia-cuda-runtime-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cuda-runtime-cu12 12.5.82 which is incompatible.\n", + "torch 2.6.0+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cudnn-cu12 9.3.0.75 which is incompatible.\n", + "torch 2.6.0+cu124 requires nvidia-cufft-cu12==11.2.1.3; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cufft-cu12 11.2.3.61 which is incompatible.\n", + "torch 2.6.0+cu124 requires nvidia-curand-cu12==10.3.5.147; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-curand-cu12 10.3.6.82 which is incompatible.\n", + "torch 2.6.0+cu124 requires nvidia-cusolver-cu12==11.6.1.9; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cusolver-cu12 11.6.3.83 which is incompatible.\n", + "torch 2.6.0+cu124 requires nvidia-cusparse-cu12==12.3.1.170; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cusparse-cu12 12.5.1.3 which is incompatible.\n", + "torch 2.6.0+cu124 requires nvidia-nvjitlink-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-nvjitlink-cu12 12.5.82 which is incompatible.\n", + "gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed datasets-3.6.0 fsspec-2025.3.0\n" + ] + }, + { + "data": { + "application/vnd.colab-display-data+json": { + "id": "a2623491c78b4d0a82872ef849fec6b9", + "pip_warning": { + "packages": [ + "datasets", + "fsspec" + ] + } + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "!pip install -U datasets\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "l3o0vdx1uRbZ" + }, + "outputs": [], + "source": [ + "import torch\n", + "from datasets import load_dataset, Dataset\n", + "from transformers import (\n", + " AutoTokenizer,\n", + " AutoModelForTokenClassification,\n", + " TrainingArguments,\n", + " Trainer,\n", + " DataCollatorForTokenClassification\n", + ")\n", + "from seqeval.metrics import accuracy_score, classification_report, f1_score\n", + "import numpy as np\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LCdaBaN2uRd2", + "outputId": "4413ecc2-2066-4f1a-eba2-ba89ee147f01" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token 
in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "\n", + "# Load ModernBERT tokenizer and model\n", + "model_name = \"answerdotai/ModernBERT-base\"\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 81, + "referenced_widgets": [ + "1c78ee94d28249c0b1ae757fce2b5d7b", + "f5b52e204b02491a8b606d73e043bd7f", + "6b27d8dfc81240439d8b50b1002efc92", + "195be4c448934cef922fb3af150aa389", + "dd89dac65d7b453da161009c88774d0b", + "12f722114e714af5a1138fd8605a2caa", + "02c589e29a2b4eb7a0692a262f371ddf", + "3f45bc7ba75843e1b0f6d0350e67a47f", + "a14305090ba0480e8c13ce3c0edaca8a", + "4b01b4ba50a240d3acd895b1dab7bdf7", + "fffeed9a91e24364b95d615debd1a68d", + "33f2b12bda244ccf81488963a7c1d45e", + "064c228d94214dbea5de716e2d9c8cad", + "57278ac9a4ea48d4b1a40b9e7006efb6", + "116bb67200f642a2bd9f46ae7fda9e69", + "669eb6af91b5415f96e14d852e79f26b", + "451fff47e8834615942477cebb979668", + "480e7e4afd0c4fe3a3ea4e67b7e67d4a", + "fb6a861f867c4516b4c891ace00e9081", + "a7e2337a4bb84124b1604d0bb7aeceee", + "18b5b0b98ec24b8e977be8a153b73c2a", + "3bd7a4fc2a804992a263b613d1bc982e" + ] + }, + "id": "4qUTkIOJuRgc", + "outputId": "6be687d4-6481-48b9-e628-a7a5d1775ff0" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1c78ee94d28249c0b1ae757fce2b5d7b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "README.md: 0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "33f2b12bda244ccf81488963a7c1d45e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "conll2003.py: 0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# Example: Load CoNLL-2003 dataset for NER\n", + "# You can replace this with your own dataset\n", + "dataset = load_dataset(\"conll2003\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "A_oLinbVuRii", + "outputId": "06ccf7fc-f134-40e3-df2d-8c4957d9eb48" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n", + " num_rows: 14041\n", + " })\n", + " validation: Dataset({\n", + " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n", + " num_rows: 3250\n", + " })\n", + " test: Dataset({\n", + " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n", + " num_rows: 3453\n", + " })\n", + "})" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eG81u-sOuRkp", + "outputId": "99b2a3dc-af98-45d0-fcf8-baefceaf6208" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 
'B-MISC', 'I-MISC']\n", + "Number of labels: 9\n" + ] + } + ], + "source": [ + "# Define label names and create label mappings\n", + "label_list = dataset[\"train\"].features[\"ner_tags\"].feature.names\n", + "label_to_id = {label: i for i, label in enumerate(label_list)}\n", + "id_to_label = {i: label for i, label in enumerate(label_list)}\n", + "\n", + "print(f\"Labels: {label_list}\")\n", + "print(f\"Number of labels: {len(label_list)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 136, + "referenced_widgets": [ + "5caa301d730a4c6f8d4167c62d103995", + "1e52915ef25044b5aeaafb1b3993de2a", + "a3f433e7b9bb4644bfadf5a2e2b80b0f", + "2f535a1c264b463ea3351e5028f5a730", + "f334d38b096b48b4988b3a52489380a5", + "8e85ca027e3b4a36bccb6b00eb1d7610", + "c9be4797e433412f8ded18269ec79a15", + "b33b1828dddb42efb336a03be1986bab", + "b1b42446094e4d90bd39ec7aff2e834d", + "2f10a56770a64bbb9be656b0d441dbff", + "03ff4d8e1baa4db19cc25b5d25169e3a", + "bf13b28f699e421096410756130ee0cb", + "1c2543cacd02423d8d4fe803e5d21551", + "2218fb69a65f487b9e77cf8b9d17531c", + "614d0ebfe2ae49758d3fe1290414425c", + "0757ab90a27e46318c23a37524835da1", + "a0d384b013444141bc42b0fbfabf36b9", + "3856360d462c43dfa238385af2461ff1", + "40b5ecc6c9a84c08a2b90f1479a5a6a8", + "60ad7173d8c14324a90377c28e0feb4c", + "e5821959b1ac49fd89caf3c956c3379f", + "2fafb9e4e7dc49d1ae0d054abe58d862" + ] + }, + "id": "1gxMM1yZuRms", + "outputId": "a421fab5-59b0-4faa-a12d-3ffbc554a7bd" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5caa301d730a4c6f8d4167c62d103995", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bf13b28f699e421096410756130ee0cb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "model.safetensors: 0%| | 0.00/599M [00:00\n", + " \n", + " \n", + " [2634/2634 10:55, Epoch 3/3]\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
      Step    Training Loss    Validation Loss    Accuracy    F1
       500         0.117900           0.104912    0.971029    0.824140
      1000         0.039800           0.057582    0.985398    0.910766
      1500         0.027800           0.051213    0.988026    0.929259
      2000         0.012300           0.049220    0.988883    0.934884
      2500         0.011300           0.046781    0.989155    0.936273
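# --- Hedged sketch, not part of the original notebook output above ---
# The Accuracy and F1 columns in the training table come from a seqeval-based
# compute_metrics function passed to the Trainer; that cell is not shown in this
# excerpt, so the following is only a minimal sketch of the usual token-classification
# metric computation. It assumes `label_list` from the earlier cell and the standard
# -100 masking of special tokens / non-first sub-word pieces.
import numpy as np
from seqeval.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)
    # Drop positions labelled -100 (special tokens and continuation sub-words)
    true_labels = [
        [label_list[l] for l in label_row if l != -100]
        for label_row in labels
    ]
    true_predictions = [
        [label_list[p] for p, l in zip(pred_row, label_row) if l != -100]
        for pred_row, label_row in zip(predictions, labels)
    ]
    return {
        "accuracy": accuracy_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }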

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=2634, training_loss=0.10961505901931178, metrics={'train_runtime': 668.867, 'train_samples_per_second': 62.977, 'train_steps_per_second': 3.938, 'total_flos': 1577408010395238.0, 'train_loss': 0.10961505901931178, 'epoch': 3.0})" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Train the model\n", + "print(\"Starting training...\")\n", + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1rwVhSWyymtS", + "outputId": "c5b27a01-1c4c-4f3d-b511-144fc040075a" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('./modernbert-ner/tokenizer_config.json',\n", + " './modernbert-ner/special_tokens_map.json',\n", + " './modernbert-ner/tokenizer.json')" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Save the model\n", + "trainer.save_model()\n", + "tokenizer.save_pretrained(\"./modernbert-ner\")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 92 + }, + "id": "5-vK4SCMuOaV", + "outputId": "dc568d98-1c2c-4de0-cdf8-32def8a55d21" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [204/204 00:16]\n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test results: {'eval_loss': 0.04691711440682411, 'eval_accuracy': 0.9892527549550251, 'eval_f1': 0.9363408521303258, 'eval_runtime': 16.8501, 'eval_samples_per_second': 192.877, 'eval_steps_per_second': 12.107, 'epoch': 3.0}\n", + "Training completed!\n" + ] + } + ], + "source": [ + "# Evaluate on test set\n", + "test_results = trainer.evaluate(tokenized_valid)\n", + "print(f\"Test results: {test_results}\")\n", + "\n", + "print(\"Training completed!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jyCvDUGe8D14", + "outputId": "153b738f-87be-4aba-946f-b4161f5e2d86" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Input text: Amazon founder Jeff Bezos, sourish and Tesla CEO Elon Musk attended the World Economic Forum in Davos, Switzerland, where they discussed space exploration with NASA administrator Bill Nelson and European Space Agency director Josef Aschbacher.\n", + "--------------------------------------------------\n", + "Token Label \n", + "--------------------------------------------------\n", + "[CLS] O \n", + "Amazon B-ORG \n", + "Ġfounder O \n", + "ĠJeff B-PER \n", + "ĠBe I-PER \n", + "zos I-PER \n", + ", O \n", + "Ġsour B-PER \n", + "ish O \n", + "Ġand O \n", + "ĠTesla B-ORG \n", + "ĠCEO O \n", + "ĠEl B-PER \n", + "on I-PER \n", + "ĠMusk I-PER \n", + "Ġattended O \n", + "Ġthe O \n", + "ĠWorld B-MISC \n", + "ĠEconomic I-MISC \n", + "ĠForum I-MISC \n", + "Ġin O \n", + "ĠDav B-LOC \n", + "os I-LOC \n", + ", O \n", + "ĠSwitzerland B-LOC \n", + ", O \n", + "Ġwhere O \n", + "Ġthey O \n", + "Ġdiscussed O \n", + "Ġspace O \n", + "Ġexploration O \n", + "Ġwith O \n", + "ĠNASA B-ORG \n", + "Ġadministrator O \n", + "ĠBill B-PER \n", + "ĠNelson I-PER \n", + "Ġand O \n", + "ĠEuropean B-ORG \n", + "ĠSpace I-ORG \n", + "ĠAgency I-ORG \n", + "Ġdirector O \n", + "ĠJose B-PER \n", + "f I-PER \n", + "ĠAs I-PER \n", + "ch I-PER \n", + "b I-PER \n", + "acher O \n", + ". 
O \n", + "[SEP] I-ORG \n" + ] + } + ], + "source": [ + "# Inference on new text\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "model.to(device)\n", + "test_text = \"Amazon founder Jeff Bezos, sourish and Tesla CEO Elon Musk attended the World Economic Forum in Davos, Switzerland, where they discussed space exploration with NASA administrator Bill Nelson and European Space Agency director Josef Aschbacher.\"\n", + "inputs = tokenizer(test_text, return_tensors=\"pt\", truncation=True, padding=True)\n", + "inputs = {k: v.to(device) for k, v in inputs.items()} # Move inputs to same device as model\n", + "with torch.no_grad():\n", + " outputs = model(**inputs)\n", + " predictions = torch.argmax(outputs.logits, dim=2)\n", + " tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"][0])\n", + " predicted_labels = [id_to_label[pred.item()] for pred in predictions[0]]\n", + "\n", + " # Better aligned output\n", + " print(f\"\\nInput text: {test_text}\")\n", + " print(\"-\" * 50)\n", + " print(f\"{'Token':<15} {'Label':<10}\")\n", + " print(\"-\" * 50)\n", + " for token, label in zip(tokens, predicted_labels):\n", + " print(f\"{token:<15} {label:<10}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-hLU3syz8hbO" + }, + "outputs": [], + "source": [ + "# ignore first and last special characters" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 232, + "referenced_widgets": [ + "552808a885b147f9b945f4fb661b3c94", + "d06d2c1c2e7b4d98b8464dd452e373a5", + "48f27f37f3c64a1d8b1e4679402320d7", + "e1733124803e48f398f8a7951ec972f7", + "0cd18b7563794dd2aded8fea834c9b58", + "572c2b24bcda4a33ad8ee3cb45bac2e2", + "8e0f7aee23d243f9a1b20d299d0075c4", + "b273795dced4410f936e5cfdcf288cfe", + "d9ea2a76fa1a45ed871cda69a72527c2", + "6fddd60c04434dbcaf8905359bae436e", + "3c460bd2ed144a49a0bfbe5a99ba8dd7", + "c7f801b792e946d8b71d0c5f1bd3230d", + "f1a30858927f408f8f7331c0d842b221", + "9c2b1fd350ad47a698329ed3e042cbd3", + "82a1ebbb56fd42739546c47ccad411b6", + "103f36fc42ad4db0b2588db45e2a5e83", + "220018db6bce4a2b99804df65ccbbf51", + "cf5ceb2ec5404b83ab794e358b1acc8d", + "e2f09149d382493cb1549972f5c5f314", + "509aea4f1e2f4b9c94f328f70328dae8", + "b555119b90994539aa97af2992f09f53", + "8b1ff1a1f9254e7bbbe34ea9be5ec97f", + "bcaa9e2e52064479bd21047c4dd1a7a6", + "29efbd23d32a4bea96f84de9143c3eaf", + "e3c751edbfe94ce08d44f66b42874eaf", + "ca8c6bdb436e4f958fbb9c5597f3c11f", + "6734bb636eaf40db881f0ed6655f50b2", + "c0c74f0d5e644aa992d13fc27b230f9a", + "f298b7e1c2f142ef8e1b16004331dc3b", + "bc2544a2ca0f4c738fc5c4601feb1be2", + "3589800cdafc489996d69cf101dca889", + "373253dd14cc438eb7e24bf7b830c5c3", + "27cb4b02c95345028658a8b7853b548c", + "94ac03f6736e441397e84430c37d6bd4", + "2b040bfe6b2340e78a4fa4fcecd13791", + "0d5a01c067f74604ba4166560367eb55", + "ef5177a4b729434bb4c6aa588997f8f2", + "e32dc3643bea4a86bc351905ac4a4cb8", + "a9aaf60aabf24f69a9bc06c0e2fcb09a", + "0d72626fb9cf4acaadfcab09b055f8de", + "0ce5c2a63c59432caa957765bcfdc8d3", + "fef7129b003b4762966928dfc5ddf5a6", + "ce2c0ab9d08f433eab8f0f06011c211d", + "75b8e9fbdad44b19854ec2cf528184e1", + "746f3beb4c29469096262b7ea514ec21", + "289059cebe834a75b0ae67c54d4ebce4", + "e4645f02e5b24c368f15863b88336426", + "86efa3c5153b4ef1be290540ba96e14e", + "ea4e3cae81834495b90493266ffa1f13", + "109c6adae1f04a2e971a3c2fd4062af8", + "27450f4ff4154947a8d4d82354476200", + "b16343a3385f425192bb498fa2666f81", + 
"4889aaab980041b9a559c6ebd23049e5", + "aa8ab04168be4473a4141f1b91ccbf25", + "b4e44aeb79b14a5797258757ee13134d" + ] + }, + "id": "SB4RTqAN-27M", + "outputId": "5f799cc4-61c3-44f3-d80c-1659ead5ffa1" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "552808a885b147f9b945f4fb661b3c94", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "tokenizer_config.json: 0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c7f801b792e946d8b71d0c5f1bd3230d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "tokenizer.json: 0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bcaa9e2e52064479bd21047c4dd1a7a6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "special_tokens_map.json: 0%| | 0.00/694 [00:00 Date: Tue, 1 Jul 2025 11:58:06 +0530 Subject: [PATCH 2/2] [FXD] notebook : Github colab notebook rendering --- examples/ModernBERT_NER.ipynb | 4600 ++------------------------------- 1 file changed, 201 insertions(+), 4399 deletions(-) diff --git a/examples/ModernBERT_NER.ipynb b/examples/ModernBERT_NER.ipynb index 6b01e999..2a0e441a 100644 --- a/examples/ModernBERT_NER.ipynb +++ b/examples/ModernBERT_NER.ipynb @@ -1,162 +1,56 @@ { + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, "cells": [ { "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "N1_GgA-_uQRb" - }, - "outputs": [], "source": [ "# !pip install transformers datasets torch accelerate seqeval\n", "# For GPU training\n", "# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118" - ] + ], + "metadata": { + "id": "N1_GgA-_uQRb" + }, + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Vs4d2KX8xOtd", - "outputId": "f40c57b5-9cac-4ab3-85d2-63d279605c3f" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting seqeval\n", - " Downloading seqeval-1.2.2.tar.gz (43 kB)\n", - "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/43.6 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: numpy>=1.14.0 in /usr/local/lib/python3.11/dist-packages (from seqeval) (2.0.2)\n", - "Requirement already satisfied: scikit-learn>=0.21.3 in /usr/local/lib/python3.11/dist-packages (from seqeval) (1.6.1)\n", - "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.15.3)\n", - "Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=0.21.3->seqeval) (1.5.1)\n", - "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=0.21.3->seqeval) (3.6.0)\n", - "Building wheels for collected packages: seqeval\n", - " Building wheel for seqeval (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=b6d61b8ca1a1a12910469b2e683be4cae17bcb2d9febe655ae1ad4faacab144a\n", - " Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead\n", - "Successfully built seqeval\n", - "Installing collected packages: seqeval\n", - "Successfully installed seqeval-1.2.2\n" - ] - } - ], "source": [ "!pip install seqeval" - ] + ], + "metadata": { + "id": "Vs4d2KX8xOtd" + }, + "execution_count": 1, + "outputs": [] }, { "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "IPJ3CXlIxlDC", - "outputId": "8bdd48dc-8c83-4182-bf53-c37f72ec4c6c" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: datasets in /usr/local/lib/python3.11/dist-packages (2.14.4)\n", - "Collecting datasets\n", - " Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from datasets) (3.18.0)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from datasets) (2.0.2)\n", - "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (18.1.0)\n", - "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.3.7)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from datasets) (2.2.2)\n", - "Requirement already satisfied: requests>=2.32.2 in /usr/local/lib/python3.11/dist-packages (from datasets) (2.32.3)\n", - "Requirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.11/dist-packages (from datasets) (4.67.1)\n", - "Requirement already satisfied: xxhash in /usr/local/lib/python3.11/dist-packages (from datasets) (3.5.0)\n", - "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.70.15)\n", - "Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)\n", - " Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)\n", - "Requirement already satisfied: huggingface-hub>=0.24.0 in /usr/local/lib/python3.11/dist-packages (from datasets) (0.33.0)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from datasets) (24.2)\n", - "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from datasets) (6.0.2)\n", - "Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in 
/usr/local/lib/python3.11/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (3.11.15)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.24.0->datasets) (4.14.0)\n", - "Requirement already satisfied: hf-xet<2.0.0,>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub>=0.24.0->datasets) (1.1.5)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (3.4.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (3.10)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (2.4.0)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests>=2.32.2->datasets) (2025.6.15)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2.9.0.post0)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2025.2)\n", - "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets) (2025.2)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2.6.1)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.3.2)\n", - "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (25.3.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.7.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (6.4.4)\n", - "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (0.3.2)\n", - "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.20.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n", - "Downloading datasets-3.6.0-py3-none-any.whl (491 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m491.5/491.5 kB\u001b[0m \u001b[31m18.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m193.6/193.6 kB\u001b[0m \u001b[31m18.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: fsspec, datasets\n", - " Attempting uninstall: fsspec\n", - " Found existing installation: fsspec 2025.3.2\n", - " Uninstalling fsspec-2025.3.2:\n", - " Successfully uninstalled fsspec-2025.3.2\n", - 
" Attempting uninstall: datasets\n", - " Found existing installation: datasets 2.14.4\n", - " Uninstalling datasets-2.14.4:\n", - " Successfully uninstalled datasets-2.14.4\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.\n", - "torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.\n", - "torch 2.6.0+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cuda-nvrtc-cu12 12.5.82 which is incompatible.\n", - "torch 2.6.0+cu124 requires nvidia-cuda-runtime-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cuda-runtime-cu12 12.5.82 which is incompatible.\n", - "torch 2.6.0+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cudnn-cu12 9.3.0.75 which is incompatible.\n", - "torch 2.6.0+cu124 requires nvidia-cufft-cu12==11.2.1.3; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cufft-cu12 11.2.3.61 which is incompatible.\n", - "torch 2.6.0+cu124 requires nvidia-curand-cu12==10.3.5.147; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-curand-cu12 10.3.6.82 which is incompatible.\n", - "torch 2.6.0+cu124 requires nvidia-cusolver-cu12==11.6.1.9; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cusolver-cu12 11.6.3.83 which is incompatible.\n", - "torch 2.6.0+cu124 requires nvidia-cusparse-cu12==12.3.1.170; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-cusparse-cu12 12.5.1.3 which is incompatible.\n", - "torch 2.6.0+cu124 requires nvidia-nvjitlink-cu12==12.4.127; platform_system == \"Linux\" and platform_machine == \"x86_64\", but you have nvidia-nvjitlink-cu12 12.5.82 which is incompatible.\n", - "gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed datasets-3.6.0 fsspec-2025.3.0\n" - ] - }, - { - "data": { - "application/vnd.colab-display-data+json": { - "id": "a2623491c78b4d0a82872ef849fec6b9", - "pip_warning": { - "packages": [ - "datasets", - "fsspec" - ] - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ "!pip install -U datasets\n" - ] + ], + "metadata": { + "id": "IPJ3CXlIxlDC" + }, + "execution_count": 2, + "outputs": [] }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "l3o0vdx1uRbZ" - }, - "outputs": [], "source": [ "import torch\n", "from datasets import load_dataset, Dataset\n", @@ -169,11 +63,21 @@ ")\n", "from seqeval.metrics import accuracy_score, classification_report, f1_score\n", "import numpy as np\n" - ] + ], + "metadata": { + "id": "l3o0vdx1uRbZ" + }, + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "execution_count": 2, + "source": [ + "\n", + "# Load ModernBERT tokenizer and model\n", + "model_name = \"answerdotai/ModernBERT-base\"\n", + "tokenizer = 
AutoTokenizer.from_pretrained(model_name)" + ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -181,10 +85,11 @@ "id": "LCdaBaN2uRd2", "outputId": "4413ecc2-2066-4f1a-eba2-ba89ee147f01" }, + "execution_count": null, "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", @@ -194,89 +99,27 @@ " warnings.warn(\n" ] } - ], - "source": [ - "\n", - "# Load ModernBERT tokenizer and model\n", - "model_name = \"answerdotai/ModernBERT-base\"\n", - "tokenizer = AutoTokenizer.from_pretrained(model_name)" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 81, - "referenced_widgets": [ - "1c78ee94d28249c0b1ae757fce2b5d7b", - "f5b52e204b02491a8b606d73e043bd7f", - "6b27d8dfc81240439d8b50b1002efc92", - "195be4c448934cef922fb3af150aa389", - "dd89dac65d7b453da161009c88774d0b", - "12f722114e714af5a1138fd8605a2caa", - "02c589e29a2b4eb7a0692a262f371ddf", - "3f45bc7ba75843e1b0f6d0350e67a47f", - "a14305090ba0480e8c13ce3c0edaca8a", - "4b01b4ba50a240d3acd895b1dab7bdf7", - "fffeed9a91e24364b95d615debd1a68d", - "33f2b12bda244ccf81488963a7c1d45e", - "064c228d94214dbea5de716e2d9c8cad", - "57278ac9a4ea48d4b1a40b9e7006efb6", - "116bb67200f642a2bd9f46ae7fda9e69", - "669eb6af91b5415f96e14d852e79f26b", - "451fff47e8834615942477cebb979668", - "480e7e4afd0c4fe3a3ea4e67b7e67d4a", - "fb6a861f867c4516b4c891ace00e9081", - "a7e2337a4bb84124b1604d0bb7aeceee", - "18b5b0b98ec24b8e977be8a153b73c2a", - "3bd7a4fc2a804992a263b613d1bc982e" - ] - }, - "id": "4qUTkIOJuRgc", - "outputId": "6be687d4-6481-48b9-e628-a7a5d1775ff0" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1c78ee94d28249c0b1ae757fce2b5d7b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "README.md: 0.00B [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "33f2b12bda244ccf81488963a7c1d45e", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "conll2003.py: 0.00B [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ "\n", "# Example: Load CoNLL-2003 dataset for NER\n", "# You can replace this with your own dataset\n", "dataset = load_dataset(\"conll2003\")" - ] + ], + "metadata": { + "id": "4qUTkIOJuRgc" + }, + "execution_count": 3, + "outputs": [] }, { "cell_type": "code", - "execution_count": 4, + "source": [ + "dataset" + ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -284,8 +127,10 @@ "id": "A_oLinbVuRii", "outputId": "06ccf7fc-f134-40e3-df2d-8c4957d9eb48" }, + "execution_count": null, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "DatasetDict({\n", @@ -304,18 +149,22 @@ "})" ] }, - "execution_count": 4, "metadata": {}, - "output_type": "execute_result" + "execution_count": 4 } - ], - "source": [ - "dataset" ] }, { "cell_type": "code", - "execution_count": 5, + "source": [ + "# Define label names and create label mappings\n", + "label_list = dataset[\"train\"].features[\"ner_tags\"].feature.names\n", + "label_to_id = {label: i for i, label in enumerate(label_list)}\n", + "id_to_label = {i: label for i, label in enumerate(label_list)}\n", + "\n", + "print(f\"Labels: {label_list}\")\n", + 
"print(f\"Number of labels: {len(label_list)}\")" + ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -323,99 +172,20 @@ "id": "eG81u-sOuRkp", "outputId": "99b2a3dc-af98-45d0-fcf8-baefceaf6208" }, + "execution_count": null, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']\n", "Number of labels: 9\n" ] } - ], - "source": [ - "# Define label names and create label mappings\n", - "label_list = dataset[\"train\"].features[\"ner_tags\"].feature.names\n", - "label_to_id = {label: i for i, label in enumerate(label_list)}\n", - "id_to_label = {i: label for i, label in enumerate(label_list)}\n", - "\n", - "print(f\"Labels: {label_list}\")\n", - "print(f\"Number of labels: {len(label_list)}\")" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 136, - "referenced_widgets": [ - "5caa301d730a4c6f8d4167c62d103995", - "1e52915ef25044b5aeaafb1b3993de2a", - "a3f433e7b9bb4644bfadf5a2e2b80b0f", - "2f535a1c264b463ea3351e5028f5a730", - "f334d38b096b48b4988b3a52489380a5", - "8e85ca027e3b4a36bccb6b00eb1d7610", - "c9be4797e433412f8ded18269ec79a15", - "b33b1828dddb42efb336a03be1986bab", - "b1b42446094e4d90bd39ec7aff2e834d", - "2f10a56770a64bbb9be656b0d441dbff", - "03ff4d8e1baa4db19cc25b5d25169e3a", - "bf13b28f699e421096410756130ee0cb", - "1c2543cacd02423d8d4fe803e5d21551", - "2218fb69a65f487b9e77cf8b9d17531c", - "614d0ebfe2ae49758d3fe1290414425c", - "0757ab90a27e46318c23a37524835da1", - "a0d384b013444141bc42b0fbfabf36b9", - "3856360d462c43dfa238385af2461ff1", - "40b5ecc6c9a84c08a2b90f1479a5a6a8", - "60ad7173d8c14324a90377c28e0feb4c", - "e5821959b1ac49fd89caf3c956c3379f", - "2fafb9e4e7dc49d1ae0d054abe58d862" - ] - }, - "id": "1gxMM1yZuRms", - "outputId": "a421fab5-59b0-4faa-a12d-3ffbc554a7bd" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5caa301d730a4c6f8d4167c62d103995", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "config.json: 0.00B [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bf13b28f699e421096410756130ee0cb", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "model.safetensors: 0%| | 0.00/599M [00:00" + ], "text/html": [ "\n", "
\n", @@ -738,34 +464,29 @@ " \n", " \n", "

" - ], - "text/plain": [ - "" ] }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { + "output_type": "execute_result", "data": { "text/plain": [ "TrainOutput(global_step=2634, training_loss=0.10961505901931178, metrics={'train_runtime': 668.867, 'train_samples_per_second': 62.977, 'train_steps_per_second': 3.938, 'total_flos': 1577408010395238.0, 'train_loss': 0.10961505901931178, 'epoch': 3.0})" ] }, - "execution_count": 40, "metadata": {}, - "output_type": "execute_result" + "execution_count": 40 } - ], - "source": [ - "# Train the model\n", - "print(\"Starting training...\")\n", - "trainer.train()" ] }, { "cell_type": "code", - "execution_count": 41, + "source": [ + "# Save the model\n", + "trainer.save_model()\n", + "tokenizer.save_pretrained(\"./modernbert-ner\")" + ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -773,8 +494,10 @@ "id": "1rwVhSWyymtS", "outputId": "c5b27a01-1c4c-4f3d-b511-144fc040075a" }, + "execution_count": null, "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "('./modernbert-ner/tokenizer_config.json',\n", @@ -782,20 +505,14 @@ " './modernbert-ner/tokenizer.json')" ] }, - "execution_count": 41, "metadata": {}, - "output_type": "execute_result" + "execution_count": 41 } - ], - "source": [ - "# Save the model\n", - "trainer.save_model()\n", - "tokenizer.save_pretrained(\"./modernbert-ner\")" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -806,7 +523,11 @@ }, "outputs": [ { + "output_type": "display_data", "data": { + "text/plain": [ + "" + ], "text/html": [ "\n", "

\n", @@ -815,17 +536,13 @@ " [204/204 00:16]\n", "
\n", " " - ], - "text/plain": [ - "" ] }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} }, { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Test results: {'eval_loss': 0.04691711440682411, 'eval_accuracy': 0.9892527549550251, 'eval_f1': 0.9363408521303258, 'eval_runtime': 16.8501, 'eval_samples_per_second': 192.877, 'eval_steps_per_second': 12.107, 'epoch': 3.0}\n", "Training completed!\n" @@ -842,7 +559,27 @@ }, { "cell_type": "code", - "execution_count": 59, + "source": [ + "# Inference on new text\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "model.to(device)\n", + "test_text = \"Amazon founder Jeff Bezos, sourish and Tesla CEO Elon Musk attended the World Economic Forum in Davos, Switzerland, where they discussed space exploration with NASA administrator Bill Nelson and European Space Agency director Josef Aschbacher.\"\n", + "inputs = tokenizer(test_text, return_tensors=\"pt\", truncation=True, padding=True)\n", + "inputs = {k: v.to(device) for k, v in inputs.items()} # Move inputs to same device as model\n", + "with torch.no_grad():\n", + " outputs = model(**inputs)\n", + " predictions = torch.argmax(outputs.logits, dim=2)\n", + " tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"][0])\n", + " predicted_labels = [id_to_label[pred.item()] for pred in predictions[0]]\n", + "\n", + " # Better aligned output\n", + " print(f\"\\nInput text: {test_text}\")\n", + " print(\"-\" * 50)\n", + " print(f\"{'Token':<15} {'Label':<10}\")\n", + " print(\"-\" * 50)\n", + " for token, label in zip(tokens, predicted_labels):\n", + " print(f\"{token:<15} {label:<10}\")" + ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -850,10 +587,11 @@ "id": "jyCvDUGe8D14", "outputId": "153b738f-87be-4aba-946f-b4161f5e2d86" }, + "execution_count": null, "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "\n", "Input text: Amazon founder Jeff Bezos, sourish and Tesla CEO Elon Musk attended the World Economic Forum in Davos, Switzerland, where they discussed space exploration with NASA administrator Bill Nelson and European Space Agency director Josef Aschbacher.\n", @@ -911,194 +649,21 @@ "[SEP] I-ORG \n" ] } - ], - "source": [ - "# Inference on new text\n", - "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", - "model.to(device)\n", - "test_text = \"Amazon founder Jeff Bezos, sourish and Tesla CEO Elon Musk attended the World Economic Forum in Davos, Switzerland, where they discussed space exploration with NASA administrator Bill Nelson and European Space Agency director Josef Aschbacher.\"\n", - "inputs = tokenizer(test_text, return_tensors=\"pt\", truncation=True, padding=True)\n", - "inputs = {k: v.to(device) for k, v in inputs.items()} # Move inputs to same device as model\n", - "with torch.no_grad():\n", - " outputs = model(**inputs)\n", - " predictions = torch.argmax(outputs.logits, dim=2)\n", - " tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"][0])\n", - " predicted_labels = [id_to_label[pred.item()] for pred in predictions[0]]\n", - "\n", - " # Better aligned output\n", - " print(f\"\\nInput text: {test_text}\")\n", - " print(\"-\" * 50)\n", - " print(f\"{'Token':<15} {'Label':<10}\")\n", - " print(\"-\" * 50)\n", - " for token, label in zip(tokens, predicted_labels):\n", - " print(f\"{token:<15} {label:<10}\")" ] }, { "cell_type": "code", - "execution_count": null, + "source": [ + "# ignore first and last 
special characters" + ], "metadata": { "id": "-hLU3syz8hbO" }, - "outputs": [], - "source": [ - "# ignore first and last special characters" - ] + "execution_count": null, + "outputs": [] }, { "cell_type": "code", - "execution_count": 60, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 232, - "referenced_widgets": [ - "552808a885b147f9b945f4fb661b3c94", - "d06d2c1c2e7b4d98b8464dd452e373a5", - "48f27f37f3c64a1d8b1e4679402320d7", - "e1733124803e48f398f8a7951ec972f7", - "0cd18b7563794dd2aded8fea834c9b58", - "572c2b24bcda4a33ad8ee3cb45bac2e2", - "8e0f7aee23d243f9a1b20d299d0075c4", - "b273795dced4410f936e5cfdcf288cfe", - "d9ea2a76fa1a45ed871cda69a72527c2", - "6fddd60c04434dbcaf8905359bae436e", - "3c460bd2ed144a49a0bfbe5a99ba8dd7", - "c7f801b792e946d8b71d0c5f1bd3230d", - "f1a30858927f408f8f7331c0d842b221", - "9c2b1fd350ad47a698329ed3e042cbd3", - "82a1ebbb56fd42739546c47ccad411b6", - "103f36fc42ad4db0b2588db45e2a5e83", - "220018db6bce4a2b99804df65ccbbf51", - "cf5ceb2ec5404b83ab794e358b1acc8d", - "e2f09149d382493cb1549972f5c5f314", - "509aea4f1e2f4b9c94f328f70328dae8", - "b555119b90994539aa97af2992f09f53", - "8b1ff1a1f9254e7bbbe34ea9be5ec97f", - "bcaa9e2e52064479bd21047c4dd1a7a6", - "29efbd23d32a4bea96f84de9143c3eaf", - "e3c751edbfe94ce08d44f66b42874eaf", - "ca8c6bdb436e4f958fbb9c5597f3c11f", - "6734bb636eaf40db881f0ed6655f50b2", - "c0c74f0d5e644aa992d13fc27b230f9a", - "f298b7e1c2f142ef8e1b16004331dc3b", - "bc2544a2ca0f4c738fc5c4601feb1be2", - "3589800cdafc489996d69cf101dca889", - "373253dd14cc438eb7e24bf7b830c5c3", - "27cb4b02c95345028658a8b7853b548c", - "94ac03f6736e441397e84430c37d6bd4", - "2b040bfe6b2340e78a4fa4fcecd13791", - "0d5a01c067f74604ba4166560367eb55", - "ef5177a4b729434bb4c6aa588997f8f2", - "e32dc3643bea4a86bc351905ac4a4cb8", - "a9aaf60aabf24f69a9bc06c0e2fcb09a", - "0d72626fb9cf4acaadfcab09b055f8de", - "0ce5c2a63c59432caa957765bcfdc8d3", - "fef7129b003b4762966928dfc5ddf5a6", - "ce2c0ab9d08f433eab8f0f06011c211d", - "75b8e9fbdad44b19854ec2cf528184e1", - "746f3beb4c29469096262b7ea514ec21", - "289059cebe834a75b0ae67c54d4ebce4", - "e4645f02e5b24c368f15863b88336426", - "86efa3c5153b4ef1be290540ba96e14e", - "ea4e3cae81834495b90493266ffa1f13", - "109c6adae1f04a2e971a3c2fd4062af8", - "27450f4ff4154947a8d4d82354476200", - "b16343a3385f425192bb498fa2666f81", - "4889aaab980041b9a559c6ebd23049e5", - "aa8ab04168be4473a4141f1b91ccbf25", - "b4e44aeb79b14a5797258757ee13134d" - ] - }, - "id": "SB4RTqAN-27M", - "outputId": "5f799cc4-61c3-44f3-d80c-1659ead5ffa1" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "552808a885b147f9b945f4fb661b3c94", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "tokenizer_config.json: 0.00B [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c7f801b792e946d8b71d0c5f1bd3230d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "tokenizer.json: 0.00B [00:00, ?B/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bcaa9e2e52064479bd21047c4dd1a7a6", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "special_tokens_map.json: 0%| | 0.00/694 [00:00