diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb deleted file mode 100644 index 7fec515..0000000 --- a/.ipynb_checkpoints/Untitled-checkpoint.ipynb +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cells": [], - "metadata": {}, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/BART/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/BART/.ipynb_checkpoints/Untitled-checkpoint.ipynb deleted file mode 100644 index 7fec515..0000000 --- a/BART/.ipynb_checkpoints/Untitled-checkpoint.ipynb +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cells": [], - "metadata": {}, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/BART/.ipynb_checkpoints/Untitled1-checkpoint.ipynb b/BART/.ipynb_checkpoints/Untitled1-checkpoint.ipynb deleted file mode 100644 index 7fec515..0000000 --- a/BART/.ipynb_checkpoints/Untitled1-checkpoint.ipynb +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cells": [], - "metadata": {}, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/BART/Untitled.ipynb b/BART/Untitled.ipynb deleted file mode 100644 index f0dab22..0000000 --- a/BART/Untitled.ipynb +++ /dev/null @@ -1,203 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Optional\n", - "\n", - "import torch\n", - "import torch.nn as nn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class MultiHeadAttention(nn.Module):\n", - "    \n", - "    def __init__(\n", - "        self,\n", - "        d_model: int,\n", - "        num_heads: int,\n", - "        dropout: float = 0.0,\n", - "        is_decoder: bool = False,\n", - "        bias: bool = True\n", - "    ):\n", - "        super().__init__()\n", - "        self.d_model = d_model\n", - "        self.num_heads = num_heads\n", - "        self.head_dim = d_model // num_heads\n", - "        self.scaling = self.head_dim ** -0.5\n", - "        self.dropout = dropout\n", - "        self.is_decoder = is_decoder\n", - "        self.W_q = nn.Linear(d_model, d_model, bias)\n", - "        self.W_k = nn.Linear(d_model, d_model, bias)\n", - "        self.W_v = nn.Linear(d_model, d_model, bias)\n", - "        self.W_o = nn.Linear(d_model, d_model, bias)\n", - "    \n", - "    def forward(\n", - "        self, \n", - "        hidden_states: torch.Tensor, \n", - "        encoder_hidden_states: torch.Tensor = None,\n", - "        attention_mask: torch.Tensor = None,\n", - "    ):\n", - "        # cross-attention uses the encoder states as the key/value source, self-attention uses hidden_states\n", - "        kv_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states\n", - "        bsz, tgt_len, _ = hidden_states.size()\n", - "        src_len = kv_states.size(1)\n", - "        # project and split into heads: [bsz, num_heads, seq_len, head_dim]\n", - "        q = self.W_q(hidden_states).view(bsz, tgt_len, self.num_heads, self.head_dim).transpose(1, 2)\n", - "        k = self.W_k(kv_states).view(bsz, src_len, self.num_heads, self.head_dim).transpose(1, 2)\n", - "        v = self.W_v(kv_states).view(bsz, src_len, self.num_heads, self.head_dim).transpose(1, 2)\n", - "        attn_weights = torch.matmul(q, k.transpose(-1, -2)) * self.scaling\n", - "        if attention_mask is not None:\n", - "            attn_weights = attn_weights + attention_mask\n", - "        attn_probs = nn.functional.dropout(attn_weights.softmax(dim=-1), p=self.dropout, training=self.training)\n", - "        attn_output = torch.matmul(attn_probs, v).transpose(1, 2).reshape(bsz, tgt_len, self.d_model)\n", - "        return self.W_o(attn_output)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):\n", - "    \"\"\"\n", - "    Make causal mask used for bi-directional self-attention.\n", - "    \"\"\"\n", - "    bsz, tgt_len = input_ids_shape\n", - "    mask = torch.full((tgt_len, tgt_len), float(\"-inf\"))\n", - "    mask_cond = torch.arange(mask.size(-1))\n", - "    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)\n", - "    mask = mask.to(dtype)\n", - "\n", - "    if past_key_values_length > 0:\n", - "        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)\n", - "    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):\n", - "    \"\"\"\n", - "    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.\n", - "    \"\"\"\n", - "    bsz, src_len = mask.size()\n", - "    tgt_len = tgt_len if tgt_len is not None else src_len\n", - "\n", - "    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)\n", - "\n", - "    inverted_mask = 1.0 - expanded_mask\n", - "\n", - "    return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)" - ] - }, - { - "cell_type": 
"code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import BartModel, BartConfig" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "config = BartConfig()\n", - "bart = BartModel(config)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config.pad_token_id" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "attention_mask, input_ids = bart.dummy_inputs.values()" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-3.4028234663852886e+38\n" - ] - }, - { - "data": { - "text/plain": [ - "tensor([[[[ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]]],\n", - "\n", - "\n", - " [[[ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.4028e+38],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.4028e+38],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.4028e+38],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.4028e+38],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.4028e+38],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.4028e+38],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.4028e+38]]]])" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bsz, src_len = attention_mask.size()\n", - "tgt_len = 7\n", - "dtype = torch.nn.Embedding(10000, 10)(input_ids).dtype\n", - "expand_mask = attention_mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len)\n", - "inverted_mask = 1.0 - expand_mask.to(dtype)\n", - "print(torch.finfo(dtype).min)\n", - "inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "basic", - "language": "python", - "name": "basic" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/BART/Untitled1.ipynb b/BART/Untitled1.ipynb deleted file mode 100644 index 04ddbbc..0000000 --- a/BART/Untitled1.ipynb +++ /dev/null @@ -1,2310 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# BARTModel 분석" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import math\n", - "import random\n", - "import warnings\n", - "from typing import Any, Dict, Tuple, Union, Optional, List\n", - "\n", - "import numpy as np\n", - "from overrides import overrides\n", - "\n", - "import torch\n", - "import torch.nn as nn\n", - "import 
torch.utils.checkpoint\n", - "import torch.nn.functional as F\n", - "\n", - "import transformers\n", - "from transformers import BartConfig, BartTokenizer, BartModel\n", - "from transformers.models.bart.modeling_bart import BartEncoder, BartDecoder\n", - "from transformers.utils import logging\n", - "from transformers.modeling_utils import PreTrainedModel" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.__version__ == 1.7.1+cu110\n", - "4.2.1\n", - "True\n" - ] - } - ], - "source": [ - "print(f\"torch.__version__ == {torch.__version__}\")\n", - "print(transformers.__version__)\n", - "print(torch.cuda.is_available())" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "config = BartConfig()\n", - "bart = BartModel(config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BartModel.\\_\\_init\\_\\_\n", - "- BartPretrainedModel을 상속받음" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(1, 50265)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.config.pad_token_id, bart.config.vocab_size" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Embedding(50265, 1024, padding_idx=1)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.shared # torch.nn.Embedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- `BartPretrainedModel`의 init_weight 메서드 실시, 뜯어보자\n", - "- `nn.Linear`, `nn.Embedding`의 경우 config의 std로 초기값 조정" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "class BartPretrainedModel(PreTrainedModel):\n", - " config_class = BartConfig\n", - " base_model_prefix = \"model\"\n", - "\n", - " def _init_weights(self, module):\n", - " std = self.config.init_std\n", - " if isinstance(module, nn.Linear):\n", - " module.weight.data.normal_(mean=0.0, std=std)\n", - " if module.bias is not None:\n", - " module.bias.data.zero_()\n", - " elif isinstance(module, BartSinusoidalPositionalEmbedding):\n", - " pass\n", - " elif isinstance(module, nn.Embedding):\n", - " module.weight.data.normal_(mean=0.0, std=std)\n", - " if module.padding_idx is not None:\n", - " module.weight.data[module.padding_idx].zero_()\n", - "\n", - " @property\n", - " def dummy_inputs(self):\n", - " pad_token = self.config.pad_token_id\n", - " input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)\n", - " dummy_inputs = {\n", - " \"attention_mask\": input_ids.ne(pad_token),\n", - " \"input_ids\": input_ids,\n", - " }\n", - " return dummy_inputs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">### BartEncoder\n", - ">- `BartPretrainedModel` 객체를 동일하게 상속받음\n", - ">### BartEncoder.\\_\\_init\\_\\_" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "dropout: 0.1,\n", - "layerdrop: 0.0\n", - "embed_dim: 1024,\n", - "embed_scale: 1.0,\n", - "padding_idx: 1,\n", - "max_source_positions: 1024\n" - ] - } - ], - "source": [ - "print(f\"\"\"\n", - "dropout: 
{bart.encoder.dropout},\n", - "layerdrop: {bart.config.encoder_layerdrop}\n", - "embed_dim: {bart.config.d_model},\n", - "embed_scale: {math.sqrt(bart.config.d_model) if config.scale_embedding else 1.0},\n", - "padding_idx: {bart.config.pad_token_id},\n", - "max_source_positions: {bart.config.max_position_embeddings}\n", - "\"\"\".strip())" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Embedding(50265, 1024, padding_idx=1)" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.encoder.embed_tokens # __init__에서 받아올 수 있음" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "False\n" - ] - }, - { - "data": { - "text/plain": [ - "BartLearnedPositionalEmbedding(1026, 1024, padding_idx=1)" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(bart.config.static_position_embeddings) # 21.01.06 commit으로 삭제!\n", - "bart.encoder.embed_positions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- config.static_position_embeddings에 따라 어떤 객체를 사용할지 갈림\n", - " - if True, `BartSinusoidalPositionalEmbedding`\n", - " - else: `BartLearnedPositionalEmbedding`\n", - "- config.encoder_layers의 수만큼 EncoderLayer를 쌓음" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "12" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config.encoder_layers" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "BartEncoderLayer(\n", - " (self_attn): BartAttention(\n", - " (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " )\n", - " (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", - " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", - " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", - " (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", - ")" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.encoder.layers[0] # 이 layer를 12개 쌓음" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- layernorm을 어떻게 적용할지 보자" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n" - ] - } - ], - "source": [ - "if config.normalize_embedding:\n", - " print(BartLayerNorm(embed_dim))\n", - "else:\n", - " print(nn.Identity())" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "None\n" - ] - } - ], - "source": [ - "if config.add_final_layer_norm:\n", - " print(BartLayerNorm(config.d_model))\n", - "else:\n", - " print(None)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - 
"metadata": {}, - "outputs": [], - "source": [ - "def BartLayerNorm(\n", - " normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True\n", - "):\n", - " if torch.cuda.is_available():\n", - " try:\n", - " from apex.normalization import FusedLayerNorm\n", - "\n", - " return FusedLayerNorm(normalized_shape, eps, elementwise_affine)\n", - " except ImportError:\n", - " pass\n", - " return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)" - ] - }, - { - "cell_type": "code", - "execution_count": 171, - "metadata": {}, - "outputs": [], - "source": [ - "hidden_states= torch.randn(2, 5, 1024)" - ] - }, - { - "cell_type": "code", - "execution_count": 172, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[-0.7331, -0.4191, 0.7951, ..., 0.4873, -0.5494, -0.2944],\n", - " [-0.7643, 1.8104, -0.0323, ..., -0.4546, 0.5776, -0.7373],\n", - " [-1.1619, 1.9948, 0.4805, ..., -1.0691, -0.7803, 0.6411],\n", - " [ 0.0236, 0.1118, -0.2880, ..., -1.5818, 0.1992, -0.9446],\n", - " [ 0.3735, -1.4478, 0.8767, ..., 1.2091, -0.4567, 0.4698]],\n", - "\n", - " [[ 0.4276, -1.4758, 0.0165, ..., 1.9631, -0.1555, -1.0019],\n", - " [ 0.6768, -0.3537, 0.9676, ..., -1.3469, -0.1781, 1.4861],\n", - " [ 0.5480, -1.0024, 0.4656, ..., 0.5370, -0.4840, 0.0959],\n", - " [-1.5319, 0.8093, -0.3881, ..., -0.5653, -0.3972, 0.5072],\n", - " [-1.0625, 0.7415, -0.8247, ..., 0.1221, -0.1593, -0.0284]]],\n", - " grad_fn=)" - ] - }, - "execution_count": 172, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "nn.LayerNorm(1024)(hidden_states)" - ] - }, - { - "cell_type": "code", - "execution_count": 173, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[-0.7331, -0.4191, 0.7951, ..., 0.4873, -0.5494, -0.2944],\n", - " [-0.7643, 1.8104, -0.0323, ..., -0.4546, 0.5776, -0.7373],\n", - " [-1.1619, 1.9948, 0.4805, ..., -1.0691, -0.7803, 0.6411],\n", - " [ 0.0236, 0.1118, -0.2880, ..., -1.5819, 0.1992, -0.9446],\n", - " [ 0.3735, -1.4478, 0.8767, ..., 1.2091, -0.4567, 0.4698]],\n", - "\n", - " [[ 0.4276, -1.4758, 0.0165, ..., 1.9631, -0.1555, -1.0019],\n", - " [ 0.6768, -0.3537, 0.9676, ..., -1.3469, -0.1781, 1.4862],\n", - " [ 0.5480, -1.0024, 0.4656, ..., 0.5370, -0.4840, 0.0959],\n", - " [-1.5319, 0.8093, -0.3881, ..., -0.5653, -0.3972, 0.5072],\n", - " [-1.0625, 0.7415, -0.8247, ..., 0.1221, -0.1593, -0.0284]]])" - ] - }, - "execution_count": 173, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mean = hidden_states.mean(dim=-1)\n", - "mean = mean[:, :, None].expand((*mean.size(), 1024))\n", - "std = hidden_states.std(dim=-1, unbiased=False)\n", - "std = std[:, :, None].expand((*std.size(), 1024))\n", - "\n", - "(hidden_states - mean) / std" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">### BartEncoder.forward\n", - ">#### Ch0. forward의 input\n", - ">```python\n", - "input_ids = None,\n", - "attention_mask = None,\n", - "inputs_embeds = None,\n", - "output_attentions = None,\n", - "output_hidden_states = None,\n", - "return_dict = None,\n", - ">```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">#### Ch1. 
config으로 input setting" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "False\n", - "(False,)\n", - "True\n" - ] - } - ], - "source": [ - "print(config.output_attentions)\n", - "print((config.output_hidden_states,))\n", - "print(config.use_return_dict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">- input을 아래의 코드로 처리\n", - ">```python\n", - "\\# retrieve input_ids and inputs_embeds\n", - "if input_ids is not None and inputs_embeds is not None:\n", - " raise ValueError(\"You cannot specify both input_ids and inputs_embeds at the same time\")\n", - "elif input_ids is not None:\n", - " input_shape = input_ids.size()\n", - " input_ids = input_ids.view(-1, input_shape[-1])\n", - "elif inputs_embeds is not None:\n", - " input_shape = inputs_embeds.size()[:-1]\n", - "else:\n", - " raise ValueError(\"You have to specify either input_ids or inputs_embeds\")\n", - "if inputs_embeds is None:\n", - " inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale\n", - ">```" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "BartLearnedPositionalEmbedding(1026, 1024, padding_idx=1)" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.encoder.embed_positions" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([5, 1024])" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.encoder.embed_positions(bart.dummy_inputs['input_ids'].size()).size()" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 5])" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">#### 아래의 과정으로 처리\n", - ">- input을 embedding (주어진 경우는 그냥 넘어감)\n", - ">- input_shape으로 position vector 얻음\n", - ">- hidden_states를 input_embeds + embeds_pos로 계산\n", - ">- hidden_states를 LayerNorm해주고 Dropout 실시\n", - ">- attention_mask가 None이 아니면 아래 코드로 expand (**디코더랑 처리가 조금 다름!**)\n", - ">- output_hidden_states, output_attentions이 None이 아니면,\n", - " - () tuple을 주고 None이면 그냥 None" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]=None):\n", - " bsz, src_len = mask.size()\n", - " tgt_len = tgt_len if tgt_len is not None else src_len\n", - " expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)\n", - " inverted_mask = 1.0 - expanded_mask\n", - " return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">- 그리고 나선, BartEncoderLayer별로 아래의 연산을 수행\n", - ">```python\n", - "for encoder_layer in self.layers:\n", - " if output_hidden_states:\n", - " encoder_states = encoder_states + (hidden_states,)\n", - " # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)\n", - " dropout_probability = random.uniform(0, 1)\n", - " if self.training and (dropout_probability < self.layerdrop): # skip the layer\n", - " attn = None\n", - " else:\n", - " 
hidden_states, attn = encoder_layer(\n", - " hidden_states, \n", - " attention_mask, \n", - " output_attentions=output_attentions\n", - " )\n", - " if output_attentions:\n", - " all_attentions = all_attentions + (attn,)\n", - ">```\n", - ">- 그 다음, layer_norm이 None이 아니면 hidden_states를 layer normalization\n", - ">- output_hidden_states가 None이 아니면, encoder_states에 (hidden_states,)를 더해줌\n", - ">- return_dict가 True인지 False인지에 따라 출력 결과물이 달라짐\n", - " - `False`: \n", - " ```python \n", - " tuple(\n", - " v for v in [\n", - " hidden_states, encoder_states, all_attentions\n", - " ] if v is not None\n", - " )\n", - " ```\n", - " - `True`:\n", - " ```python\n", - " BaseModelOutput(\n", - " last_hidden_state=hidden_states, \n", - " hidden_states=encoder_states, \n", - " attentions=all_attentions\n", - " )\n", - " ```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">### BartDecoder\n", - ">- `BartPretrainedModel` 객체를 동일하게 상속받음\n", - ">### BartDecoder.\\_\\_init\\_\\_" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "dropout: 0.1\n", - "layerdrop: 0.0\n", - "embed_dim: 1024,\n", - "embed_scale: 1.0,\n", - "padding_idx: 1,\n", - "max_source_positions: 1024\n" - ] - } - ], - "source": [ - "dropout = config.dropout\n", - "layerdrop = config.decoder_layerdrop\n", - "\n", - "embed_dim = config.d_model\n", - "embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0\n", - "padding_idx = config.pad_token_id\n", - "max_source_positions = config.max_position_embeddings\n", - "\n", - "print(f\"\"\"\n", - "dropout: {dropout}\n", - "layerdrop: {layerdrop}\n", - "embed_dim: {embed_dim},\n", - "embed_scale: {embed_scale},\n", - "padding_idx: {padding_idx},\n", - "max_source_positions: {max_source_positions}\n", - "\"\"\".strip())" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Decoder 차별점\n", - "do_blenderbot_90_layernorm = config.do_blenderbot_90_layernorm # layernorm variant\n", - "do_blenderbot_90_layernorm" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [], - "source": [ - "embed_tokens: Optional[nn.Embedding] = None\n", - "\n", - "# None이면\n", - "embed_tokens = nn.Embedding(config.vocab_size, embed_dim, padding_idx)\n", - "\n", - "# None이 아니면\n", - "embed_tokens = embed_tokens" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- config.static_position_embeddings에 따라 어떤 객체를 사용할지 갈림\n", - " - if True, `BartSinusoidalPositionalEmbedding`\n", - " - else: `BartLearnedPositionalEmbedding`\n", - "- config.encoder_layers의 수만큼 EncoderLayer를 쌓음" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "12" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config.decoder_layers" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "BartDecoderLayer(\n", - " (self_attn): BartAttention(\n", - " (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (v_proj): Linear(in_features=1024, 
out_features=1024, bias=True)\n", - " (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " )\n", - " (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", - " (encoder_attn): BartAttention(\n", - " (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " )\n", - " (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", - " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", - " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", - " (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", - ")" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.decoder.layers[0] # 이 layer를 12개 쌓음" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n" - ] - } - ], - "source": [ - "if config.normalize_embedding:\n", - " print(BartLayerNorm(embed_dim)) # config.d_model과 동일\n", - "else:\n", - " print(nn.Identity())" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "None\n" - ] - } - ], - "source": [ - "if config.add_final_layer_norm:\n", - " print(BartLayerNorm(config.d_model))\n", - "else:\n", - " print(None)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">### BartDecoder.forward\n", - ">#### Ch0. 
forward의 input\n", - ">```python\n", - "input_ids=None,\n", - "attention_mask=None,\n", - "encoder_hidden_states=None,\n", - "encoder_attention_mask=None,\n", - "past_key_values=None,\n", - "inputs_embeds=None,\n", - "use_cache=None,\n", - "output_attentions=None,\n", - "output_hidden_states=None,\n", - "return_dict=None,\n", - ">```" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "False\n", - "(False,)\n", - "True\n", - "True\n" - ] - } - ], - "source": [ - "print(config.output_attentions)\n", - "print((config.output_hidden_states,))\n", - "print(config.use_cache) # 차이점\n", - "print(config.use_return_dict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">- input을 아래의 코드로 처리\n", - ">```python\n", - "\\# retrieve input_ids and inputs_embeds\n", - "if input_ids is not None and inputs_embeds is not None:\n", - " raise ValueError(\"You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time\")\n", - "elif input_ids is not None:\n", - " input_shape = input_ids.size()\n", - " input_ids = input_ids.view(-1, input_shape[-1])\n", - "elif inputs_embeds is not None:\n", - " input_shape = inputs_embeds.size()[:-1]\n", - "else:\n", - " raise ValueError(\"You have to specify either decoder_input_ids or decoder_inputs_embeds\")\n", - "if inputs_embeds is None:\n", - " inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale\n", - ">```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Decoder 차이점" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# past_key_values_length\n", - "past_key_values = None\n", - "past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0\n", - "past_key_values_length" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Decoder 차이점" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [], - "source": [ - "attention_mask = bart.dummy_inputs['attention_mask']\n", - "input_shape = bart.dummy_inputs['input_ids'].size()\n", - "input_ids = bart.dummy_inputs['input_ids'].view(-1, input_shape[-1])\n", - "inputs_embeds = embed_tokens(input_ids) * embed_scale " - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [], - "source": [ - "def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):\n", - " \"\"\"\n", - " Make causal mask used for bi-directional self-attention.\n", - " \"\"\"\n", - " bsz, tgt_len = input_ids_shape\n", - " mask = torch.full((tgt_len, tgt_len), float(\"-inf\"))\n", - " mask_cond = torch.arange(mask.size(-1))\n", - " mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)\n", - " mask = mask.to(dtype)\n", - "\n", - " if past_key_values_length > 0:\n", - " mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)\n", - " return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [], - "source": [ - "# Attentoin Mask 처리\n", - "\n", - "# Create causal mask\n", - "# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, srxc_seq_len]\n", - 
"combined_attention_mask = None\n", - "if input_shape[-1] > 1: # 걍 무조건 하는거나 다름없음\n", - " combined_attention_mask = _make_causal_mask(\n", - " input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 1, 5, 5])" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "combined_attention_mask.size()" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [], - "source": [ - "# create decoder_padding_mask if not provided and needed\n", - "# 4.12.20 (PVP): Not a fan of this \"magical\" function that\n", - "# automatically creates attention_mask for padded tokens\n", - "# => this is inconsistent with other models\n", - "# => Pegasus uses the pad_token as decoder_start_token_id, so that this could\n", - "# pose some problems.\n", - "if (\n", - " attention_mask is None\n", - " and input_ids is not None\n", - " and input_shape[-1] > 1\n", - " and config.pad_token_id in input_ids\n", - "):\n", - " # should be kept for backwards compatibility\n", - " attention_mask = input_ids.ne(config.pad_token_id).to(torch.long)\n", - " # never mask leading token, even if it is pad\n", - " attention_mask[:, 0] = attention_mask[:, 1]\n", - " if past_key_values_length > 0:\n", - " attention_mask = torch.cat(\n", - " [\n", - " torch.ones(\n", - " (input_shape[0], past_key_values_length), dtype=torch.long, device=input_ids.device\n", - " ),\n", - " attention_mask,\n", - " ],\n", - " dim=-1,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 92, - "metadata": {}, - "outputs": [], - "source": [ - "res = bart.encoder(**bart.dummy_inputs, return_dict=True, output_attentions=True, output_hidden_states=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 16, 5, 5])" - ] - }, - "execution_count": 97, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "res.attentions[-1].size()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# expand encoder attention mask\n", - "encoder_hidden_states = None\n", - "encoder_attention_mask = None\n", - "\n", - "# BartModel에서 encoder의 결과값을 Decoder에 넣어줌!\n", - "if encoder_hidden_states is not None and encoder_attention_mask is not None:\n", - " # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]\n", - " encoder_attention_mask = _expand_mask(\n", - " encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])\n", - "else:\n", - " print('지금은 None!')" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [], - "source": [ - "if attention_mask is not None and combined_attention_mask is not None:\n", - " # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]\n", - " combined_attention_mask = combined_attention_mask + _expand_mask(\n", - " attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">#### 아래의 과정으로 처리! 
(Encoder와 유사)\n", - ">- input_shape으로 position vector 얻음, past_key_values_length도 넣어줌\n", - ">\n", - ">#### Decoder에서 다른 점!\n", - ">- do_blenderbot_90_layernorm이 True가 아니라면 Encoder와 동일하게 계산\n", - " - hidden_states를 input_embeds + embeds_pos로 계산\n", - " - hidden_states를 LayerNorm해줌\n", - ">- do_blenderbot_90_layernorm가 True면\n", - " - inputs_embeds를 LayerNorm해주고 (이 결과값이 hidden_states)\n", - " - hidden_states에 embeds_pos를 더해줌\n", - ">- 이 후, Dropout\n", - ">- 그리고 나서 아래 값들에 대해 Tuple을 할당\n", - ">```python\n", - "all_hidden_states = () if output_hidden_states else None\n", - "all_self_attns = () if output_attentions else None\n", - "all_cross_attentions = () if output_attentions else None\n", - "next_decoder_cache = () if use_cache else None\n", - ">```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">- 그리고 나선, BartDecoderLayer별로 아래의 연산을 수행\n", - ">```python\n", - "for idx, decoder_layer in enumerate(self.layers): # Add idx\n", - " # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)\n", - " if output_hidden_states:\n", - " # encoder_states = encoder_states + (hidden_states,) # Encoder\n", - " all_hidden_states += (hidden_states,)\n", - " dropout_probability = random.uniform(0, 1)\n", - " if self.training and (dropout_probability < self.layerdrop): # skip the layer\n", - " # attn = None\n", - " # Encoder에선 if output_attentions: 구문을 도는데\n", - " # Decoder에선 걍 continue\n", - " continue\n", - " hidden_states, layer_self_attn, present_key_value, layer_cross_attn = decoder_layer(\n", - " hidden_states, \n", - " attention_mask=combined_attention_mask, \n", - " encoder_hidden_states=encoder_hidden_states,\n", - " encoder_attention_mask=encoder_attention_mask,\n", - " past_key_value=past_key_value,\n", - " output_attentions=output_attentions,\n", - " )\n", - " # Decoder에서 추가된 부분\n", - " if use_cache:\n", - " next_decoder_cache += (present_key_value,)\n", - " if output_attentions:\n", - " # all_attentions = all_attentions + (attn,)\n", - " all_self_attns += (layer_self_attn,)\n", - " all_cross_attentions += (layer_cross_attn,)\n", - "if output_hidden_states: # add hidden states from the last decoder layer\n", - " all_hidden_states += (hidden_states,)\n", - ">```\n", - ">- output_hidden_states가 None이 아니면, all_hidden_states에 (hidden_states,)를 더해줌\n", - " - encoder_states였었음\n", - ">- 그 다음, layer_norm이 None이 아니면 hidden_states를 layer normalization\n", - ">- use_cache가 True면 next_decoder_cache를, 아니면 None을 next_cache에 할당\n", - ">- return_dict가 True인지 False인지에 따라 출력 결과물이 달라짐\n", - " - `False`: \n", - " ```python \n", - " tuple(\n", - " v for v in [\n", - " hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions\n", - " ] if v is not None\n", - " )\n", - " ```\n", - " - `True`:\n", - " ```python\n", - " BaseModelOutputWithPastAndCrossAttentions(\n", - " last_hidden_state=hidden_states,\n", - " past_key_values=next_cache,\n", - " hidden_states=all_hidden_states,\n", - " attentions=all_self_attns,\n", - " cross_attentions=all_cross_attentions,\n", - " )\n", - " ```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BartModel.forward" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ch0. 
forward의 input" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "input_ids = None\n", - "attention_mask = None\n", - "decoder_input_ids = None\n", - "decoder_attention_mask = None\n", - "encoder_outputs = None\n", - "past_key_values = None\n", - "inputs_embeds = None\n", - "decoder_inputs_embeds = None\n", - "use_cache = None\n", - "output_attentions = None\n", - "output_hidden_states = None\n", - "return_dict = None\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1. config으로 input setting" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1\n", - "False\n", - "False\n", - "True\n", - "True\n" - ] - } - ], - "source": [ - "p(config.pad_token_id)\n", - "p(config.output_attentions)\n", - "p(config.output_hidden_states)\n", - "p(config.use_cache)\n", - "p(config.use_return_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [], - "source": [ - "input_ids = bart.dummy_inputs['input_ids']" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 5, 1024])" - ] - }, - "execution_count": 113, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.encoder(input_ids).last_hidden_state.size()" - ] - }, - { - "cell_type": "code", - "execution_count": 125, - "metadata": {}, - "outputs": [], - "source": [ - "decoder_output = bart.decoder(\n", - " input_ids, encoder_hidden_states=bart.encoder(input_ids).last_hidden_state)" - ] - }, - { - "cell_type": "code", - "execution_count": 126, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "odict_keys(['last_hidden_state', 'past_key_values'])" - ] - }, - "execution_count": 126, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "decoder_output.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 127, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 5, 1024])" - ] - }, - "execution_count": 127, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "decoder_output.last_hidden_state.size()" - ] - }, - { - "cell_type": "code", - "execution_count": 143, - "metadata": {}, - "outputs": [], - "source": [ - "reconstruction = torch.randn(32, 10, 1024)\n", - "reconstruction = torch.softmax(reconstruction, dim=-1)\n", - "\n", - "clean = torch.LongTensor(32, 10).random_(10000)" - ] - }, - { - "cell_type": "code", - "execution_count": 145, - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Expected target size (32, 1024), got torch.Size([32, 10])", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mnn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mCrossEntropyLoss\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreconstruction\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mclean\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\basic\\lib\\site-packages\\torch\\nn\\modules\\module.py\u001b[0m in 
\u001b[0;36m_call_impl\u001b[1;34m(self, *input, **kwargs)\u001b[0m\n\u001b[0;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0minput\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 726\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 727\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0minput\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[0;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\basic\\lib\\site-packages\\torch\\nn\\modules\\loss.py\u001b[0m in \u001b[0;36mforward\u001b[1;34m(self, input, target)\u001b[0m\n\u001b[0;32m 960\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minput\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mTensor\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mTensor\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m->\u001b[0m \u001b[0mTensor\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 961\u001b[0m return F.cross_entropy(input, target, weight=self.weight,\n\u001b[1;32m--> 962\u001b[1;33m ignore_index=self.ignore_index, reduction=self.reduction)\n\u001b[0m\u001b[0;32m 963\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 964\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\basic\\lib\\site-packages\\torch\\nn\\functional.py\u001b[0m in \u001b[0;36mcross_entropy\u001b[1;34m(input, target, weight, size_average, ignore_index, reduce, reduction)\u001b[0m\n\u001b[0;32m 2466\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0msize_average\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mreduce\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2467\u001b[0m \u001b[0mreduction\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_Reduction\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlegacy_get_string\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msize_average\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreduce\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2468\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mnll_loss\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlog_softmax\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minput\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mignore_index\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreduction\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2469\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2470\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\basic\\lib\\site-packages\\torch\\nn\\functional.py\u001b[0m in \u001b[0;36mnll_loss\u001b[1;34m(input, target, weight, size_average, ignore_index, reduce, reduction)\u001b[0m\n\u001b[0;32m 2272\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0minput\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2273\u001b[0m raise ValueError('Expected target size {}, got {}'.format(\n\u001b[1;32m-> 2274\u001b[1;33m out_size, target.size()))\n\u001b[0m\u001b[0;32m 2275\u001b[0m \u001b[0minput\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minput\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcontiguous\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2276\u001b[0m \u001b[0mtarget\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcontiguous\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mValueError\u001b[0m: Expected target size (32, 1024), got torch.Size([32, 10])" - ] - } - ], - "source": [ - "nn.CrossEntropyLoss()(reconstruction, clean)" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{torch.Size([2, 16, 5, 64])}" - ] - }, - "execution_count": 130, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "set([v.size() for pkv in decoder_output.past_key_values for v in pkv])" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [], - "source": [ - "def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int):\n", - " \"\"\"\n", - " Shift input ids one token to the right, and wrap the last non pad token (usually ).\n", - " \"\"\"\n", - " prev_output_tokens = input_ids.clone()\n", - "\n", - " assert pad_token_id is not None, \"self.model.config.pad_token_id has to be defined.\"\n", - " # replace possible -100 values in labels by `pad_token_id`\n", - " prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id)\n", - "\n", - " index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)\n", - " decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze()\n", - " prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone()\n", - " prev_output_tokens[:, 0] = decoder_start_tokens\n", - "\n", - " return prev_output_tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[ 0, 6, 10, 4, 2],\n", - " [ 0, 8, 12, 2, 1]])" - ] - }, - "execution_count": 104, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "input_ids" 
- ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import BartTokenizer\n", - "\n", - "tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": 109, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.eos_token_id" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[ 2, 0, 6, 10, 4],\n", - " [ 2, 0, 8, 12, 2]])" - ] - }, - "execution_count": 105, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "shift_tokens_right(input_ids, 1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ch2. Encoder\n", - "#### Ch2.번외 Enc output -> BaseModelOutput setting\n", - "\n", - "- return_dict가 True이고\n", - "- encoder_outputs이 BaseModelOutput 객체가 아니면\n", - "- 아래 코드로 형변환시켜줌\n", - "\n", - "```python\n", - "encoder_outputs = BaseModelOutput(\n", - " last_hidden_state=encoder_outputs[0],\n", - " hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,\n", - " attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,\n", - ")\n", - "\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ch3. Decoder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ch4. 최종 output\n", - "- return_dict가 False일 경우엔\n", - " - decoder_outputs + encoder_outputs를 출력\n", - "- 그 외의 경우엔\n", - " - Seq2SeqModelOutput에 결과값을 입력 후 출력" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [], - "source": [ - "output = bart(**bart.dummy_inputs)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "odict_keys(['last_hidden_state', 'past_key_values', 'encoder_last_hidden_state'])" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 5, 1024])" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output.last_hidden_state.size()" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{torch.Size([2, 16, 5, 64])}" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "set([v.size() for pkv in output.past_key_values for v in pkv])" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(1024, 1024, 16, 16)" - ] - }, - "execution_count": 79, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "16 * 64, config.d_model, config.encoder_attention_heads, config.decoder_attention_heads" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 5, 1024])" - ] - }, - "execution_count": 74, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output.encoder_last_hidden_state.size()" - ] - }, 
- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Other Methods" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class BartModel(BartPretrainedModel):\n", - " \n", - " def __init__(self, config: BartConfig):\n", - " pass\n", - " \n", - " @overrides\n", - " def get_input_embeddings(self):\n", - " return self.shared\n", - " \n", - " @overrides\n", - " def set_input_embeddings(self, value):\n", - " self.shared = value\n", - " self.encoder.embed_tokens = self.shared\n", - " self.decoder.embed_tokens = self.shared\n", - " \n", - " def get_encoder(self):\n", - " return self.encoder\n", - " \n", - " def get_decoder(self):\n", - " return self.decoder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 번외" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### PreTrainedModel 분석" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers.configuration_utils import PretrainedConfig\n", - "\n", - "# file_utils.py\n", - "DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]\n", - "DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]\n", - "\n", - "pt_config = PretrainedConfig()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers.modeling_utils import ModuleUtilsMixin\n", - "from transformers.generation_utils import GenerationMixin # Beam Search 파보쟈" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(False, True)" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pt_config.is_encoder_decoder, pt_config.tie_word_embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(True, True)" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config.is_encoder_decoder, config.tie_word_embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "({}, {})" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config.pruned_heads, pt_config.pruned_heads" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [], - "source": [ - "class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):\n", - " config_class = None\n", - " base_model_prefix = \"\"\n", - " _keys_to_ignore_on_load_missing = None\n", - " _keys_to_ignore_on_load_unexpected = None\n", - " _keys_to_ignore_on_save = None\n", - " \n", - " @property\n", - " def dummy_inputs(self) -> Dict[str, torch.Tensor]:\n", - " return {'input_ids': torch.tensor(DUMMY_INPUTS)}\n", - " \n", - " def __init__(self, config: PretrainedConfig, *inputs, **kwargs):\n", - " super().__init__()\n", - " if not isinstance(config, PretrainedConfig):\n", - " raise ValueError(\n", - " \"Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. 
\"\n", - " \"To create a model from a pretrained model use \"\n", - " \"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`\".format(\n", - " self.__class__.__name__, self.__class__.__name__\n", - " )\n", - " )\n", - " # Save config and origin of the pretrained weights if given in model\n", - " self.config = config\n", - " self.name_or_path = config.name_or_path\n", - " \n", - " @property\n", - " def base_model(self) -> nn.Module:\n", - " return getattr(self, self.base_model_prefix, self)\n", - " \n", - " def get_input_embeddings(self) -> nn.Module:\n", - " base_model = getattr(self, self.base_model_prefix, self)\n", - " if base_model is not self:\n", - " return base_model.get_input_embeddings()\n", - " else:\n", - " raise NotImplementedError\n", - " \n", - " def set_input_embeddings(self, value: nn.Module):\n", - " base_model = getattr(self, self.base_model_prefix, self)\n", - " if base_model is not self:\n", - " base_model.set_input_embeddings(value)\n", - " else:\n", - " raise NotImplementedError\n", - " \n", - " def get_output_embeddings(self) -> nn.Module:\n", - " return None # Overwrite for models with output embeddings\n", - " \n", - " def tie_weights(self):\n", - " output_embeddings = self.get_output_embeddings()\n", - " if output_embeddings is not None and self.config.tie_word_embeddings:\n", - " self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())\n", - "\n", - " if self.config.is_encoder_decoder and self.config.tie_encoder_decoder:\n", - " if hasattr(self, self.base_model_prefix):\n", - " self = getattr(self, self.base_model_prefix)\n", - " self._tie_encoder_decoder_weights(\n", - " self.encoder, self.decoder, self.base_model_prefix)\n", - " \n", - " @staticmethod\n", - " def _tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, base_model_prefix: str):\n", - " uninitialized_encoder_weights: List[str] = []\n", - " \"\"\"\n", - " 1. encoder, decoder class가 같은지 체크!\n", - " >> In this case make sure that all encoder weights are correctly initialized.\n", - " 2. 
weights를 recursively하게 tie\n", - " >> tie_encoder_to_decoder_recursively 함수는 내부에서 구현되어있음\n", - " \"\"\"\n", - " tie_encoder_to_decoder_recursively(\n", - " decoder, encoder, base_model_prefix, uninitialized_encoder_weights)\n", - " \n", - " def _tie_or_clone_weights(self, output_embeddings, input_embeddings):\n", - " \"\"\"\n", - " Tie or clone module weights depending of whether we are using\n", - " TorchScript or not\n", - " \"\"\"\n", - " if self.config.torchscript:\n", - " output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone())\n", - " else:\n", - " output_embeddings.weight = input_embeddings.weight\n", - "\n", - " if getattr(output_embeddings, \"bias\", None) is not None:\n", - " output_embeddings.bias.data = torch.nn.functional.pad(\n", - " output_embeddings.bias.data,\n", - " (\n", - " 0,\n", - " output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],\n", - " ),\n", - " \"constant\",\n", - " 0,\n", - " )\n", - " if hasattr(output_embeddings, \"out_features\") and hasattr(input_embeddings, \"num_embeddings\"):\n", - " output_embeddings.out_features = input_embeddings.num_embeddings\n", - " \n", - " def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> torch.nn.Embedding:\n", - " pass\n", - " \n", - " def _resize_token_embeddings(self, new_num_tokens):\n", - " pass\n", - " \n", - " def _get_resized_embeddings(\n", - " self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None\n", - " ) -> torch.nn.Embedding:\n", - " pass\n", - " \n", - " def _get_resized_lm_head(\n", - " self, old_lm_head: torch.nn.Linear, new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False\n", - " ) -> torch.nn.Linear:\n", - " pass\n", - " \n", - " def init_weights(self):\n", - " \"\"\"\n", - " Initializes and prunes weights if needed.\n", - " \"\"\"\n", - " # Initialize weights\n", - " self.apply(self._init_weights)\n", - "\n", - " # Prune heads if needed\n", - " if self.config.pruned_heads:\n", - " self.prune_heads(self.config.pruned_heads)\n", - "\n", - " # Tie weights if needed\n", - " self.tie_weights()\n", - " \n", - " def prune_heads(self, heads_to_prune: Dict[int, List[int]]):\n", - " \"\"\"\n", - " Prunes heads of the base model.\n", - " Arguments:\n", - " heads_to_prune (:obj:`Dict[int, List[int]]`):\n", - " Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list of\n", - " heads to prune in said layer (list of :obj:`int`). 
For instance {1: [0, 2], 2: [2, 3]} will prune heads\n", - " 0 and 2 on layer 1 and heads 2 and 3 on layer 2.\n", - " \"\"\"\n", - " # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads\n", - " for layer, heads in heads_to_prune.items():\n", - " union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads)\n", - " self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON\n", - "\n", - " self.base_model._prune_heads(heads_to_prune)\n", - " \n", - " def save_pretrained(self, save_directory: Union[str, os.PathLike]):\n", - " pass\n", - " \n", - " @classmethod\n", - " def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):\n", - " pass" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers.utils import logging" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [], - "source": [ - "logger = logging.get_logger(__name__)\n", - "\n", - "logger.info(\n", - " f\"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized.\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.encoder.__class__ != bart.decoder.__class__" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### nn.Module의 apply 메서드" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import TypeVar, Callable\n", - "\n", - "T = TypeVar('T', bound='Module')\n", - "\n", - "def apply(self: T, fn: Callable[['Module'], None]) -> T:\n", - " for module in self.children():\n", - " module.apply(fn)\n", - " fn(self)\n", - " return self" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Encoder, Decoder의 embed_positions\n", - "- forward의 인자가 tensor가 아니라 torch.Size!" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "class BartSinusoidalPositionalEmbedding(nn.Embedding):\n", - " \"\"\"This module produces sinusoidal positional embeddings of any length.\"\"\"\n", - "\n", - " def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):\n", - " super().__init__(num_positions, embedding_dim)\n", - " self.weight = self._init_weight(self.weight)\n", - "\n", - " @staticmethod\n", - " def _init_weight(out: nn.Parameter):\n", - " \"\"\"\n", - " Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. \n", - " The cos features are in the 2nd half of the vector. 
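
To see the traversal order that `init_weights` relies on, here is a tiny check of `nn.Module.apply` (toy modules, nothing BART-specific): children are visited first, then `fn` is called on the module itself, so every submodule is guaranteed to be initialized.

```python
import torch.nn as nn

visited = []

def record(module: nn.Module) -> None:
    visited.append(module.__class__.__name__)

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU())
model.apply(record)
print(visited)  # ['Linear', 'ReLU', 'Sequential'] -- children first, then the container
```
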
[dim // 2:]\n", - " \"\"\"\n", - " n_pos, dim = out.shape\n", - " position_enc = np.array(\n", - " [\n", - " [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]\n", - " for pos in range(n_pos)\n", - " ]\n", - " )\n", - " out.requires_grad = False # set early to avoid an error in pytorch-1.8+\n", - " sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1\n", - " out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))\n", - " out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))\n", - " out.detach_()\n", - " return out\n", - "\n", - " @torch.no_grad()\n", - " def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):\n", - " \"\"\"`input_ids_shape` is expected to be [bsz x seqlen].\"\"\"\n", - " bsz, seq_len = input_ids_shape[:2]\n", - " positions = torch.arange(\n", - " past_key_values_length,\n", - " past_key_values_length + seq_len, \n", - " dtype=torch.long, \n", - " device=self.weight.device\n", - " )\n", - " return super().forward(positions)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "$$\\cfrac{pos}{10000^{\\cfrac{2}{d_{model}}}}$$" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [], - "source": [ - "class BartLearnedPositionalEmbedding(nn.Embedding):\n", - " \"\"\"\n", - " This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting\n", - " based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to\n", - " the forward function.\n", - " \"\"\"\n", - "\n", - " def __init__(\n", - " self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset: int\n", - " ):\n", - " # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2\n", - " # and adjust num_embeddings appropriately. 
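
For reference, the angle computed in `_init_weight` above, `pos / np.power(10000, 2 * (j // 2) / dim)`, is the standard sinusoidal positional encoding; the display formula in the notebook drops the feature index, so the full form is:

$$
PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{model}}}\right),
\qquad
PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{model}}}\right)
$$

The only difference in the BART variant above is that the sine features fill the first half of the vector and the cosine features the second half (the `sentinel` split), instead of being interleaved.
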
Other models dont have this hack\n", - " self.offset = offset\n", - " assert padding_idx is not None, \"`padding_idx` should not be None, but of type int\"\n", - " num_embeddings += offset\n", - " super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx)\n", - "\n", - " def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):\n", - " \"\"\"`input_ids_shape` is expected to be [bsz x seqlen].\"\"\"\n", - " bsz, seq_len = input_ids_shape[:2]\n", - " positions = torch.arange(\n", - " past_key_values_length, \n", - " past_key_values_length + seq_len, \n", - " dtype=torch.long, \n", - " device=self.weight.device\n", - " )\n", - " return super().forward(positions + self.offset)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### BartEncoderLayer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class BartEncoderLayer(nn.Module):\n", - " def __init__(self, config: BartConfig):\n", - " super().__init__()\n", - " self.embed_dim = config.d_model\n", - " self.self_attn = BartAttention(\n", - " embed_dim=self.embed_dim,\n", - " num_heads=config.encoder_attention_heads,\n", - " dropout=config.attention_dropout,\n", - " )\n", - " self.normalize_before = config.normalize_before\n", - " self.self_attn_layer_norm = BartLayerNorm(self.embed_dim)\n", - " self.dropout = config.dropout\n", - " self.activation_fn = ACT2FN[config.activation_function]\n", - " self.activation_dropout = config.activation_dropout\n", - " self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)\n", - " self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)\n", - " self.final_layer_norm = BartLayerNorm(self.embed_dim)\n", - "\n", - " def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False):\n", - " \"\"\"\n", - " Args:\n", - " hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`\n", - " attention_mask (:obj:`torch.FloatTensor`): attention mask of size\n", - " `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.\n", - " output_attentions (:obj:`bool`): Whether the base model outputs attentions. 
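
A small sketch of the `offset` hack (made-up sizes, not checkpoint values): the learned position table is allocated with `offset` extra rows, and every position id is shifted by `offset=2` before the lookup, so the first two rows stay reserved for the padding convention.

```python
import torch
import torch.nn as nn

max_positions, d_model, offset = 1024, 16, 2

# The table gets `offset` extra rows (num_embeddings += offset above).
pos_table = nn.Embedding(max_positions + offset, d_model)

seq_len, past_key_values_length = 5, 0
positions = torch.arange(past_key_values_length, past_key_values_length + seq_len, dtype=torch.long)

# Position ids are shifted before the lookup, so rows 0 and 1 are never used for real positions.
pos_embeds = pos_table(positions + offset)
print(positions + offset)   # tensor([2, 3, 4, 5, 6])
print(pos_embeds.shape)     # torch.Size([5, 16])
```
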
This requires the attentions tensor to be reshaped in this function.\n", - " \"\"\"\n", - " residual = hidden_states\n", - " if self.normalize_before:\n", - " hidden_states = self.self_attn_layer_norm(hidden_states)\n", - " hidden_states, attn_weights, _ = self.self_attn(\n", - " hidden_states=hidden_states, attention_mask=attention_mask, output_attentions=output_attentions\n", - " )\n", - " hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)\n", - " hidden_states = residual + hidden_states\n", - " if not self.normalize_before:\n", - " hidden_states = self.self_attn_layer_norm(hidden_states)\n", - "\n", - " residual = hidden_states\n", - " if self.normalize_before:\n", - " hidden_states = self.final_layer_norm(hidden_states)\n", - " hidden_states = self.activation_fn(self.fc1(hidden_states))\n", - " hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)\n", - " hidden_states = self.fc2(hidden_states)\n", - " hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)\n", - " hidden_states = residual + hidden_states\n", - " if not self.normalize_before:\n", - " hidden_states = self.final_layer_norm(hidden_states)\n", - " if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():\n", - " clamp_value = torch.finfo(hidden_states.dtype).max - 1000\n", - " hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)\n", - " return hidden_states, attn_weights" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### BartDecoderLayer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class BartDecoderLayer(nn.Module):\n", - " def __init__(self, config: BartConfig):\n", - " super().__init__()\n", - " self.embed_dim = config.d_model\n", - "\n", - " self.self_attn = BartAttention(\n", - " embed_dim=self.embed_dim,\n", - " num_heads=config.decoder_attention_heads,\n", - " dropout=config.attention_dropout,\n", - " is_decoder=True,\n", - " )\n", - " self.dropout = config.dropout\n", - " self.activation_fn = ACT2FN[config.activation_function]\n", - " self.activation_dropout = config.activation_dropout\n", - " self.normalize_before = config.normalize_before\n", - "\n", - " self.self_attn_layer_norm = BartLayerNorm(self.embed_dim)\n", - " self.encoder_attn = BartAttention(\n", - " self.embed_dim,\n", - " config.decoder_attention_heads,\n", - " dropout=config.attention_dropout,\n", - " is_decoder=True,\n", - " )\n", - " self.encoder_attn_layer_norm = BartLayerNorm(self.embed_dim)\n", - " self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)\n", - " self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)\n", - " self.final_layer_norm = BartLayerNorm(self.embed_dim)\n", - "\n", - " def forward(\n", - " self,\n", - " hidden_states: torch.Tensor,\n", - " attention_mask: Optional[torch.Tensor] = None,\n", - " encoder_hidden_states: Optional[torch.Tensor] = None,\n", - " encoder_attention_mask: Optional[torch.Tensor] = None,\n", - " past_key_value: Optional[Tuple[torch.Tensor]] = None,\n", - " output_attentions: Optional[torch.Tensor] = False,\n", - " ):\n", - " \"\"\"\n", - " Args:\n", - " hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`\n", - " attention_mask (:obj:`torch.FloatTensor`): attention mask of size\n", - " `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.\n", - " encoder_hidden_states 
(:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`\n", - " encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size\n", - " `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.\n", - " past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states\n", - " output_attentions (:obj:`bool`): Whether the base model outputs attentions. This requires the attentions tensor to be reshaped in this function.\n", - " \"\"\"\n", - " residual = hidden_states\n", - " if self.normalize_before:\n", - " hidden_states = self.self_attn_layer_norm(hidden_states)\n", - "\n", - " # Self Attention\n", - " # decoder uni-directional self-attention cached key/values tuple is at positions 1,2\n", - " self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None\n", - " # add present self-attn cache to positions 1,2 of present_key_value tuple\n", - " hidden_states, self_attn_weights, present_key_value = self.self_attn(\n", - " hidden_states=hidden_states,\n", - " past_key_value=self_attn_past_key_value,\n", - " attention_mask=attention_mask,\n", - " output_attentions=output_attentions,\n", - " )\n", - " hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)\n", - " hidden_states = residual + hidden_states\n", - " if not self.normalize_before:\n", - " hidden_states = self.self_attn_layer_norm(hidden_states)\n", - "\n", - " # Cross-Attention Block\n", - " cross_attn_present_key_value = None\n", - " cross_attn_weights = None\n", - " if encoder_hidden_states is not None:\n", - " residual = hidden_states\n", - " if self.normalize_before:\n", - " hidden_states = self.encoder_attn_layer_norm(hidden_states)\n", - "\n", - " # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple\n", - " cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None\n", - " hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(\n", - " hidden_states=hidden_states,\n", - " key_value_states=encoder_hidden_states,\n", - " attention_mask=encoder_attention_mask,\n", - " past_key_value=cross_attn_past_key_value,\n", - " output_attentions=output_attentions,\n", - " )\n", - " hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)\n", - " hidden_states = residual + hidden_states\n", - " if not self.normalize_before:\n", - " hidden_states = self.encoder_attn_layer_norm(hidden_states)\n", - "\n", - " # add cross-attn to positions 3,4 of present_key_value tuple\n", - " present_key_value = present_key_value + cross_attn_present_key_value\n", - "\n", - " # Fully Connected\n", - " residual = hidden_states\n", - " if self.normalize_before:\n", - " hidden_states = self.final_layer_norm(hidden_states)\n", - " hidden_states = self.activation_fn(self.fc1(hidden_states))\n", - " hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)\n", - " hidden_states = self.fc2(hidden_states)\n", - " hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)\n", - " hidden_states = residual + hidden_states\n", - " if not self.normalize_before:\n", - " hidden_states = self.final_layer_norm(hidden_states)\n", - "\n", - " return (\n", - " hidden_states,\n", - " self_attn_weights,\n", - " present_key_value,\n", - " cross_attn_weights,\n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "basic", 
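
A shape-only sketch (dummy tensors, not real model states) of the 4-tuple cache used above: `past_key_value[:2]` holds the self-attention key/value and grows by one position per decoding step, while `past_key_value[-2:]` holds the cross-attention key/value and stays fixed at the encoder length.

```python
import torch

bsz, num_heads, head_dim = 2, 4, 8
src_len = 10  # encoder (cross-attention) length

# Cache after 3 decoded tokens: (self_k, self_v, cross_k, cross_v)
past_key_value = (
    torch.zeros(bsz, num_heads, 3, head_dim),
    torch.zeros(bsz, num_heads, 3, head_dim),
    torch.zeros(bsz, num_heads, src_len, head_dim),
    torch.zeros(bsz, num_heads, src_len, head_dim),
)

# Decoding one more token: the self-attention key/value for a single new position
new_k = torch.zeros(bsz, num_heads, 1, head_dim)
new_v = torch.zeros(bsz, num_heads, 1, head_dim)
self_k = torch.cat([past_key_value[0], new_k], dim=2)  # now 4 positions
self_v = torch.cat([past_key_value[1], new_v], dim=2)

# The cross-attention cache is reused as-is; the encoder output never changes.
present_key_value = (self_k, self_v) + past_key_value[-2:]
print([t.shape[2] for t in present_key_value])  # [4, 4, 10, 10]
```
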
- "language": "python", - "name": "basic" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/BERT/BERT_ISSUES.md b/BERT/BERT_ISSUES.md deleted file mode 100644 index c0e6a55..0000000 --- a/BERT/BERT_ISSUES.md +++ /dev/null @@ -1,162 +0,0 @@ -# [google-research/bert](https://github.com/google-research/bert/issues)의 Issues 공부 - -## About BERT! -### Paper Detail -- [How is this counted? --> "3.3 billion word corpus"](https://github.com/google-research/bert/issues/1060) -- [How is the number of BERT model parameters calculated?](https://github.com/google-research/bert/issues/656) - -### Implement Detail -- [Question: What does "pooler layer" mean? Why it called pooler?](https://github.com/google-research/bert/issues/1102) -- [Why we get last 4 layers while residual connection transfer useful knowledge to the subsequent layers?](https://github.com/google-research/bert/issues/1064) -- [Infill](https://github.com/google-research/bert/pull/913) (very very 중요!! 코드 구현 어캐했는지 보고 나도 구현하기) - -### Training Detail -- [Explain the variables in the checkpoint](https://github.com/google-research/bert/issues/1019) -- [NotfoundError: Key bert/embeddings/LayerNorm/beta not found in checkpoint](https://github.com/google-research/bert/issues/997) -- [NotFoundError: [_Derived_]No gradient defined for op: Einsum on Tensorflow 1.15](https://github.com/google-research/bert/issues/1012) -- [How to see masked_lm_loss & next_sentence_loss per iteration step during train?](https://github.com/google-research/bert/issues/952) -- [how use the pretrain checkpoint to continue train on my own corpus?](https://github.com/google-research/bert/issues/888) -- [Can BERT really handle misspelled words?](https://github.com/google-research/bert/issues/812) -- [Experiment using RAdam optimizer](https://github.com/google-research/bert/issues/810) -- [Performance metrics of the classifier](https://github.com/google-research/bert/issues/800) -- [Tutorial: A Pipeline Of Pretraining Bert On Google TPU](https://github.com/google-research/bert/issues/681) -- [Determining training steps](https://github.com/google-research/bert/issues/662) -- [Learning Rate and Warmup Steps](https://github.com/google-research/bert/issues/649) -- [How to freeze layers of bert?](https://github.com/google-research/bert/issues/637) -- [How often is the validation/evaluation performed? 
(fine-tuning using run_classifier.py)](https://github.com/google-research/bert/issues/636) -- [How to get masked word prediction probabilities](https://github.com/google-research/bert/issues/608) - -## Tokenization -- [How to handle labels when using the BERT wordpiece tokenizer](https://github.com/google-research/bert/issues/646) -- [Tokenization behavior with messed-up unicode characters](https://github.com/google-research/bert/issues/1093) -- [x] [fix korean tokenization bug](https://github.com/google-research/bert/pull/1070) -- [x] [Update tokenization](https://github.com/google-research/bert/pull/1042) -- [WordPiece Tokenizer Clarification](https://github.com/google-research/bert/issues/763) - -#### Vocabulary (답변 없는 경우 많음) -- [use custom vocab.txt](https://github.com/google-research/bert/issues/1092) -- [Adding custom domain words and abbreviations to vocab.txt](https://github.com/google-research/bert/issues/1083) -- [update load_vocab() function based on ALBERT](https://github.com/google-research/bert/pull/961) -- [Language dependent vocabulary?](https://github.com/google-research/bert/issues/641) - -#### Embedding -- [What does bert embedding of a single term signify?](https://github.com/google-research/bert/issues/990) -- [Bert sent embeddings](https://github.com/google-research/bert/pull/691) - -## Bert Pre-Training -- [Pretraining BERT without next sentence prediction](https://github.com/google-research/bert/issues/178) -- [Using my pre-trained model](https://github.com/google-research/bert/issues/1040) -- [BERT pretraining num_train_steps questions](https://github.com/google-research/bert/issues/1025) -- [BERT pre-training using only domain specific text](https://github.com/google-research/bert/issues/615) - -## Bert Fine-Tuning -- [Does bert have this function ?](https://github.com/google-research/bert/issues/1024) (Bert for LM) -- [Is it possible feed BERT to seq2seq encoder for NMT (for low resource language)?](https://github.com/google-research/bert/issues/1007) (답없음) -- [extract_features sentence embedding BERT](https://github.com/google-research/bert/issues/1085) -- [How does Google calculate a document embeddings using BERT in its new search?](https://github.com/google-research/bert/issues/957) (Fine-tune이라기 보단 feature-based일거 같지만... Google Search에서 어떻게 활용할지?) 
-- [Exporting bert model to a saved model format](https://github.com/google-research/bert/issues/843) (tf serving) -- [How to use BERT for ranking with Pairwise loss function during Finetuining](https://github.com/google-research/bert/issues/761) -- [Serving fine-tuned Model - best solution](https://github.com/google-research/bert/issues/755) (bert in flask) -- [Sentiment analysis on emoji data.](https://github.com/google-research/bert/issues/748) -- [how to fine tune bert for ner on custom data](https://github.com/google-research/bert/issues/713) -- [Tuned Bert Model on MRPC gives wrong predictions.](https://github.com/google-research/bert/issues/663) -- [How to use run_squad.py to produce multiple answers for a question?](https://github.com/google-research/bert/issues/657) -- [Using bert for Document Classification](https://github.com/google-research/bert/issues/650) -- [Losing Knowledge for Language Model in Fine-Tuning](https://github.com/google-research/bert/issues/651) -- [how to use BERT for Siamese Model paraphrase identify](https://github.com/google-research/bert/issues/648) -- [Classification fine tuning for Q & A](https://github.com/google-research/bert/issues/639) - -## Distributed Training -- [Exploding gradients in training BERT from scratch](https://github.com/google-research/bert/issues/1016) -- [Can I run multi-gpu pretraining?](https://github.com/google-research/bert/issues/978) -- [x] [Gradient Accumulation](https://github.com/google-research/bert/pull/976) -- [multi-gpu horovod](https://github.com/google-research/bert/issues/743) -- [Recommended GPU size when training BERT-base](https://github.com/google-research/bert/issues/645) - -## Open Issues -- [How to create two BERT model with shared weights?](https://github.com/google-research/bert/issues/605) -- [How to train our own domain-specific data instead of using pre-training models?](https://github.com/google-research/bert/issues/606) -- [create_pretraining_data.py generates tfrecords that are too big](https://github.com/google-research/bert/issues/1161) -- [How can i use BERT to correct the alignment and spellings in a sentence?](https://github.com/google-research/bert/issues/1154) -- [Update the number of parameters](https://github.com/google-research/bert/pull/1150) -- [Incomplete feature vectors generated by Bert model.](https://github.com/google-research/bert/issues/1145) -- [Update tokenization.py](https://github.com/google-research/bert/pull/1117) -- [Dealing with ellipses in BERT tokenization](https://github.com/google-research/bert/issues/1116) -- [A spelling error is fixed](https://github.com/google-research/bert/pull/1168) -- [How to use my own additional vocabulary dictionary?](https://github.com/google-research/bert/issues/396) -- [Is there a plan to release code for fine-tuning on CoQA dataset?](https://github.com/google-research/bert/issues/597) -- [How to use my own vocabulary when do pre-training from scratch?](https://github.com/google-research/bert/issues/589) -- [BERT has a non deterministic behaviour](https://github.com/google-research/bert/issues/583) -- [how to use bert to text summary](https://github.com/google-research/bert/issues/576) -- [BERT multilingual for zero-shot classification](https://github.com/google-research/bert/issues/577) -- [BERT encode emojis as [UNK] token](https://github.com/google-research/bert/issues/587) -- [How to use BERT for sequence labelling](https://github.com/google-research/bert/issues/569) -- [Added support for multi gpu training and distributed training using 
Horovod](https://github.com/google-research/bert/pull/568) -- [How many articles (Wiki+Book corpus) do Bert use in pretraining?](https://github.com/google-research/bert/issues/570) -- [Problem with wordpiece tokenization](https://github.com/google-research/bert/issues/560) -- [problem multiclass text classification](https://github.com/google-research/bert/issues/559) -- [IndexError in run_classifier.py::MrpcProcessor::_create_examples (2)](https://github.com/google-research/bert/issues/551) -- [bad results after pretraining](https://github.com/google-research/bert/issues/529) -- [Is BERT a kind of cheating?](https://github.com/google-research/bert/issues/514) -- [Fixing normalized korean char](https://github.com/google-research/bert/pull/512) -- [Are BERT word-embeddings capable of synonyms?](https://github.com/google-research/bert/issues/507) -- [How to share BERT between tasks in multi-task setting?](https://github.com/google-research/bert/issues/504) -- [add regression fine-tuning](https://github.com/google-research/bert/pull/503) -- [Pre-trained monolingual in French](https://github.com/google-research/bert/issues/502) -- [what is the synthetic self-training](https://github.com/google-research/bert/issues/488) -- [Fine-Tune encodings on unsupervised data?](https://github.com/google-research/bert/issues/448) -- [Using BERT with custom QA dataset](https://github.com/google-research/bert/issues/411) -- [How can I change vocab size for pretrained model?](https://github.com/google-research/bert/issues/406) -- [How to use my own additional vocabulary dictionary?](https://github.com/google-research/bert/issues/396) -- [Can I use a "[CLS]...[SEP]...[SEP]...[SEP]" in tokens?](https://github.com/google-research/bert/issues/395) -- [Weights from next sentence prediction](https://github.com/google-research/bert/issues/370) -- [Optimize the code logic](https://github.com/google-research/bert/pull/366) -- [BERT vs Word2vec](https://github.com/google-research/bert/issues/362) -- [BERT for text summarization](https://github.com/google-research/bert/issues/352) -- [Wiki Data Formation Problem, Need Sentence Split](https://github.com/google-research/bert/issues/341) -- [how use BERT language model to predict next word](https://github.com/google-research/bert/issues/323) -- [how to get fine_tune model output probability](https://github.com/google-research/bert/issues/322) -- [how the model reflect 'bidirectional'?](https://github.com/google-research/bert/issues/319) -- [Extract features return different layer values (vectors) each time, is it working well?](https://github.com/google-research/bert/issues/312) -- [Is BERT powerful enough to learn sentence embedding and word embedding?](https://github.com/google-research/bert/issues/261) -- [Gpu optimizations](https://github.com/google-research/bert/pull/255) -- [Use BERT fine-tuned model for Tensorflow serving](https://github.com/google-research/bert/issues/146) -- [What is BERT?](https://github.com/google-research/bert/issues/566) -- [BERT with FP16 and XLA inference speed](https://github.com/google-research/bert/issues/391) - -## Closed Issues -- [zero-shot for IsNext and NotNext function](https://github.com/google-research/bert/issues/1118) -- [I don't know how to properly use fine tuned Bert Model](https://github.com/google-research/bert/issues/1097) -- [why dropout at predicting time](https://github.com/google-research/bert/issues/1096) -- [LayerNorm normalises the batch dimension as well](https://github.com/google-research/bert/issues/1088) -- [MRPC 
Produces Two Vastly Different Eval Accuracy](https://github.com/google-research/bert/issues/1037) -- [bert run_classifier](https://github.com/google-research/bert/issues/989) -- [how to realize the tokenization of BERT model in c++](https://github.com/google-research/bert/issues/878) -- [how to infer in python](https://github.com/google-research/bert/issues/614) -- [Bert Context Based QA](https://github.com/google-research/bert/issues/620) -- [Best performance on concatenated layers: which dimension?](https://github.com/google-research/bert/issues/511) -- [Issue with multiclass text classification](https://github.com/google-research/bert/issues/449) -- [What is exactly the learning rate warmup described in the paper?](https://github.com/google-research/bert/issues/425) -- [Fine-Tuning specifications for MNLI/XNLI](https://github.com/google-research/bert/issues/328) -- [fine-tuning with additional masked lm loss, and masked lm loss diverged](https://github.com/google-research/bert/issues/306) -- [Handling domain specific vocabulary](https://github.com/google-research/bert/issues/237) -- [Can you release the hyper-parameter of NER task?](https://github.com/google-research/bert/issues/223) -- [Question about mask strategy.](https://github.com/google-research/bert/issues/169) -- [Is CLS token also Masked in pre-training?](https://github.com/google-research/bert/issues/166) -- [BERT Vector Space shows issues with unknown words](https://github.com/google-research/bert/issues/164) -- [Simplifying BERT for Q&A - One paragraph and Query](https://github.com/google-research/bert/issues/159) -- [Reproducing paper results from feature vectors (STS-B dataset)](https://github.com/google-research/bert/issues/161) -- [Fine tuning BERT to extract embeddings (like ELMo)](https://github.com/google-research/bert/issues/145) -- [Classification quality is depends on max_sequence_length](https://github.com/google-research/bert/issues/113) -- [fine-tuned for a document task](https://github.com/google-research/bert/issues/107) -- [When to stop training? What is a good valid loss value to stop ? 
How to improve classification performance?](https://github.com/google-research/bert/issues/95) -- [plan to release SWAG code?](https://github.com/google-research/bert/issues/38) -- [Add flag to extract only features for the [CLS] token](https://github.com/google-research/bert/pull/87) -- [run_pretraining.py - clip gradient error: Found Inf or NaN global norm: Tensor had NaN value](https://github.com/google-research/bert/issues/82) -- [How to train models on GPU instead of CPU when TPU is not available?](https://github.com/google-research/bert/issues/75) -- [how to see loss per steps or epoch during train?](https://github.com/google-research/bert/issues/70) -- [Extracting features on for long sequences / SQuAD](https://github.com/google-research/bert/issues/66) -- [Trouble to understand position embedding.](https://github.com/google-research/bert/issues/58) -- [PyTorch implementation](https://github.com/google-research/bert/issues/54) -- [Plans to release sequence tagging task fine-tuning code?](https://github.com/google-research/bert/issues/33) -- [w to get the word embedding after pre-training?](https://github.com/google-research/bert/issues/60) diff --git a/BERT/README.md b/BERT/README.md deleted file mode 100644 index 064f194..0000000 --- a/BERT/README.md +++ /dev/null @@ -1 +0,0 @@ -# BERT Implementation with PyTorch diff --git a/ETRI_KorBERT.md b/ETRI_KorBERT.md deleted file mode 100644 index dffc3ce..0000000 --- a/ETRI_KorBERT.md +++ /dev/null @@ -1,224 +0,0 @@ -# ETRI KorBERT 한국어 embedding 사용하기 및 적용 예시 :) -2주 동안 source code 하나하나 뜯어가며 삽질한 노고를 기록하고 BERT에서 해당 코드가 어떠한 역할을 하는지 논문과 비교하며 설명! - -ETRI KorBERT로 이미 많은 분들이 활용하고 계시지만 친절하게 코드 하나하나 어떻게 해야한다는 연습 예제는 없더라! - -내가 공부한 내용을 공유하며 차근차근 예시 문제를 풀며 한국어 형태소 분석을 기반으로 하는 BERT를 활용하는 것이 이 repo의 목적이다! - -### Requirements -- Tensorflow 1.15.0 - - `huggingface`의 pytorch transformer 모델과 `google research`의 tensorflow 모델 둘 다 지원하지만 - - 저는 google research의 `tensorflow` 버전을 활용하고 버전은 2.0 이전 버전 중 가장 최신 버전인 1.15.0 사용 -- ETRI에서 제공하는 model ckpt(checkpoint)와 vocab list - - 이는 저작권 상 Git에 올릴 수 없으니 **아래 ETRI 홈페이지에서 직접 openapi를 활용하여 받도록 한다.** - - [ETRI 학습 모델 및 데이터 제공](http://aiopen.etri.re.kr/service_dataset.php) - - ETRI에서 제공하는 버전은 총 4개이다. - ``` - 1. Pytorch + Morphology - 2. Tensorflow + Morphology - 3. Pytorch + Eojeol - 4. Tensorflow + Eojeol - ``` - - 형태소와 어절은 input을 형태소 분석을 하고 넣어줄 것인지, 아니면 pure text 자체를 넣어줄 것인지 여부의 차이만 존재할 뿐, 큰 차이가 없다. - - 중요한 것은 **사용하는 형태소 분석기는 TTA 표준 형태소 태그셋(TTAK.KO-11.0010/R1)에 호환되는 형태소분석기 사용**이 필요하다. - - [한국정보통신기술협회(Telecommuication Technology Association, TTA) 형태소 태그셋](http://aiopen.etri.re.kr/data/001.형태소분석_가이드라인.pdf) - - 이를 만족하는 형태소 분석기는 카카오 팀의 `khaiii`와 `ETRI`에서 제공하는 형태소 분석기가 존재한다. - - `konlpy.tag.Komoran`도 가능한지는 살펴봐야겠다. 형태소 분석 성능 면에서는 `ETRI`에서 제공하는 형태소 분석기가 더 좋았다. - - `khaiii`는 Window 환경에서 사용 불가능하다. Docker로 하는 방법 밖에는 없다. 
- - [khaiii docker 파일](https://github.com/kakao/khaiii/tree/master/docker) - - [Docker를 활용한 khaiii 설치수난기](https://medium.com/@saerombang11/docker를-활용한-khaiii-설치수난기-53d014f9eb58) -- Python 3.7 -- 그 외 python library - - `six, numpy, scikit-learn, urllib3, urllib, pandas, konlpy, chatspace, pytorch` - -### Example DataSets -- [Dacon 금융문자 분석 경진대회](https://dacon.io/cpt14) - - KB 금융그룹에서 제공한 금융문자가 스미싱인지 vs 아닌지 이진분류를 수행하는 task - - 한국어 text 20만 건 이상 존재하고 있음 - -- [Naver Sentiment Movie Corpus](https://github.com/e9t/nsmc) - - 73만 건의 naver movie reviews 데이터를 크롤링한 데이터 - - rating 기반으로 긍/부정의 극성 분류를 시도 - -- [AI Hub DataSets](http://www.aihub.or.kr/) - - 적용 예정 - -### Appendix -- TTAK.KO-11.0010/R1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| 대분류 | 중분류 | 소분류 |
|---|---|---|
| (1) 체언 | 명사 | 일반명사(NNG) |
| | | 고유명사(NNP) |
| | | 의존명사(NNB) |
| | 대명사(NP) | 대명사(NP) |
| | 수사(NR) | 수사(NR) |
| (2) 용언 | 동사(VV) | 동사(VV) |
| | 형용사(VA) | 형용사(VA) |
| | 보조용언(VX) | 보조용언(VX) |
| | 지정사(VC) | 긍정지정사(VCP) |
| | | 부정지정사(VCN) |
| (3) 수식언 | 관형사(MM) | 성상 관형사(MMA) |
| | | 지시 관형사(MMD) |
| | | 수 관형사(MMN) |
| | 부사(MA) | 일반부사(MAG) |
| | | 접속부사(MAJ) |
| (4) 독립언 | 감탄사(IC) | 감탄사(IC) |
| (5) 관계언 | 격조사(JK) | 주격조사(JKS) |
| | | 보격조사(JKC) |
| | | 관형격조사(JKG) |
| | | 목적격조사(JKO) |
| | | 부사격조사(JKB) |
| | | 호격조사(JKV) |
| | | 인용격조사(JKQ) |
| | 보조사(JX) | 보조사(JX) |
| | 접속조사(JC) | 접속조사(JC) |
| (6) 의존형태 | 어미(EM) | 선어말어미(EP) |
| | | 종결어미(EF) |
| | | 연결어미(EC) |
| | | 명사형전성어미(ETN) |
| | | 관형형전성어미(ETM) |
| | 접두사(XP) | 체언접두사(XPN) |
| | 접미사(XS) | 명사파생접미사(XSN) |
| | | 동사파생접미사(XSV) |
| | | 형용사파생접미사(XSA) |
| | 어근(XR) | 어근(XR) |
| (7) 기호 | 일반기호(ST) | 마침표, 물음표, 느낌표(SF) |
| | | 쉼표, 가운뎃점, 콜론, 빗금(SP) |
| | | 따옴표, 괄호표, 줄표(SS) |
| | | 줄임표(SE) |
| | | 붙임표(물결)(SO) |
| | | 기타 기호(SW) |
| | 외국어(SL) | 외국어(SL) |
| | 한자(SH) | 한자(SH) |
| | 숫자(SN) | 숫자(SN) |
| | 분석불능범주(NA) | 분석불능범주(NA) |
- diff --git a/Hangul_Analyzer.py b/Hangul_Analyzer.py deleted file mode 100644 index a0c5de5..0000000 --- a/Hangul_Analyzer.py +++ /dev/null @@ -1,97 +0,0 @@ -import urllib3 -import logging -import json -logger = logging.getLogger(__name__) -from getpass import getpass - -from khaiii import KhaiiiApi -from konlpy.tag import Komoran -# from konlpy.tag import Mecab - -# If you use windows, try this. -# !pip install eunjeon -from eunjeon import Mecab - -# ETRI 형태소 분석기 -class ETRIMorphology: - - def __init__(self): - self.openapiKey = self.get_apikey() - self.url = "http://aiopen.etri.re.kr:8000/WiseNLU" - self.requestJson = {"access_key": self.openapiKey, - "argument": {"text": None, "analysis_code": "morp"}} - self.http = urllib3.PoolManager() - - @staticmethod - def get_apikey(): - openapikey = getpass('Type OpenAPI Key :') - return openapikey - - @staticmethod - def _try_connect(openApiURL, requestJson): - response = self.http.request( - "POST", openApiURL, - headers={"Content-Type": "application/json; charset=UTF-8"}, - body=json.dumps(requestJson)) - return response - - @staticmethod - def _get_json_result(response): - json_data = json.loads(response.data.decode('utf-8')) - return json_data - - @staticmethod - def _check_valid_connect(json_data): - if json_data['result'] == -1: - if 'Invalid Access Key' in json_data['reason']: - logger.info(json_reason) - logger.info('Please check the openapi access key.') - sys.exit() - return "openapi error - " + json_reason - else: - return True - - def do_lang(self, text): - self.requestJson['argument']['text'] = text - response = self._try_connect(self.url, self.requestJson) - json_data = self._get_json_result(response) - res = self._check_valid_connect(json_data) - if not res: - print(res) - return None - else: - json_return_obj = json_data['return_object'] - return_result = "" - json_sentence = json_return_obj['sentence'] - for json_morp in json_sentence: - for morp in json_morp['morp']: - return_result += str(morp['lemma']) + '/' + str(morp['type']) + " " - return return_result[:-1] - -def Analyze(self, text, SEP=' + '): - """ - KhaiiiApi의 분석 결과를 보기좋게 돌려주는 method - - USAGE; - ```python - from khaiii import KhaiiiApi - khai3 = KhaiiiApi() - khai3.analyze('아버지가방에들어가신다 왜 자꾸 거리감들게할까 내 성격 리얼...') - >>> [, - >>> , - >>> , - >>> , - >>> , - >>> , - >>> ] - - setattr(khai3.__class__, 'Analyze', Analyze) - khai3.Analze('아버지가방에들어가신다 왜 자꾸 거리감들게할까 내 성격 리얼...') - >>> '아버지/NNG + 가/JKS + 방/NNG + 에/JKB + 들어가/VV + 시/EP + ㄴ다/EC + - 왜/MAG + 자꾸/MAG + 거리감/NNG + 들/VV + 게/EC + 하/VV + ㄹ까/EC + - 나/NP + 의/JKG + 성격/NNG + 리/NNG + 얼/IC + ../SE + ./SF' - ``` - """ - res = self.analyze(text) - f = lambda x: x.__str__().split('\t')[1] - return SEP.join(list(map(f, res))) diff --git a/README.md b/README.md index 248fc79..272076f 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,8 @@ -# Korean Transformers -- Transformers에 관련된 개념들 구현 레포 +# Advanced Transformers -## 🤗 transformers에서 사용되는 특별한 기술들 -- gradient checkpoint -- reversible residual connection -- dynamic padding -- chunk feed forward network -- 3d, 4d multi-head scaled dot product attention -- past key value -- various positional embedding -- various heads -- porting script -- generation mixin -- parrallelism mixin -- pushtohub mixin -- how to make tokenization script? 
-- trainer -- various utils +- WIP - -## Reference - -### BPE -- [A New Algorithm for Data Compression](https://www.derczynski.com/papers/archive/BPE_Gage.pdf) - -### Wordpiece -- [Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation](https://arxiv.org/abs/1609.08144) - -### Transformers -- [Attention Is All You Need](https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf) +## 프로젝트 목표 +- transformers v4.20.0 기준 131개 모델에 대한 공부 실시 +- 중복된 component들을 일반화하고 다양한 component들에 대한 공부 실시 +- 튜토리얼 자료를 만들어서 오픈 커뮤니티에 공개 (131개 모델들) \ No newline at end of file diff --git a/Untitled.ipynb b/Untitled.ipynb deleted file mode 100644 index cd9e2b5..0000000 --- a/Untitled.ipynb +++ /dev/null @@ -1,32 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "basic", - "language": "python", - "name": "basic" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/preprocessing_bert.ipynb b/preprocessing_bert.ipynb deleted file mode 100644 index 727417c..0000000 --- a/preprocessing_bert.ipynb +++ /dev/null @@ -1,3788 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow as tf" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from utils_20191230 import *" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "tf.logging.set_verbosity(tf.logging.INFO)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### ETRI BERT Config\n", - "```python\n", - "bert_config = {\n", - " \"attention_probs_dropout_prob\": 0.1, \n", - " \"directionality\": \"bidi\", \n", - " \"hidden_act\": \"gelu\", \n", - " \"hidden_dropout_prob\": 0.1, \n", - " \"hidden_size\": 768, \n", - " \"initializer_range\": 0.02, \n", - " \"intermediate_size\": 3072, \n", - " \"max_position_embeddings\": 512, \n", - " \"num_attention_heads\": 12, \n", - " \"num_hidden_layers\": 12, \n", - " \"pooler_fc_size\": 768, \n", - " \"pooler_num_attention_heads\": 12, \n", - " \"pooler_num_fc_layers\": 3, \n", - " \"pooler_size_per_head\": 128, \n", - " \"pooler_type\": \"first_token_transform\", \n", - " \"type_vocab_size\": 2, \n", - " \"vocab_size\": 30349\n", - "}\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# load ETRI Bert Config\n", - "path = '../KorBERT/2_bert_download_002_bert_morp_tensorflow/002_bert_morp_tensorflow/'\n", - "FLAGS.bert_config_file = path + 'bert_config.json'" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "bert_config = BertConfig.from_json_file(FLAGS.bert_config_file)\n", - "# bert_config = BertConfig.from_dict(bert_config) # 위의 dictionary를 메모리에 올려서 \n", - " # 다음 메서드로 호출하는 것도 가능하다." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (FLAGS) MAX_SEQ_LENGTH : 128\n", - "(BERTConfig) MAX_POSITION_EMBEDDINGS : 512\n" - ] - } - ], - "source": [ - "if FLAGS.max_seq_length > bert_config.max_position_embeddings:\n", - " raise ValueError(\n", - " \"Cannot use sequence length %d because the BERT model \"\n", - " \"was only trained up to sequence length %d\" %\n", - " (FLAGS.max_seq_length, bert_config.max_position_embeddings))\n", - "else:\n", - " print(' (FLAGS) MAX_SEQ_LENGTH :', FLAGS.max_seq_length)\n", - " print('(BERTConfig) MAX_POSITION_EMBEDDINGS :', bert_config.max_position_embeddings)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Do not use TPU\n" - ] - } - ], - "source": [ - "# do not use tpu\n", - "tpu_cluster_resolver = None\n", - "if FLAGS.use_tpu and FLAGS.tpu_name:\n", - " tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(\n", - " FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)\n", - "else:\n", - " print('Do not use TPU')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:\n", - "The TensorFlow contrib module will not be included in TensorFlow 2.0.\n", - "For more information, please see:\n", - " * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n", - " * https://github.com/tensorflow/addons\n", - " * https://github.com/tensorflow/io (for I/O related ops)\n", - "If you depend on functionality not listed there, please file an issue.\n", - "\n" - ] - } - ], - "source": [ - "is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2\n", - "run_config = tf.contrib.tpu.RunConfig(\n", - " cluster=tpu_cluster_resolver,\n", - " master=FLAGS.master,\n", - " model_dir=FLAGS.output_dir,\n", - " save_checkpoints_steps=FLAGS.save_checkpoints_steps, # 1000\n", - " tpu_config=tf.contrib.tpu.TPUConfig(\n", - " iterations_per_loop=FLAGS.iterations_per_loop, # 1000\n", - " num_shards=FLAGS.num_tpu_cores, # 8\n", - " per_host_input_for_training=is_per_host)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# tensorflow gpu 사용 가능한지 체크\n", - "tf.test.is_gpu_available()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# 한국어 vocab 사전을 등록\n", - "FLAGS.vocab_file = path + 'vocab.korean_morp.list' " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "간략한 파일 준비" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "dacon_path = '../dacon문자스미싱/filedown (2)/'\n", - "df_train = pd.read_csv(dacon_path + 'train.csv')\n", - "df_test = pd.read_csv(dacon_path + 'public_test.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "((236756, 2), (59189, 2), (236756,), (59189,))" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.model_selection 
import train_test_split\n", - "\n", - "df_train = df_train.set_index('id')\n", - "df_test = df_test.set_index('id')\n", - "\n", - "X_train, X_valid, y_train, y_valid = train_test_split(\n", - " df_train[[col for col in df_train.columns if col != 'smishing']], \n", - " df_train['smishing'],\n", - " random_state=42, test_size=.2,\n", - " stratify=df_train['smishing'])\n", - "X_train.shape, X_valid.shape, y_train.shape, y_valid.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "df_train = pd.concat((X_train, y_train), axis=1)\n", - "df_valid = pd.concat((X_valid, y_valid), axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "# sample 100개씩 뽑아서 미리 test\n", - "df_train.sample(100).to_csv(dacon_path + 'train_100.tsv', sep='\\t')\n", - "df_valid.sample(100).to_csv(dacon_path + 'dev_100.tsv', sep='\\t')\n", - "df_test.sample(100).to_csv(dacon_path + 'test_100.tsv', sep='\\t')" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# DataProcessor 제작\n", - "class SmishingProcessor(DataProcessor):\n", - "\n", - " def get_train_examples(self, data_dir, filename='train.tsv'):\n", - " return self._create_examples(\n", - " self._read_tsv(os.path.join(data_dir, filename)), \"train\")\n", - "\n", - " def get_dev_examples(self, data_dir, filename='dev.tsv'):\n", - " return self._create_examples(\n", - " self._read_tsv(os.path.join(data_dir, filename)), \"dev\")\n", - "\n", - " def get_test_examples(self, data_dir, filename='test.tsv'):\n", - " return self._create_examples(\n", - " self._read_tsv(os.path.join(data_dir, filename)), \"test\")\n", - "\n", - " def get_labels(self):\n", - " return [\"0\", \"1\"]\n", - "\n", - " def _create_examples(self, lines, set_type):\n", - " examples = []\n", - " for (i, line) in enumerate(lines):\n", - " if i == 0:\n", - " continue\n", - " guid = \"%s-%s\" % (set_type, i)\n", - " text_a = convert_to_unicode(line[2])\n", - " if set_type == \"test\":\n", - " label = \"0\"\n", - " else:\n", - " label = convert_to_unicode(line[-1])\n", - " examples.append(\n", - " InputExample(guid=guid, text_a=text_a, label=label))\n", - " return examples" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "processor = SmishingProcessor()\n", - "label_list = processor.get_labels()\n", - "\n", - "# get train samples\n", - "train_examples = processor.get_train_examples(dacon_path, 'train_100.tsv')\n", - "num_train_steps = int(\n", - " len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)\n", - "num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "# record ETRI model weights\n", - "FLAGS.init_checkpoint = path + 'model.ckpt'" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "model_fn = model_fn_builder(\n", - " bert_config=bert_config,\n", - " num_labels=len(label_list), # 2\n", - " init_checkpoint=FLAGS.init_checkpoint, # None\n", - " learning_rate=FLAGS.learning_rate, # 5e-05\n", - " num_train_steps=num_train_steps, # 22195\n", - " num_warmup_steps=num_warmup_steps, # 2219\n", - " use_tpu=FLAGS.use_tpu, # False\n", - " use_one_hot_embeddings=FLAGS.use_tpu) # False" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": 
{}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:Estimator's model_fn (.model_fn at 0x000002196B19C0D0>) includes params argument, but params are not passed to Estimator.\n", - "WARNING:tensorflow:Using temporary folder as model directory: C:\\Users\\jinma\\AppData\\Local\\Temp\\tmp95ip6j47\n", - "INFO:tensorflow:Using config: {'_model_dir': 'C:\\\\Users\\\\jinma\\\\AppData\\\\Local\\\\Temp\\\\tmp95ip6j47', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true\n", - "graph_options {\n", - " rewrite_options {\n", - " meta_optimizer_iterations: ONE\n", - " }\n", - "}\n", - ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': , '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None, eval_training_input_configuration=2, experimental_host_call_every_n_steps=1), '_cluster': None}\n", - "INFO:tensorflow:_TPUContext: eval_on_tpu True\n", - "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n" - ] - } - ], - "source": [ - "# If TPU is not available, this will fall back to normal Estimator on CPU\n", - "# or GPU\n", - "estimator = tf.contrib.tpu.TPUEstimator(\n", - " use_tpu=FLAGS.use_tpu, # False\n", - " model_fn=model_fn,\n", - " config=run_config,\n", - " train_batch_size=FLAGS.train_batch_size, # 32\n", - " eval_batch_size=FLAGS.eval_batch_size, # 8\n", - " predict_batch_size=FLAGS.predict_batch_size # 8\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "FLAGS.output_dir = './output_dir/smishing/'" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "tf.gfile.MakeDirs(FLAGS.output_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Embedding\n", - "- TTA 표준 형태소 태그셋(TTAK.KO-11.0010/R1)에 맞는 형태소 분석기를 사용해야 함." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
| 대분류 | 중분류 | 소분류 |
|---|---|---|
| (1) 체언 | 명사 | 일반명사(NNG) |
| | | 고유명사(NNP) |
| | | 의존명사(NNB) |
| | 대명사(NP) | 대명사(NP) |
| | 수사(NR) | 수사(NR) |
| (2) 용언 | 동사(VV) | 동사(VV) |
| | 형용사(VA) | 형용사(VA) |
| | 보조용언(VX) | 보조용언(VX) |
| | 지정사(VC) | 긍정지정사(VCP) |
| | | 부정지정사(VCN) |
| (3) 수식언 | 관형사(MM) | 성상 관형사(MMA) |
| | | 지시 관형사(MMD) |
| | | 수 관형사(MMN) |
| | 부사(MA) | 일반부사(MAG) |
| | | 접속부사(MAJ) |
| (4) 독립언 | 감탄사(IC) | 감탄사(IC) |
| (5) 관계언 | 격조사(JK) | 주격조사(JKS) |
| | | 보격조사(JKC) |
| | | 관형격조사(JKG) |
| | | 목적격조사(JKO) |
| | | 부사격조사(JKB) |
| | | 호격조사(JKV) |
| | | 인용격조사(JKQ) |
| | 보조사(JX) | 보조사(JX) |
| | 접속조사(JC) | 접속조사(JC) |
| (6) 의존형태 | 어미(EM) | 선어말어미(EP) |
| | | 종결어미(EF) |
| | | 연결어미(EC) |
| | | 명사형전성어미(ETN) |
| | | 관형형전성어미(ETM) |
| | 접두사(XP) | 체언접두사(XPN) |
| | 접미사(XS) | 명사파생접미사(XSN) |
| | | 동사파생접미사(XSV) |
| | | 형용사파생접미사(XSA) |
| | 어근(XR) | 어근(XR) |
| (7) 기호 | 일반기호(ST) | 마침표, 물음표, 느낌표(SF) |
| | | 쉼표, 가운뎃점, 콜론, 빗금(SP) |
| | | 따옴표, 괄호표, 줄표(SS) |
| | | 줄임표(SE) |
| | | 붙임표(물결)(SO) |
| | | 기타 기호(SW) |
| | 외국어(SL) | 외국어(SL) |
| | 한자(SH) | 한자(SH) |
| | 숫자(SN) | 숫자(SN) |
| | 분석불능범주(NA) | 분석불능범주(NA) |
" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "# TTA 가이드라인\n", - "# http://aiopen.etri.re.kr/data/001.형태소분석_가이드라인.pdf\n", - "tta_guide = {\n", - " '체언': {\n", - " '명사': ('NN',\n", - " {'일반명사': 'NNG',\n", - " '고유명사': 'NNP',\n", - " '의존명사': 'NNB'}),\n", - " '대명사': ('NP',{'대명사': 'NP'}),\n", - " '수사': ('NR', {'수사': 'NR'})\n", - " },\n", - " '용언': {\n", - " '동사': ('VV', {'동사': 'VV'}),\n", - " '형용사': ('VA', {'형용사': 'VA'}),\n", - " '보조용언': ('VX', {'보조용언': 'VX'}),\n", - " '지정사': ('VC', \n", - " {'긍정지정사': 'VCP',\n", - " '부정지정사': 'VCN'})\n", - " },\n", - " '수식언': {\n", - " '관형사': ('MM', \n", - " {'성상 관형사': 'MMA',\n", - " '지시 관형사': 'MMD',\n", - " '수 관형사': 'MMN'}),\n", - " '부사': ('MA', \n", - " {'일반부사': 'MAG',\n", - " '접속부사': 'MAJ'})\n", - " },\n", - " '독립언': {\n", - " '감탄사': ('IC', {'감탄사': 'IC'})\n", - " },\n", - " '관계언': {\n", - " '격조사': ('JK', \n", - " {'주격조사': 'JKS',\n", - " '보격조사': 'JKC',\n", - " '관형격조사': 'JKG',\n", - " '목적격조사': 'JKO',\n", - " '부사격조사': 'JKB',\n", - " '호격조사': 'JKV',\n", - " '인용격조사': 'JKQ'}),\n", - " '보조사': ('JX', {'보조사': 'JK'}),\n", - " '접속조사': ('JC', {'접속조사': 'JC'})\n", - " },\n", - " '의존형태': {\n", - " '어미': ('EM', \n", - " {'선어말어미': 'EP',\n", - " '종결어미': 'EF',\n", - " '연결어미': 'EC',\n", - " '명사형전성어미': 'ETN',\n", - " '관형형전성어미': 'ETM'}),\n", - " '접두사': ('XP', {'체언접두사': 'XPN'}),\n", - " '접미사': ('XS', \n", - " {'명사파생접미사': 'XSN',\n", - " '동사파생접미사': 'XSV',\n", - " '형용사파생접미사': 'XSA'}),\n", - " '어근': ('XR', {'어근': 'XR'})\n", - " },\n", - " '기호': {\n", - " '일반기호': ('ST', \n", - " {'마침표, 물음표, 느낌표': 'SF',\n", - " '쉼표, 가운뎃점, 콜론, 빗금': 'SP',\n", - " '따옴표, 괄호표, 줄표': 'SS',\n", - " '줄임표': 'SE',\n", - " '붙임표(물결)': 'SO',\n", - " '기타 기호': 'SW'}),\n", - " '외국어': ('SL', {'외국어': 'SL'}),\n", - " '한자': ('SH', {'한자': 'SH'}),\n", - " '숫자': ('SN', {'숫자': 'SN'}),\n", - " '분석불능범주': ('NA', {'분석불능범주': 'NA'})\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'체언': {'명사': ('NN', {'일반명사': 'NNG', '고유명사': 'NNP', '의존명사': 'NNB'}),\n", - " '대명사': ('NP', {'대명사': 'NP'}),\n", - " '수사': ('NR', {'수사': 'NR'})},\n", - " '용언': {'동사': ('VV', {'동사': 'VV'}),\n", - " '형용사': ('VA', {'형용사': 'VA'}),\n", - " '보조용언': ('VX', {'보조용언': 'VX'}),\n", - " '지정사': ('VC', {'긍정지정사': 'VCP', '부정지정사': 'VCN'})},\n", - " '수식언': {'관형사': ('MM', {'성상 관형사': 'MMA', '지시 관형사': 'MMD', '수 관형사': 'MMN'}),\n", - " '부사': ('MA', {'일반부사': 'MAG', '접속부사': 'MAJ'})},\n", - " '독립언': {'감탄사': ('IC', {'감탄사': 'IC'})},\n", - " '관계언': {'격조사': ('JK',\n", - " {'주격조사': 'JKS',\n", - " '보격조사': 'JKC',\n", - " '관형격조사': 'JKG',\n", - " '목적격조사': 'JKO',\n", - " '부사격조사': 'JKB',\n", - " '호격조사': 'JKV',\n", - " '인용격조사': 'JKQ'}),\n", - " '보조사': ('JX', {'보조사': 'JK'}),\n", - " '접속조사': ('JC', {'접속조사': 'JC'})},\n", - " '의존형태': {'어미': ('EM',\n", - " {'선어말어미': 'EP',\n", - " '종결어미': 'EF',\n", - " '연결어미': 'EC',\n", - " '명사형전성어미': 'ETN',\n", - " '관형형전성어미': 'ETM'}),\n", - " '접두사': ('XP', {'체언접두사': 'XPN'}),\n", - " '접미사': ('XS', {'명사파생접미사': 'XSN', '동사파생접미사': 'XSV', '형용사파생접미사': 'XSA'}),\n", - " '어근': ('XR', {'어근': 'XR'})},\n", - " '기호': {'일반기호': ('ST',\n", - " {'마침표, 물음표, 느낌표': 'SF',\n", - " '쉼표, 가운뎃점, 콜론, 빗금': 'SP',\n", - " '따옴표, 괄호표, 줄표': 'SS',\n", - " '줄임표': 'SE',\n", - " '붙임표(물결)': 'SO',\n", - " '기타 기호': 'SW'}),\n", - " '외국어': ('SL', {'외국어': 'SL'}),\n", - " '한자': ('SH', {'한자': 'SH'}),\n", - " '숫자': ('SN', {'숫자': 'SN'}),\n", - " '분석불능범주': ('NA', {'분석불능범주': 'NA'})}}" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": 
"execute_result" - } - ], - "source": [ - "tta_guide" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [], - "source": [ - "pos_set = []\n", - "for VALUE in tta_guide.values():\n", - " for VALUE2 in VALUE.values():\n", - " pos_set.append(VALUE2[0])\n", - " for VALUE3 in VALUE2[1].values():\n", - " pos_set.append(VALUE3)\n", - "pos_set = list(set(pos_set))" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [], - "source": [ - "from konlpy.tag import Komoran\n", - "\n", - "komoran = Komoran()" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "[i[1] for i in komoran.pos(train_examples[0].text_a) if i[1] not in pos_set]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Komoran 형태소 분석기로 분석 실시" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [], - "source": [ - "for exam in train_examples:\n", - " exam.text_a = ' '.join(\n", - " [i[0] + '/' + i[1] \n", - " for i in komoran.pos(exam.text_a)])" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'XXX/SL 지점/NNP 을/JKO 거래/NNG 하/XSV 아/EC 주/VX 시/EP 어서/EC 대단히/MAG 감사/NNG 하/XSV ㅂ니다/EF ./SF 내/NNP 점/NNB 하/NNP XXX/NNP 고객/NNG 님/XSN 고객/NNG 만족도/NNG 설문/NNP 조사/NNP 전화/NNG 받/VV 으시/EP 면/EC 매우/MAG 동의/NNG 하/XSV ㄴ다라고/EC 우수/NNP 직원/NNP 추천/NNG 해주시/NNP 이/VCP 고/EC 사은품/NNG 받/VV 아/EC 가/VX 시/EP 어요/EC ../SE 더욱더/MAG 친절히/MAG 모시/VV 겠/EP 습니다/EC XXX/SL 은행/NNP XXX/NNP 올림/NNP'" - ] - }, - "execution_count": 82, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_examples[0].text_a" - ] - }, - { - "cell_type": "code", - "execution_count": 243, - "metadata": {}, - "outputs": [], - "source": [ - "train_file = os.path.join(FLAGS.output_dir, 'train.tf_record')" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [], - "source": [ - "# fn: file_based_convert_examples_to_features\n", - "\n", - "## Arguments\n", - "examples = train_examples\n", - "label_list = label_list\n", - "max_seq_length = FLAGS.max_seq_length\n", - "# tokenizer = FullTokenizer() # 예시로 tokenizing을 어떻게 하는지 전부 기록\n", - "output_file = train_file" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [], - "source": [ - "writer = tf.python_io.TFRecordWriter(output_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [], - "source": [ - "ex_index = 5\n", - "example = examples[ex_index]" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 109, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# fn: convert_single_example\n", - "isinstance(example, PaddingInputExample)" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'0': 0, '1': 1}" - ] - }, - "execution_count": 110, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "label_map = {}\n", - "for (i, label) in enumerate(label_list):\n", - " label_map[label] = i\n", - "label_map" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - 
"metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'XXX 고객님항상 XXX은행 모란역 지점을 이용해 주시는 고객님께 감사의 마음을 전합니다. 혹시 업무와 관련해 궁금한 점이 있으시면 이 번호로 연락주시기바랍니다. 성심껏 도와드리겠습니다. 또 혹시 고객만족도 조사 전화를 받으시면 매우 동의한다 로 칭찬해 주세요 조금은 쌀쌀한 10월의 첫주입니다.환절기 감기조심하시고 따듯한 차와 함께 건강한 한주 보내시기 바랍니다.XXX은행모란역XXX올림'" - ] - }, - "execution_count": 111, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# class: FullTokenizer\n", - "path = '../KorBERT/2_bert_download_002_bert_morp_tensorflow/002_bert_morp_tensorflow/'\n", - "FLAGS.vocab_file = path + 'vocab.korean_morp.list' \n", - "vocab_file = FLAGS.vocab_file\n", - "do_lower_case = FLAGS.do_lower_case\n", - "text = example.text_a\n", - "text" - ] - }, - { - "cell_type": "code", - "execution_count": 112, - "metadata": {}, - "outputs": [], - "source": [ - "vocab = load_vocab(vocab_file)\n", - "inv_vocab = {v:k for k, v in vocab.items()}" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "metadata": {}, - "outputs": [], - "source": [ - "openApiURL = \"http://aiopen.etri.re.kr:8000/WiseNLU\"\n", - "openapi_key = ''\n", - "requestJson = { \"access_key\": openapi_key, \"argument\": { \"text\": text, \"analysis_code\": \"morp\" } }" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'access_key': '0da70af6-e163-44b5-8d5d-f217bebb5765',\n", - " 'argument': {'text': 'XXX 고객님항상 XXX은행 모란역 지점을 이용해 주시는 고객님께 감사의 마음을 전합니다. 혹시 업무와 관련해 궁금한 점이 있으시면 이 번호로 연락주시기바랍니다. 성심껏 도와드리겠습니다. 또 혹시 고객만족도 조사 전화를 받으시면 매우 동의한다 로 칭찬해 주세요 조금은 쌀쌀한 10월의 첫주입니다.환절기 감기조심하시고 따듯한 차와 함께 건강한 한주 보내시기 바랍니다.XXX은행모란역XXX올림',\n", - " 'analysis_code': 'morp'}}" - ] - }, - "execution_count": 114, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "requestJson" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "metadata": {}, - "outputs": [], - "source": [ - "import urllib3" - ] - }, - { - "cell_type": "code", - "execution_count": 116, - "metadata": {}, - "outputs": [], - "source": [ - "http = urllib3.PoolManager()" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "metadata": {}, - "outputs": [], - "source": [ - "response = http.request( \"POST\", openApiURL, headers={\"Content-Type\": \"application/json; charset=UTF-8\"}, body=json.dumps(requestJson))" - ] - }, - { - "cell_type": "code", - "execution_count": 118, - "metadata": {}, - "outputs": [], - "source": [ - "json_data = json.loads(response.data.decode('utf-8'))\n", - "json_result = json_data[\"result\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 119, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 119, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "json_result" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": {}, - "outputs": [], - "source": [ - "json_data = json.loads(response.data.decode('utf-8'))\n", - "json_return_obj = json_data[\"return_object\"]\n", - "return_result = \"\"\n", - "json_sentence = json_return_obj[\"sentence\"]\n", - "for json_morp in json_sentence: \n", - " for morp in json_morp[\"morp\"]:\n", - " return_result = return_result+str(morp[\"lemma\"])+\"/\"+str(morp[\"type\"])+\" \"" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'XXX/SL 고객/NNG 님/XSN 항상/MAG XXX/SL 은행/NNG 모란/NNG 역/NNG 지점/NNG 을/JKO 이용/NNG 하/XSV 어/EC 주/VX 시/EP 는/ETM 고객/NNG 님/XSN 
께/JKB 감사/NNG 의/JKG 마음/NNG 을/JKO 전하/VV ㅂ니다/EF ./SF 혹시/MAG 업무/NNG 와/JKB 관련/NNG 하/XSV 어/EC 궁금하/VA ㄴ/ETM 점/NNG 이/JKS 있/VA 으시/EP 면/EC 이/MM 번호/NNG 로/JKB 연락/NNG 주/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF 성심껏/MAG 돕/VV 아/EC 드리/VX 겠/EP 습니다/EF ./SF 또/MAG 혹시/MAG 고객/NNG 만족/NNG 도/NNG 조사/NNG 전화/NNG 를/JKO 받/VV 으시/EP 면/EC 매우/MAG 동의/NNG 하/XSV ㄴ다/EF 로/JKB 칭찬/NNG 하/XSV 어/EC 주/VX 시/EP 어요/EF 조금/NNG 은/JX 쌀쌀하/VA ㄴ/ETM 10/SN 월/NNB 의/JKG 첫주/NNG 이/VCP ㅂ니다/EF ./SF 환절/NNG 기/XSN 감기/NNG 조심/NNG 하/XSV 시/EP 고/EC 따듯하/VA ㄴ/ETM 차/NNG 와/JC 함께/MAG 건강/NNG 하/XSA ㄴ/ETM 한/MM 주/NNB 보내/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF XXX/SL 은행/NNG 모란/NNG 역/NNG XXX/SL 올림/NNG '" - ] - }, - "execution_count": 121, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "return_result" - ] - }, - { - "cell_type": "code", - "execution_count": 124, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'XXX/SL 고객/NNG 님/NNG 항상/MAG XXX/SL 은행/NNP 모란역/NNP 지점/NNG 을/JKO 이용/NNG 하/XSV 아/EC 주시/NNP 는/JX 고객/NNG 님/XSN 께/JKB 감사/NNG 의/JKG 마음/NNG 을/JKO 전하/VV ㅂ니다/EF ./SF 혹시/MAG 업무/NNG 와/JC 관련/NNG 하/XSV 아/EC 궁금/XR 하/XSA ㄴ/ETM 점/NNB 이/JKS 있/VX 으시/EP 면/EC 이/MM 번호/NNG 로/JKB 연락/NNG 주/NNG 시기/NNG 바라/VV ㅂ니다/EF ./SF 성심껏/MAG 돕/VV 아/EC 드리/VX 겠/EP 습니다/EF ./SF 또/MAJ 혹시/MAG 고객/NNP 만족/NNP 도/JX 조사/NNG 전화/NNG 를/JKO 받/VV 으시/EP 면/EC 매우/MAG 동의/NNG 하/XSV ㄴ다/EC 로/NNG 칭찬/NNG 하/XSV 아/EC 주/VX 시/EP 어요/EC 조금/NNG 은/JX 쌀쌀/XR 하/XSA ㄴ/ETM 10월/NNP 의/JKG 첫/MM 주/NNB 이/VCP ㅂ니다/EF ./SF 환절기/NNG 감기/NNP 조심/NNG 하/XSV 시/EP 고/EC 따듯/XR 하/XSA ㄴ/ETM 차/NNG 와/JC 함께/MAG 건강/NNG 하/XSV ㄴ/ETM 한/MM 주/NNP 보내/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF XXX/SL 은행/NNP 모란역/NNP XXX/NNP 올림/NNP'" - ] - }, - "execution_count": 124, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "' '.join([i[0] + '/' + i[1] for i in komoran.pos(text)])" - ] - }, - { - "cell_type": "code", - "execution_count": 125, - "metadata": {}, - "outputs": [], - "source": [ - "def do_lang ( openapi_key, text ) :\n", - " openApiURL = \"http://aiopen.etri.re.kr:8000/WiseNLU\"\n", - "\t \n", - " requestJson = { \"access_key\": openapi_key, \"argument\": { \"text\": text, \"analysis_code\": \"morp\" } }\n", - "\t \n", - " http = urllib3.PoolManager()\n", - " response = http.request( \"POST\", openApiURL, headers={\"Content-Type\": \"application/json; charset=UTF-8\"}, body=json.dumps(requestJson))\n", - " \n", - " json_data = json.loads(response.data.decode('utf-8'))\n", - " json_result = json_data[\"result\"]\n", - " \n", - " if json_result == -1:\n", - " json_reason = json_data[\"reason\"]\n", - " if \"Invalid Access Key\" in json_reason:\n", - " logger.info(json_reason)\n", - " logger.info(\"Please check the openapi access key.\")\n", - " sys.exit()\n", - " return \"openapi error - \" + json_reason \n", - " else:\n", - " json_data = json.loads(response.data.decode('utf-8'))\n", - " \n", - " json_return_obj = json_data[\"return_object\"]\n", - " \n", - " return_result = \"\"\n", - " json_sentence = json_return_obj[\"sentence\"]\n", - " for json_morp in json_sentence: \n", - " for morp in json_morp[\"morp\"]:\n", - " return_result = return_result+str(morp[\"lemma\"])+\"/\"+str(morp[\"type\"])+\" \"\n", - "\n", - " return return_result" - ] - }, - { - "cell_type": "code", - "execution_count": 127, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'XXX/SL 고객/NNG 님/XSN 항상/MAG XXX/SL 은행/NNG 모란/NNG 역/NNG 지점/NNG 을/JKO 이용/NNG 하/XSV 어/EC 주/VX 시/EP 는/ETM 고객/NNG 님/XSN 께/JKB 감사/NNG 의/JKG 마음/NNG 을/JKO 전하/VV ㅂ니다/EF ./SF 혹시/MAG 업무/NNG 와/JKB 관련/NNG 하/XSV 어/EC 궁금하/VA ㄴ/ETM 점/NNG 이/JKS 있/VA 으시/EP 면/EC 이/MM 번호/NNG 
로/JKB 연락/NNG 주/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF 성심껏/MAG 돕/VV 아/EC 드리/VX 겠/EP 습니다/EF ./SF 또/MAG 혹시/MAG 고객/NNG 만족/NNG 도/NNG 조사/NNG 전화/NNG 를/JKO 받/VV 으시/EP 면/EC 매우/MAG 동의/NNG 하/XSV ㄴ다/EF 로/JKB 칭찬/NNG 하/XSV 어/EC 주/VX 시/EP 어요/EF 조금/NNG 은/JX 쌀쌀하/VA ㄴ/ETM 10/SN 월/NNB 의/JKG 첫주/NNG 이/VCP ㅂ니다/EF ./SF 환절/NNG 기/XSN 감기/NNG 조심/NNG 하/XSV 시/EP 고/EC 따듯하/VA ㄴ/ETM 차/NNG 와/JC 함께/MAG 건강/NNG 하/XSA ㄴ/ETM 한/MM 주/NNB 보내/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF XXX/SL 은행/NNG 모란/NNG 역/NNG XXX/SL 올림/NNG '" - ] - }, - "execution_count": 127, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "return_result" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "metadata": {}, - "outputs": [], - "source": [ - "ids_to_tokens = collections.OrderedDict(\n", - " [(ids, tok) for tok, ids in vocab.items()])" - ] - }, - { - "cell_type": "code", - "execution_count": 132, - "metadata": {}, - "outputs": [], - "source": [ - "never_split=(\"[UNK]\", \"[SEP]\", \"[PAD]\", \"[CLS]\", \"[MASK]\")" - ] - }, - { - "cell_type": "code", - "execution_count": 133, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 133, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "do_lower_case" - ] - }, - { - "cell_type": "code", - "execution_count": 135, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'../KorBERT/2_bert_download_002_bert_morp_tensorflow/002_bert_morp_tensorflow/vocab.korean_morp.list'" - ] - }, - "execution_count": 135, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vocab_file" - ] - }, - { - "cell_type": "code", - "execution_count": 138, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 138, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "os.path.isdir(vocab_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 139, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "\n", - "from torch.hub import _get_torch_home\n", - "\n", - "torch_cache_home = _get_torch_home()\n", - " \n", - "default_cache_path = os.path.join(torch_cache_home, \"transformers\")\n", - "\n", - "PYTORCH_PRETRAINED_BERT_CACHE = Path(\n", - " os.getenv(\"PYTORCH_TRANSFORMERS_CACHE\", os.getenv(\"PYTORCH_PRETRAINED_BERT_CACHE\", default_cache_path))\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 141, - "metadata": {}, - "outputs": [], - "source": [ - "TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE" - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 142, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "isinstance(vocab_file, Path)" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'C:\\\\Users\\\\jinma\\\\.cache\\\\torch\\\\transformers'" - ] - }, - "execution_count": 144, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "if isinstance(TRANSFORMERS_CACHE, Path):\n", - " cache_dir = str(TRANSFORMERS_CACHE)\n", - "cache_dir" - ] - }, - { - "cell_type": "code", - "execution_count": 148, - "metadata": {}, - "outputs": [], - "source": [ - "from urllib.parse import urlparse\n", - "\n", - "def is_remote_url(url_or_filename):\n", - " parsed = urlparse(url_or_filename)\n", - " return parsed.scheme in ('http', 'https', 
's3')" - ] - }, - { - "cell_type": "code", - "execution_count": 272, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['[CLS]']" - ] - }, - "execution_count": 272, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokens = []\n", - "tokens.append('[CLS]')\n", - "tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 149, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 149, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "is_remote_url(vocab_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 150, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 150, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "os.path.exists(vocab_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 151, - "metadata": {}, - "outputs": [], - "source": [ - "def cached_path(\n", - " url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False, user_agent=None\n", - "):\n", - " if cache_dir is None:\n", - " cache_dir = TRANSFORMERS_CACHE\n", - " if isinstance(url_or_filename, Path):\n", - " url_or_filename = str(url_or_filename)\n", - " if isinstance(cache_dir, Path):\n", - " cache_dir = str(cache_dir)\n", - "\n", - " if is_remote_url(url_or_filename):\n", - " # URL, so get it from the cache (downloading if necessary)\n", - " return get_from_cache(\n", - " url_or_filename,\n", - " cache_dir=cache_dir,\n", - " force_download=force_download,\n", - " proxies=proxies,\n", - " resume_download=resume_download,\n", - " user_agent=user_agent,\n", - " )\n", - " elif os.path.exists(url_or_filename):\n", - " # File, and it exists.\n", - " return url_or_filename\n", - " elif urlparse(url_or_filename).scheme == \"\":\n", - " # File, but it doesn't exist.\n", - " raise EnvironmentError(\"file {} not found\".format(url_or_filename))\n", - " else:\n", - " # Something unknown\n", - " raise ValueError(\"unable to parse {} as a URL or as a local path\".format(url_or_filename))" - ] - }, - { - "cell_type": "code", - "execution_count": 152, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'../KorBERT/2_bert_download_002_bert_morp_tensorflow/002_bert_morp_tensorflow/vocab.korean_morp.list'" - ] - }, - "execution_count": 152, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cached_path(vocab_file) # vocab file 반환" - ] - }, - { - "cell_type": "code", - "execution_count": 94, - "metadata": {}, - "outputs": [], - "source": [ - "def _clean_text(text):\n", - " output = [] # char을 저장할 list 생성\n", - " for char in text:\n", - " # 텍스트에서 Char 단위로 출력\n", - " cp = ord(char)\n", - " if cp == 0 or cp == 0xfffd or _is_control(char):\n", - " # \\x00이거나 �이거나 unicode cat.이 C로 시작할 경우\n", - " # (개행문자 제외) output에 추가하지 않는다.\n", - " continue\n", - " if _is_whitespace(char):\n", - " # 공백일 경우 \" \"으로 output에 추가\n", - " output.append(\" \")\n", - " else:\n", - " # 이 외의 경우 전부 output에 추가\n", - " output.append(char)\n", - " # cleaning 작업을 거친 Text를 후처리하여 반환\n", - " return \"\".join(output)\n", - "\n", - "# char 단위 함수들\n", - "def _is_whitespace(char):\n", - " if char == \" \" or char == '\\t' or char == '\\n' or char == '\\r':\n", - " # 개행문자이거나 띄어쓰기면 True 반환\n", - " return True\n", - " cat = unicodedata.category(char)\n", - " if cat == 'Zs':\n", - " # unicode category가 Space Seperator면 True 반환\n", - " return True\n", - " # 이 외의 경우 전부 False 반환\n", 
- " return False\n", - "\n", - "def _is_control(char):\n", - " if char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n", - " # 개행문자이면 False 반환\n", - " return False\n", - " cat = unicodedata.category(char)\n", - " if cat.startswith(\"C\"):\n", - " # unicode category가\n", - " # Cc(Control) \n", - " # Cf(format)\n", - " # Co(Private Use, is 0)\n", - " # Cs(Surrrogate, is 0)일 경우, True 반환\n", - " return True\n", - " # 이 외의 경우 전부 False 반환\n", - " return False\n", - "\n", - "def _is_punctuation(char):\n", - " # 한국어 형태소 분석기이기 때문에 공백과 같은지 여부만 반환\n", - " return char == ' '" - ] - }, - { - "cell_type": "code", - "execution_count": 157, - "metadata": {}, - "outputs": [], - "source": [ - "def whitespace_tokenize(text):\n", - "\t\"\"\"Runs basic whitespace cleaning and splitting on a peice of text.\"\"\"\n", - "\ttext = text.strip()\n", - "\tif not text:\n", - "\t\treturn []\n", - "\ttokens = text.split()\n", - "\treturn tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": {}, - "outputs": [], - "source": [ - "def print_c(text, is_print):\n", - " if is_print:\n", - " print(text)\n", - " else:\n", - " print(end='')" - ] - }, - { - "cell_type": "code", - "execution_count": 248, - "metadata": {}, - "outputs": [], - "source": [ - "# do_lower_case = False\n", - "do_lower_case = True" - ] - }, - { - "cell_type": "code", - "execution_count": 258, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "곧/NNG\n", - "곧/NNG\n", - "False\n", - "True\n", - "True\n" - ] - } - ], - "source": [ - "print('곧/NNG')\n", - "print(unicodedata.normalize(\"NFD\", '곧/NNG'))\n", - "print('곧/NNG' == unicodedata.normalize(\"NFD\", '곧/NNG'))\n", - "print('곧/NNG' == unicodedata.normalize(\"NFC\",\n", - " unicodedata.normalize(\"NFD\", '곧/NNG')))\n", - "print(unicodedata.normalize(\"NFC\", '곧/NNG') == '곧/NNG')" - ] - }, - { - "cell_type": "code", - "execution_count": 264, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "' --|> Exists in vocab_file.'" - ] - }, - "execution_count": 264, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'{:>30}'.format('--|> Exists in vocab_file.')" - ] - }, - { - "cell_type": "code", - "execution_count": 270, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "************** START TOKENING MORPHLOGY **************\n", - "\n", - "Origin Text: XXX/SL 고객/NNG 님/XSN 항상/MAG XXX/SL 은행/NNG 모란/NNG 역/NNG 지점/NNG 을/JKO 이용/NNG 하/XSV 어/EC 주/VX 시/EP 는/ETM 고객/NNG 님/XSN 께/JKB 감사/NNG 의/JKG 마음/NNG 을/JKO 전하/VV ㅂ니다/EF ./SF 혹시/MAG 업무/NNG 와/JKB 관련/NNG 하/XSV 어/EC 궁금하/VA ㄴ/ETM 점/NNG 이/JKS 있/VA 으시/EP 면/EC 이/MM 번호/NNG 로/JKB 연락/NNG 주/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF 성심껏/MAG 돕/VV 아/EC 드리/VX 겠/EP 습니다/EF ./SF 또/MAG 혹시/MAG 고객/NNG 만족/NNG 도/NNG 조사/NNG 전화/NNG 를/JKO 받/VV 으시/EP 면/EC 매우/MAG 동의/NNG 하/XSV ㄴ다/EF 로/JKB 칭찬/NNG 하/XSV 어/EC 주/VX 시/EP 어요/EF 조금/NNG 은/JX 쌀쌀하/VA ㄴ/ETM 10/SN 월/NNB 의/JKG 첫주/NNG 이/VCP ㅂ니다/EF ./SF 환절/NNG 기/XSN 감기/NNG 조심/NNG 하/XSV 시/EP 고/EC 따듯하/VA ㄴ/ETM 차/NNG 와/JC 함께/MAG 건강/NNG 하/XSA ㄴ/ETM 한/MM 주/NNB 보내/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF XXX/SL 은행/NNG 모란/NNG 역/NNG XXX/SL 올림/NNG \n", - "\n", - "Cleaned Text: XXX/SL 고객/NNG 님/XSN 항상/MAG XXX/SL 은행/NNG 모란/NNG 역/NNG 지점/NNG 을/JKO 이용/NNG 하/XSV 어/EC 주/VX 시/EP 는/ETM 고객/NNG 님/XSN 께/JKB 감사/NNG 의/JKG 마음/NNG 을/JKO 전하/VV ㅂ니다/EF ./SF 혹시/MAG 업무/NNG 와/JKB 관련/NNG 하/XSV 어/EC 궁금하/VA ㄴ/ETM 점/NNG 이/JKS 있/VA 으시/EP 면/EC 이/MM 번호/NNG 로/JKB 연락/NNG 주/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF 성심껏/MAG 돕/VV 아/EC 드리/VX 겠/EP 
습니다/EF ./SF 또/MAG 혹시/MAG 고객/NNG 만족/NNG 도/NNG 조사/NNG 전화/NNG 를/JKO 받/VV 으시/EP 면/EC 매우/MAG 동의/NNG 하/XSV ㄴ다/EF 로/JKB 칭찬/NNG 하/XSV 어/EC 주/VX 시/EP 어요/EF 조금/NNG 은/JX 쌀쌀하/VA ㄴ/ETM 10/SN 월/NNB 의/JKG 첫주/NNG 이/VCP ㅂ니다/EF ./SF 환절/NNG 기/XSN 감기/NNG 조심/NNG 하/XSV 시/EP 고/EC 따듯하/VA ㄴ/ETM 차/NNG 와/JC 함께/MAG 건강/NNG 하/XSA ㄴ/ETM 한/MM 주/NNB 보내/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF XXX/SL 은행/NNG 모란/NNG 역/NNG XXX/SL 올림/NNG \n", - "\n", - "Orig. Tokens: ['XXX/SL', '고객/NNG', '님/XSN', '항상/MAG', 'XXX/SL', '은행/NNG', '모란/NNG', '역/NNG', '지점/NNG', '을/JKO', '이용/NNG', '하/XSV', '어/EC', '주/VX', '시/EP', '는/ETM', '고객/NNG', '님/XSN', '께/JKB', '감사/NNG', '의/JKG', '마음/NNG', '을/JKO', '전하/VV', 'ㅂ니다/EF', './SF', '혹시/MAG', '업무/NNG', '와/JKB', '관련/NNG', '하/XSV', '어/EC', '궁금하/VA', 'ㄴ/ETM', '점/NNG', '이/JKS', '있/VA', '으시/EP', '면/EC', '이/MM', '번호/NNG', '로/JKB', '연락/NNG', '주/VV', '시/EP', '기/ETN', '바라/VV', 'ㅂ니다/EF', './SF', '성심껏/MAG', '돕/VV', '아/EC', '드리/VX', '겠/EP', '습니다/EF', './SF', '또/MAG', '혹시/MAG', '고객/NNG', '만족/NNG', '도/NNG', '조사/NNG', '전화/NNG', '를/JKO', '받/VV', '으시/EP', '면/EC', '매우/MAG', '동의/NNG', '하/XSV', 'ㄴ다/EF', '로/JKB', '칭찬/NNG', '하/XSV', '어/EC', '주/VX', '시/EP', '어요/EF', '조금/NNG', '은/JX', '쌀쌀하/VA', 'ㄴ/ETM', '10/SN', '월/NNB', '의/JKG', '첫주/NNG', '이/VCP', 'ㅂ니다/EF', './SF', '환절/NNG', '기/XSN', '감기/NNG', '조심/NNG', '하/XSV', '시/EP', '고/EC', '따듯하/VA', 'ㄴ/ETM', '차/NNG', '와/JC', '함께/MAG', '건강/NNG', '하/XSA', 'ㄴ/ETM', '한/MM', '주/NNB', '보내/VV', '시/EP', '기/ETN', '바라/VV', 'ㅂ니다/EF', './SF', 'XXX/SL', '은행/NNG', '모란/NNG', '역/NNG', 'XXX/SL', '올림/NNG']\n", - "\n", - "\tstripped accent+norm(NFD) Token : XXX/SL\n", - "\tchars : ['X', 'X', 'X', '/', 'S', 'L']\n", - "\tstripped accent+norm(NFD) Token : 고객/NNG\n", - "\tchars : ['ᄀ', 'ᅩ', 'ᄀ', 'ᅢ', 'ᆨ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 님/XSN\n", - "\tchars : ['ᄂ', 'ᅵ', 'ᆷ', '/', 'X', 'S', 'N']\n", - "\tstripped accent+norm(NFD) Token : 항상/MAG\n", - "\tchars : ['ᄒ', 'ᅡ', 'ᆼ', 'ᄉ', 'ᅡ', 'ᆼ', '/', 'M', 'A', 'G']\n", - "\tstripped accent+norm(NFD) Token : XXX/SL\n", - "\tchars : ['X', 'X', 'X', '/', 'S', 'L']\n", - "\tstripped accent+norm(NFD) Token : 은행/NNG\n", - "\tchars : ['ᄋ', 'ᅳ', 'ᆫ', 'ᄒ', 'ᅢ', 'ᆼ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 모란/NNG\n", - "\tchars : ['ᄆ', 'ᅩ', 'ᄅ', 'ᅡ', 'ᆫ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 역/NNG\n", - "\tchars : ['ᄋ', 'ᅧ', 'ᆨ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 지점/NNG\n", - "\tchars : ['ᄌ', 'ᅵ', 'ᄌ', 'ᅥ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 을/JKO\n", - "\tchars : ['ᄋ', 'ᅳ', 'ᆯ', '/', 'J', 'K', 'O']\n", - "\tstripped accent+norm(NFD) Token : 이용/NNG\n", - "\tchars : ['ᄋ', 'ᅵ', 'ᄋ', 'ᅭ', 'ᆼ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 하/XSV\n", - "\tchars : ['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']\n", - "\tstripped accent+norm(NFD) Token : 어/EC\n", - "\tchars : ['ᄋ', 'ᅥ', '/', 'E', 'C']\n", - "\tstripped accent+norm(NFD) Token : 주/VX\n", - "\tchars : ['ᄌ', 'ᅮ', '/', 'V', 'X']\n", - "\tstripped accent+norm(NFD) Token : 시/EP\n", - "\tchars : ['ᄉ', 'ᅵ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 는/ETM\n", - "\tchars : ['ᄂ', 'ᅳ', 'ᆫ', '/', 'E', 'T', 'M']\n", - "\tstripped accent+norm(NFD) Token : 고객/NNG\n", - "\tchars : ['ᄀ', 'ᅩ', 'ᄀ', 'ᅢ', 'ᆨ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 님/XSN\n", - "\tchars : ['ᄂ', 'ᅵ', 'ᆷ', '/', 'X', 'S', 'N']\n", - "\tstripped accent+norm(NFD) Token : 께/JKB\n", - "\tchars : ['ᄁ', 'ᅦ', '/', 'J', 'K', 'B']\n", - "\tstripped accent+norm(NFD) Token : 감사/NNG\n", - "\tchars : 
['ᄀ', 'ᅡ', 'ᆷ', 'ᄉ', 'ᅡ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 의/JKG\n", - "\tchars : ['ᄋ', 'ᅴ', '/', 'J', 'K', 'G']\n", - "\tstripped accent+norm(NFD) Token : 마음/NNG\n", - "\tchars : ['ᄆ', 'ᅡ', 'ᄋ', 'ᅳ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 을/JKO\n", - "\tchars : ['ᄋ', 'ᅳ', 'ᆯ', '/', 'J', 'K', 'O']\n", - "\tstripped accent+norm(NFD) Token : 전하/VV\n", - "\tchars : ['ᄌ', 'ᅥ', 'ᆫ', 'ᄒ', 'ᅡ', '/', 'V', 'V']\n", - "\tstripped accent+norm(NFD) Token : ㅂ니다/EF\n", - "\tchars : ['ㅂ', 'ᄂ', 'ᅵ', 'ᄃ', 'ᅡ', '/', 'E', 'F']\n", - "\tstripped accent+norm(NFD) Token : ./SF\n", - "\tchars : ['.', '/', 'S', 'F']\n", - "\tstripped accent+norm(NFD) Token : 혹시/MAG\n", - "\tchars : ['ᄒ', 'ᅩ', 'ᆨ', 'ᄉ', 'ᅵ', '/', 'M', 'A', 'G']\n", - "\tstripped accent+norm(NFD) Token : 업무/NNG\n", - "\tchars : ['ᄋ', 'ᅥ', 'ᆸ', 'ᄆ', 'ᅮ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 와/JKB\n", - "\tchars : ['ᄋ', 'ᅪ', '/', 'J', 'K', 'B']\n", - "\tstripped accent+norm(NFD) Token : 관련/NNG\n", - "\tchars : ['ᄀ', 'ᅪ', 'ᆫ', 'ᄅ', 'ᅧ', 'ᆫ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 하/XSV\n", - "\tchars : ['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']\n", - "\tstripped accent+norm(NFD) Token : 어/EC\n", - "\tchars : ['ᄋ', 'ᅥ', '/', 'E', 'C']\n", - "\tstripped accent+norm(NFD) Token : 궁금하/VA\n", - "\tchars : ['ᄀ', 'ᅮ', 'ᆼ', 'ᄀ', 'ᅳ', 'ᆷ', 'ᄒ', 'ᅡ', '/', 'V', 'A']\n", - "\tstripped accent+norm(NFD) Token : ㄴ/ETM\n", - "\tchars : ['ㄴ', '/', 'E', 'T', 'M']\n", - "\tstripped accent+norm(NFD) Token : 점/NNG\n", - "\tchars : ['ᄌ', 'ᅥ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 이/JKS\n", - "\tchars : ['ᄋ', 'ᅵ', '/', 'J', 'K', 'S']\n", - "\tstripped accent+norm(NFD) Token : 있/VA\n", - "\tchars : ['ᄋ', 'ᅵ', 'ᆻ', '/', 'V', 'A']\n", - "\tstripped accent+norm(NFD) Token : 으시/EP\n", - "\tchars : ['ᄋ', 'ᅳ', 'ᄉ', 'ᅵ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 면/EC\n", - "\tchars : ['ᄆ', 'ᅧ', 'ᆫ', '/', 'E', 'C']\n", - "\tstripped accent+norm(NFD) Token : 이/MM\n", - "\tchars : ['ᄋ', 'ᅵ', '/', 'M', 'M']\n", - "\tstripped accent+norm(NFD) Token : 번호/NNG\n", - "\tchars : ['ᄇ', 'ᅥ', 'ᆫ', 'ᄒ', 'ᅩ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 로/JKB\n", - "\tchars : ['ᄅ', 'ᅩ', '/', 'J', 'K', 'B']\n", - "\tstripped accent+norm(NFD) Token : 연락/NNG\n", - "\tchars : ['ᄋ', 'ᅧ', 'ᆫ', 'ᄅ', 'ᅡ', 'ᆨ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 주/VV\n", - "\tchars : ['ᄌ', 'ᅮ', '/', 'V', 'V']\n", - "\tstripped accent+norm(NFD) Token : 시/EP\n", - "\tchars : ['ᄉ', 'ᅵ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 기/ETN\n", - "\tchars : ['ᄀ', 'ᅵ', '/', 'E', 'T', 'N']\n", - "\tstripped accent+norm(NFD) Token : 바라/VV\n", - "\tchars : ['ᄇ', 'ᅡ', 'ᄅ', 'ᅡ', '/', 'V', 'V']\n", - "\tstripped accent+norm(NFD) Token : ㅂ니다/EF\n", - "\tchars : ['ㅂ', 'ᄂ', 'ᅵ', 'ᄃ', 'ᅡ', '/', 'E', 'F']\n", - "\tstripped accent+norm(NFD) Token : ./SF\n", - "\tchars : ['.', '/', 'S', 'F']\n", - "\tstripped accent+norm(NFD) Token : 성심껏/MAG\n", - "\tchars : ['ᄉ', 'ᅥ', 'ᆼ', 'ᄉ', 'ᅵ', 'ᆷ', 'ᄁ', 'ᅥ', 'ᆺ', '/', 'M', 'A', 'G']\n", - "\tstripped accent+norm(NFD) Token : 돕/VV\n", - "\tchars : ['ᄃ', 'ᅩ', 'ᆸ', '/', 'V', 'V']\n", - "\tstripped accent+norm(NFD) Token : 아/EC\n", - "\tchars : ['ᄋ', 'ᅡ', '/', 'E', 'C']\n", - "\tstripped accent+norm(NFD) Token : 드리/VX\n", - "\tchars : ['ᄃ', 'ᅳ', 'ᄅ', 'ᅵ', '/', 'V', 'X']\n", - "\tstripped accent+norm(NFD) Token : 겠/EP\n", - "\tchars : ['ᄀ', 'ᅦ', 'ᆻ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 
습니다/EF\n", - "\tchars : ['ᄉ', 'ᅳ', 'ᆸ', 'ᄂ', 'ᅵ', 'ᄃ', 'ᅡ', '/', 'E', 'F']\n", - "\tstripped accent+norm(NFD) Token : ./SF\n", - "\tchars : ['.', '/', 'S', 'F']\n", - "\tstripped accent+norm(NFD) Token : 또/MAG\n", - "\tchars : ['ᄄ', 'ᅩ', '/', 'M', 'A', 'G']\n", - "\tstripped accent+norm(NFD) Token : 혹시/MAG\n", - "\tchars : ['ᄒ', 'ᅩ', 'ᆨ', 'ᄉ', 'ᅵ', '/', 'M', 'A', 'G']\n", - "\tstripped accent+norm(NFD) Token : 고객/NNG\n", - "\tchars : ['ᄀ', 'ᅩ', 'ᄀ', 'ᅢ', 'ᆨ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 만족/NNG\n", - "\tchars : ['ᄆ', 'ᅡ', 'ᆫ', 'ᄌ', 'ᅩ', 'ᆨ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 도/NNG\n", - "\tchars : ['ᄃ', 'ᅩ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 조사/NNG\n", - "\tchars : ['ᄌ', 'ᅩ', 'ᄉ', 'ᅡ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 전화/NNG\n", - "\tchars : ['ᄌ', 'ᅥ', 'ᆫ', 'ᄒ', 'ᅪ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 를/JKO\n", - "\tchars : ['ᄅ', 'ᅳ', 'ᆯ', '/', 'J', 'K', 'O']\n", - "\tstripped accent+norm(NFD) Token : 받/VV\n", - "\tchars : ['ᄇ', 'ᅡ', 'ᆮ', '/', 'V', 'V']\n", - "\tstripped accent+norm(NFD) Token : 으시/EP\n", - "\tchars : ['ᄋ', 'ᅳ', 'ᄉ', 'ᅵ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 면/EC\n", - "\tchars : ['ᄆ', 'ᅧ', 'ᆫ', '/', 'E', 'C']\n", - "\tstripped accent+norm(NFD) Token : 매우/MAG\n", - "\tchars : ['ᄆ', 'ᅢ', 'ᄋ', 'ᅮ', '/', 'M', 'A', 'G']\n", - "\tstripped accent+norm(NFD) Token : 동의/NNG\n", - "\tchars : ['ᄃ', 'ᅩ', 'ᆼ', 'ᄋ', 'ᅴ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 하/XSV\n", - "\tchars : ['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']\n", - "\tstripped accent+norm(NFD) Token : ㄴ다/EF\n", - "\tchars : ['ㄴ', 'ᄃ', 'ᅡ', '/', 'E', 'F']\n", - "\tstripped accent+norm(NFD) Token : 로/JKB\n", - "\tchars : ['ᄅ', 'ᅩ', '/', 'J', 'K', 'B']\n", - "\tstripped accent+norm(NFD) Token : 칭찬/NNG\n", - "\tchars : ['ᄎ', 'ᅵ', 'ᆼ', 'ᄎ', 'ᅡ', 'ᆫ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 하/XSV\n", - "\tchars : ['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']\n", - "\tstripped accent+norm(NFD) Token : 어/EC\n", - "\tchars : ['ᄋ', 'ᅥ', '/', 'E', 'C']\n", - "\tstripped accent+norm(NFD) Token : 주/VX\n", - "\tchars : ['ᄌ', 'ᅮ', '/', 'V', 'X']\n", - "\tstripped accent+norm(NFD) Token : 시/EP\n", - "\tchars : ['ᄉ', 'ᅵ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 어요/EF\n", - "\tchars : ['ᄋ', 'ᅥ', 'ᄋ', 'ᅭ', '/', 'E', 'F']\n", - "\tstripped accent+norm(NFD) Token : 조금/NNG\n", - "\tchars : ['ᄌ', 'ᅩ', 'ᄀ', 'ᅳ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 은/JX\n", - "\tchars : ['ᄋ', 'ᅳ', 'ᆫ', '/', 'J', 'X']\n", - "\tstripped accent+norm(NFD) Token : 쌀쌀하/VA\n", - "\tchars : ['ᄊ', 'ᅡ', 'ᆯ', 'ᄊ', 'ᅡ', 'ᆯ', 'ᄒ', 'ᅡ', '/', 'V', 'A']\n", - "\tstripped accent+norm(NFD) Token : ㄴ/ETM\n", - "\tchars : ['ㄴ', '/', 'E', 'T', 'M']\n", - "\tstripped accent+norm(NFD) Token : 10/SN\n", - "\tchars : ['1', '0', '/', 'S', 'N']\n", - "\tstripped accent+norm(NFD) Token : 월/NNB\n", - "\tchars : ['ᄋ', 'ᅯ', 'ᆯ', '/', 'N', 'N', 'B']\n", - "\tstripped accent+norm(NFD) Token : 의/JKG\n", - "\tchars : ['ᄋ', 'ᅴ', '/', 'J', 'K', 'G']\n", - "\tstripped accent+norm(NFD) Token : 첫주/NNG\n", - "\tchars : ['ᄎ', 'ᅥ', 'ᆺ', 'ᄌ', 'ᅮ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 이/VCP\n", - "\tchars : ['ᄋ', 'ᅵ', '/', 'V', 'C', 'P']\n", - "\tstripped accent+norm(NFD) Token : ㅂ니다/EF\n", - "\tchars : ['ㅂ', 'ᄂ', 'ᅵ', 'ᄃ', 'ᅡ', '/', 'E', 'F']\n", - "\tstripped accent+norm(NFD) Token : ./SF\n", - "\tchars : ['.', '/', 'S', 'F']\n", - "\tstripped 
accent+norm(NFD) Token : 환절/NNG\n", - "\tchars : ['ᄒ', 'ᅪ', 'ᆫ', 'ᄌ', 'ᅥ', 'ᆯ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 기/XSN\n", - "\tchars : ['ᄀ', 'ᅵ', '/', 'X', 'S', 'N']\n", - "\tstripped accent+norm(NFD) Token : 감기/NNG\n", - "\tchars : ['ᄀ', 'ᅡ', 'ᆷ', 'ᄀ', 'ᅵ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 조심/NNG\n", - "\tchars : ['ᄌ', 'ᅩ', 'ᄉ', 'ᅵ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 하/XSV\n", - "\tchars : ['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']\n", - "\tstripped accent+norm(NFD) Token : 시/EP\n", - "\tchars : ['ᄉ', 'ᅵ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 고/EC\n", - "\tchars : ['ᄀ', 'ᅩ', '/', 'E', 'C']\n", - "\tstripped accent+norm(NFD) Token : 따듯하/VA\n", - "\tchars : ['ᄄ', 'ᅡ', 'ᄃ', 'ᅳ', 'ᆺ', 'ᄒ', 'ᅡ', '/', 'V', 'A']\n", - "\tstripped accent+norm(NFD) Token : ㄴ/ETM\n", - "\tchars : ['ㄴ', '/', 'E', 'T', 'M']\n", - "\tstripped accent+norm(NFD) Token : 차/NNG\n", - "\tchars : ['ᄎ', 'ᅡ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 와/JC\n", - "\tchars : ['ᄋ', 'ᅪ', '/', 'J', 'C']\n", - "\tstripped accent+norm(NFD) Token : 함께/MAG\n", - "\tchars : ['ᄒ', 'ᅡ', 'ᆷ', 'ᄁ', 'ᅦ', '/', 'M', 'A', 'G']\n", - "\tstripped accent+norm(NFD) Token : 건강/NNG\n", - "\tchars : ['ᄀ', 'ᅥ', 'ᆫ', 'ᄀ', 'ᅡ', 'ᆼ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 하/XSA\n", - "\tchars : ['ᄒ', 'ᅡ', '/', 'X', 'S', 'A']\n", - "\tstripped accent+norm(NFD) Token : ㄴ/ETM\n", - "\tchars : ['ㄴ', '/', 'E', 'T', 'M']\n", - "\tstripped accent+norm(NFD) Token : 한/MM\n", - "\tchars : ['ᄒ', 'ᅡ', 'ᆫ', '/', 'M', 'M']\n", - "\tstripped accent+norm(NFD) Token : 주/NNB\n", - "\tchars : ['ᄌ', 'ᅮ', '/', 'N', 'N', 'B']\n", - "\tstripped accent+norm(NFD) Token : 보내/VV\n", - "\tchars : ['ᄇ', 'ᅩ', 'ᄂ', 'ᅢ', '/', 'V', 'V']\n", - "\tstripped accent+norm(NFD) Token : 시/EP\n", - "\tchars : ['ᄉ', 'ᅵ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 기/ETN\n", - "\tchars : ['ᄀ', 'ᅵ', '/', 'E', 'T', 'N']\n", - "\tstripped accent+norm(NFD) Token : 바라/VV\n", - "\tchars : ['ᄇ', 'ᅡ', 'ᄅ', 'ᅡ', '/', 'V', 'V']\n", - "\tstripped accent+norm(NFD) Token : ㅂ니다/EF\n", - "\tchars : ['ㅂ', 'ᄂ', 'ᅵ', 'ᄃ', 'ᅡ', '/', 'E', 'F']\n", - "\tstripped accent+norm(NFD) Token : ./SF\n", - "\tchars : ['.', '/', 'S', 'F']\n", - "\tstripped accent+norm(NFD) Token : XXX/SL\n", - "\tchars : ['X', 'X', 'X', '/', 'S', 'L']\n", - "\tstripped accent+norm(NFD) Token : 은행/NNG\n", - "\tchars : ['ᄋ', 'ᅳ', 'ᆫ', 'ᄒ', 'ᅢ', 'ᆼ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 모란/NNG\n", - "\tchars : ['ᄆ', 'ᅩ', 'ᄅ', 'ᅡ', 'ᆫ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 역/NNG\n", - "\tchars : ['ᄋ', 'ᅧ', 'ᆨ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : XXX/SL\n", - "\tchars : ['X', 'X', 'X', '/', 'S', 'L']\n", - "\tstripped accent+norm(NFD) Token : 올림/NNG\n", - "\tchars : ['ᄋ', 'ᅩ', 'ᆯ', 'ᄅ', 'ᅵ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "\n", - "Basic_split_tokens : ['XXX/SL', '고객/NNG', '님/XSN', '항상/MAG', 'XXX/SL', '은행/NNG', '모란/NNG', '역/NNG', '지점/NNG', '을/JKO', '이용/NNG', '하/XSV', '어/EC', '주/VX', '시/EP', '는/ETM', '고객/NNG', '님/XSN', '께/JKB', '감사/NNG', '의/JKG', '마음/NNG', '을/JKO', '전하/VV', 'ㅂ니다/EF', './SF', '혹시/MAG', '업무/NNG', '와/JKB', '관련/NNG', '하/XSV', '어/EC', '궁금하/VA', 'ㄴ/ETM', '점/NNG', '이/JKS', '있/VA', '으시/EP', '면/EC', '이/MM', '번호/NNG', '로/JKB', '연락/NNG', '주/VV', '시/EP', '기/ETN', '바라/VV', 'ㅂ니다/EF', './SF', '성심껏/MAG', '돕/VV', '아/EC', '드리/VX', '겠/EP', '습니다/EF', './SF', '또/MAG', '혹시/MAG', '고객/NNG', '만족/NNG', '도/NNG', '조사/NNG', 
'전화/NNG', '를/JKO', '받/VV', '으시/EP', '면/EC', '매우/MAG', '동의/NNG', '하/XSV', 'ㄴ다/EF', '로/JKB', '칭찬/NNG', '하/XSV', '어/EC', '주/VX', '시/EP', '어요/EF', '조금/NNG', '은/JX', '쌀쌀하/VA', 'ㄴ/ETM', '10/SN', '월/NNB', '의/JKG', '첫주/NNG', '이/VCP', 'ㅂ니다/EF', './SF', '환절/NNG', '기/XSN', '감기/NNG', '조심/NNG', '하/XSV', '시/EP', '고/EC', '따듯하/VA', 'ㄴ/ETM', '차/NNG', '와/JC', '함께/MAG', '건강/NNG', '하/XSA', 'ㄴ/ETM', '한/MM', '주/NNB', '보내/VV', '시/EP', '기/ETN', '바라/VV', 'ㅂ니다/EF', './SF', 'XXX/SL', '은행/NNG', '모란/NNG', '역/NNG', 'XXX/SL', '올림/NNG']\n", - "\n", - "Basic_output Tokens: ['XXX/SL', '고객/NNG', '님/XSN', '항상/MAG', 'XXX/SL', '은행/NNG', '모란/NNG', '역/NNG', '지점/NNG', '을/JKO', '이용/NNG', '하/XSV', '어/EC', '주/VX', '시/EP', '는/ETM', '고객/NNG', '님/XSN', '께/JKB', '감사/NNG', '의/JKG', '마음/NNG', '을/JKO', '전하/VV', 'ㅂ니다/EF', './SF', '혹시/MAG', '업무/NNG', '와/JKB', '관련/NNG', '하/XSV', '어/EC', '궁금하/VA', 'ㄴ/ETM', '점/NNG', '이/JKS', '있/VA', '으시/EP', '면/EC', '이/MM', '번호/NNG', '로/JKB', '연락/NNG', '주/VV', '시/EP', '기/ETN', '바라/VV', 'ㅂ니다/EF', './SF', '성심껏/MAG', '돕/VV', '아/EC', '드리/VX', '겠/EP', '습니다/EF', './SF', '또/MAG', '혹시/MAG', '고객/NNG', '만족/NNG', '도/NNG', '조사/NNG', '전화/NNG', '를/JKO', '받/VV', '으시/EP', '면/EC', '매우/MAG', '동의/NNG', '하/XSV', 'ㄴ다/EF', '로/JKB', '칭찬/NNG', '하/XSV', '어/EC', '주/VX', '시/EP', '어요/EF', '조금/NNG', '은/JX', '쌀쌀하/VA', 'ㄴ/ETM', '10/SN', '월/NNB', '의/JKG', '첫주/NNG', '이/VCP', 'ㅂ니다/EF', './SF', '환절/NNG', '기/XSN', '감기/NNG', '조심/NNG', '하/XSV', '시/EP', '고/EC', '따듯하/VA', 'ㄴ/ETM', '차/NNG', '와/JC', '함께/MAG', '건강/NNG', '하/XSA', 'ㄴ/ETM', '한/MM', '주/NNB', '보내/VV', '시/EP', '기/ETN', '바라/VV', 'ㅂ니다/EF', './SF', 'XXX/SL', '은행/NNG', '모란/NNG', '역/NNG', 'XXX/SL', '올림/NNG']\n", - "\n", - "************** START GREEDY LONGEST MATCH FIRST ALGORITHM **************\n", - "\t ['XXX/SL_']\n", - "\t\t\tXXX/SL_\n", - "\t\t\tXXX/SL\n", - "\t\t\tXXX/S\n", - "\t\t\tXXX/\n", - "\t\t\tXXX\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", - "\t\t\tXX/SL_\n", - "\t\t\tXX/SL\n", - "\t\t\tXX/S\n", - "\t\t\tXX/\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", - "\t\t\tX/SL_\r", - "\t\t\tX/SL_ --|> Exists in vocab_file.\n", - "\t ['고객/NNG_']\n", - "\t\t\t고객/NNG_\r", - "\t\t\t고객/NNG_ --|> Exists in vocab_file.\n", - "\t ['님/XSN_']\n", - "\t\t\t님/XSN_\r", - "\t\t\t님/XSN_ --|> Exists in vocab_file.\n", - "\t ['항상/MAG_']\n", - "\t\t\t항상/MAG_\r", - "\t\t\t항상/MAG_ --|> Exists in vocab_file.\n", - "\t ['XXX/SL_']\n", - "\t\t\tXXX/SL_\n", - "\t\t\tXXX/SL\n", - "\t\t\tXXX/S\n", - "\t\t\tXXX/\n", - "\t\t\tXXX\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", - "\t\t\tXX/SL_\n", - "\t\t\tXX/SL\n", - "\t\t\tXX/S\n", - "\t\t\tXX/\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", - "\t\t\tX/SL_\r", - "\t\t\tX/SL_ --|> Exists in vocab_file.\n", - "\t ['은행/NNG_']\n", - "\t\t\t은행/NNG_\r", - "\t\t\t은행/NNG_ --|> Exists in vocab_file.\n", - "\t ['모란/NNG_']\n", - "\t\t\t모란/NNG_\n", - "\t\t\t모란/NNG\n", - "\t\t\t모란/NN\n", - "\t\t\t모란/N\n", - "\t\t\t모란/\n", - "\t\t\t모란\n", - "\t\t\t모라\n", - "\t\t\t모ᄅ\n", - "\t\t\t모\r", - "\t\t\t모 --|> Exists in vocab_file.\n", - "\t\t\t란/NNG_\r", - "\t\t\t란/NNG_ --|> Exists in vocab_file.\n", - "\t ['역/NNG_']\n", - "\t\t\t역/NNG_\r", - "\t\t\t역/NNG_ --|> Exists in vocab_file.\n", - "\t ['지점/NNG_']\n", - "\t\t\t지점/NNG_\r", - "\t\t\t지점/NNG_ --|> Exists in vocab_file.\n", - "\t ['을/JKO_']\n", - "\t\t\t을/JKO_\r", - "\t\t\t을/JKO_ --|> Exists in vocab_file.\n", - "\t ['이용/NNG_']\n", - "\t\t\t이용/NNG_\r", - "\t\t\t이용/NNG_ --|> Exists in vocab_file.\n", - "\t ['하/XSV_']\n", - 
"\t\t\t하/XSV_\r", - "\t\t\t하/XSV_ --|> Exists in vocab_file.\n", - "\t ['어/EC_']\n", - "\t\t\t어/EC_\r", - "\t\t\t어/EC_ --|> Exists in vocab_file.\n", - "\t ['주/VX_']\n", - "\t\t\t주/VX_\r", - "\t\t\t주/VX_ --|> Exists in vocab_file.\n", - "\t ['시/EP_']\n", - "\t\t\t시/EP_\r", - "\t\t\t시/EP_ --|> Exists in vocab_file.\n", - "\t ['는/ETM_']\n", - "\t\t\t는/ETM_\r", - "\t\t\t는/ETM_ --|> Exists in vocab_file.\n", - "\t ['고객/NNG_']\n", - "\t\t\t고객/NNG_\r", - "\t\t\t고객/NNG_ --|> Exists in vocab_file.\n", - "\t ['님/XSN_']\n", - "\t\t\t님/XSN_\r", - "\t\t\t님/XSN_ --|> Exists in vocab_file.\n", - "\t ['께/JKB_']\n", - "\t\t\t께/JKB_\r", - "\t\t\t께/JKB_ --|> Exists in vocab_file.\n", - "\t ['감사/NNG_']\n", - "\t\t\t감사/NNG_\r", - "\t\t\t감사/NNG_ --|> Exists in vocab_file.\n", - "\t ['의/JKG_']\n", - "\t\t\t의/JKG_\r", - "\t\t\t의/JKG_ --|> Exists in vocab_file.\n", - "\t ['마음/NNG_']\n", - "\t\t\t마음/NNG_\r", - "\t\t\t마음/NNG_ --|> Exists in vocab_file.\n", - "\t ['을/JKO_']\n", - "\t\t\t을/JKO_\r", - "\t\t\t을/JKO_ --|> Exists in vocab_file.\n", - "\t ['전하/VV_']\n", - "\t\t\t전하/VV_\r", - "\t\t\t전하/VV_ --|> Exists in vocab_file.\n", - "\t ['ㅂ니다/EF_']\n", - "\t\t\tㅂ니다/EF_\r", - "\t\t\tㅂ니다/EF_ --|> Exists in vocab_file.\n", - "\t ['./SF_']\n", - "\t\t\t./SF_\r", - "\t\t\t./SF_ --|> Exists in vocab_file.\n", - "\t ['혹시/MAG_']\n", - "\t\t\t혹시/MAG_\r", - "\t\t\t혹시/MAG_ --|> Exists in vocab_file.\n", - "\t ['업무/NNG_']\n", - "\t\t\t업무/NNG_\r", - "\t\t\t업무/NNG_ --|> Exists in vocab_file.\n", - "\t ['와/JKB_']\n", - "\t\t\t와/JKB_\r", - "\t\t\t와/JKB_ --|> Exists in vocab_file.\n", - "\t ['관련/NNG_']\n", - "\t\t\t관련/NNG_\r", - "\t\t\t관련/NNG_ --|> Exists in vocab_file.\n", - "\t ['하/XSV_']\n", - "\t\t\t하/XSV_\r", - "\t\t\t하/XSV_ --|> Exists in vocab_file.\n", - "\t ['어/EC_']\n", - "\t\t\t어/EC_\r", - "\t\t\t어/EC_ --|> Exists in vocab_file.\n", - "\t ['궁금하/VA_']\n", - "\t\t\t궁금하/VA_\r", - "\t\t\t궁금하/VA_ --|> Exists in vocab_file.\n", - "\t ['ㄴ/ETM_']\n", - "\t\t\tㄴ/ETM_\r", - "\t\t\tㄴ/ETM_ --|> Exists in vocab_file.\n", - "\t ['점/NNG_']\n", - "\t\t\t점/NNG_\r", - "\t\t\t점/NNG_ --|> Exists in vocab_file.\n", - "\t ['이/JKS_']\n", - "\t\t\t이/JKS_\r", - "\t\t\t이/JKS_ --|> Exists in vocab_file.\n", - "\t ['있/VA_']\n", - "\t\t\t있/VA_\r", - "\t\t\t있/VA_ --|> Exists in vocab_file.\n", - "\t ['으시/EP_']\n", - "\t\t\t으시/EP_\r", - "\t\t\t으시/EP_ --|> Exists in vocab_file.\n", - "\t ['면/EC_']\n", - "\t\t\t면/EC_\r", - "\t\t\t면/EC_ --|> Exists in vocab_file.\n", - "\t ['이/MM_']\n", - "\t\t\t이/MM_\r", - "\t\t\t이/MM_ --|> Exists in vocab_file.\n", - "\t ['번호/NNG_']\n", - "\t\t\t번호/NNG_\r", - "\t\t\t번호/NNG_ --|> Exists in vocab_file.\n", - "\t ['로/JKB_']\n", - "\t\t\t로/JKB_\r", - "\t\t\t로/JKB_ --|> Exists in vocab_file.\n", - "\t ['연락/NNG_']\n", - "\t\t\t연락/NNG_\r", - "\t\t\t연락/NNG_ --|> Exists in vocab_file.\n", - "\t ['주/VV_']\n", - "\t\t\t주/VV_\r", - "\t\t\t주/VV_ --|> Exists in vocab_file.\n", - "\t ['시/EP_']\n", - "\t\t\t시/EP_\r", - "\t\t\t시/EP_ --|> Exists in vocab_file.\n", - "\t ['기/ETN_']\n", - "\t\t\t기/ETN_\r", - "\t\t\t기/ETN_ --|> Exists in vocab_file.\n", - "\t ['바라/VV_']\n", - "\t\t\t바라/VV_\r", - "\t\t\t바라/VV_ --|> Exists in vocab_file.\n", - "\t ['ㅂ니다/EF_']\n", - "\t\t\tㅂ니다/EF_\r", - "\t\t\tㅂ니다/EF_ --|> Exists in vocab_file.\n", - "\t ['./SF_']\n", - "\t\t\t./SF_\r", - "\t\t\t./SF_ --|> Exists in vocab_file.\n", - "\t ['성심껏/MAG_']\n", - "\t\t\t성심껏/MAG_\n", - "\t\t\t성심껏/MAG\n", - "\t\t\t성심껏/MA\n", - "\t\t\t성심껏/M\n", - "\t\t\t성심껏/\n", - "\t\t\t성심껏\n", - "\t\t\t성심꺼\n", - "\t\t\t성심ᄁ\n", - "\t\t\t성심\n", - "\t\t\t성시\n", - "\t\t\t성ᄉ\n", - 
"\t\t\t성\r", - "\t\t\t성 --|> Exists in vocab_file.\n", - "\t\t\t심껏/MAG_\n", - "\t\t\t심껏/MAG\n", - "\t\t\t심껏/MA\n", - "\t\t\t심껏/M\n", - "\t\t\t심껏/\n", - "\t\t\t심껏\n", - "\t\t\t심꺼\n", - "\t\t\t심ᄁ\n", - "\t\t\t심\r", - "\t\t\t심 --|> Exists in vocab_file.\n", - "\t\t\t껏/MAG_\r", - "\t\t\t껏/MAG_ --|> Exists in vocab_file.\n", - "\t ['돕/VV_']\n", - "\t\t\t돕/VV_\r", - "\t\t\t돕/VV_ --|> Exists in vocab_file.\n", - "\t ['아/EC_']\n", - "\t\t\t아/EC_\r", - "\t\t\t아/EC_ --|> Exists in vocab_file.\n", - "\t ['드리/VX_']\n", - "\t\t\t드리/VX_\r", - "\t\t\t드리/VX_ --|> Exists in vocab_file.\n", - "\t ['겠/EP_']\n", - "\t\t\t겠/EP_\r", - "\t\t\t겠/EP_ --|> Exists in vocab_file.\n", - "\t ['습니다/EF_']\n", - "\t\t\t습니다/EF_\r", - "\t\t\t습니다/EF_ --|> Exists in vocab_file.\n", - "\t ['./SF_']\n", - "\t\t\t./SF_\r", - "\t\t\t./SF_ --|> Exists in vocab_file.\n", - "\t ['또/MAG_']\n", - "\t\t\t또/MAG_\r", - "\t\t\t또/MAG_ --|> Exists in vocab_file.\n", - "\t ['혹시/MAG_']\n", - "\t\t\t혹시/MAG_\r", - "\t\t\t혹시/MAG_ --|> Exists in vocab_file.\n", - "\t ['고객/NNG_']\n", - "\t\t\t고객/NNG_\r", - "\t\t\t고객/NNG_ --|> Exists in vocab_file.\n", - "\t ['만족/NNG_']\n", - "\t\t\t만족/NNG_\r", - "\t\t\t만족/NNG_ --|> Exists in vocab_file.\n", - "\t ['도/NNG_']\n", - "\t\t\t도/NNG_\r", - "\t\t\t도/NNG_ --|> Exists in vocab_file.\n", - "\t ['조사/NNG_']\n", - "\t\t\t조사/NNG_\r", - "\t\t\t조사/NNG_ --|> Exists in vocab_file.\n", - "\t ['전화/NNG_']\n", - "\t\t\t전화/NNG_\r", - "\t\t\t전화/NNG_ --|> Exists in vocab_file.\n", - "\t ['를/JKO_']\n", - "\t\t\t를/JKO_\r", - "\t\t\t를/JKO_ --|> Exists in vocab_file.\n", - "\t ['받/VV_']\n", - "\t\t\t받/VV_\r", - "\t\t\t받/VV_ --|> Exists in vocab_file.\n", - "\t ['으시/EP_']\n", - "\t\t\t으시/EP_\r", - "\t\t\t으시/EP_ --|> Exists in vocab_file.\n", - "\t ['면/EC_']\n", - "\t\t\t면/EC_\r", - "\t\t\t면/EC_ --|> Exists in vocab_file.\n", - "\t ['매우/MAG_']\n", - "\t\t\t매우/MAG_\r", - "\t\t\t매우/MAG_ --|> Exists in vocab_file.\n", - "\t ['동의/NNG_']\n", - "\t\t\t동의/NNG_\r", - "\t\t\t동의/NNG_ --|> Exists in vocab_file.\n", - "\t ['하/XSV_']\n", - "\t\t\t하/XSV_\r", - "\t\t\t하/XSV_ --|> Exists in vocab_file.\n", - "\t ['ㄴ다/EF_']\n", - "\t\t\tㄴ다/EF_\r", - "\t\t\tㄴ다/EF_ --|> Exists in vocab_file.\n", - "\t ['로/JKB_']\n", - "\t\t\t로/JKB_\r", - "\t\t\t로/JKB_ --|> Exists in vocab_file.\n", - "\t ['칭찬/NNG_']\n", - "\t\t\t칭찬/NNG_\r", - "\t\t\t칭찬/NNG_ --|> Exists in vocab_file.\n", - "\t ['하/XSV_']\n", - "\t\t\t하/XSV_\r", - "\t\t\t하/XSV_ --|> Exists in vocab_file.\n", - "\t ['어/EC_']\n", - "\t\t\t어/EC_\r", - "\t\t\t어/EC_ --|> Exists in vocab_file.\n", - "\t ['주/VX_']\n", - "\t\t\t주/VX_\r", - "\t\t\t주/VX_ --|> Exists in vocab_file.\n", - "\t ['시/EP_']\n", - "\t\t\t시/EP_\r", - "\t\t\t시/EP_ --|> Exists in vocab_file.\n", - "\t ['어요/EF_']\n", - "\t\t\t어요/EF_\r", - "\t\t\t어요/EF_ --|> Exists in vocab_file.\n", - "\t ['조금/NNG_']\n", - "\t\t\t조금/NNG_\r", - "\t\t\t조금/NNG_ --|> Exists in vocab_file.\n", - "\t ['은/JX_']\n", - "\t\t\t은/JX_\r", - "\t\t\t은/JX_ --|> Exists in vocab_file.\n", - "\t ['쌀쌀하/VA_']\n", - "\t\t\t쌀쌀하/VA_\n", - "\t\t\t쌀쌀하/VA\n", - "\t\t\t쌀쌀하/V\n", - "\t\t\t쌀쌀하/\n", - "\t\t\t쌀쌀하\n", - "\t\t\t쌀쌀ᄒ\n", - "\t\t\t쌀쌀\n", - "\t\t\t쌀싸\n", - "\t\t\t쌀ᄊ\n", - "\t\t\t쌀\r", - "\t\t\t쌀 --|> Exists in vocab_file.\n", - "\t\t\t쌀하/VA_\n", - "\t\t\t쌀하/VA\n", - "\t\t\t쌀하/V\n", - "\t\t\t쌀하/\n", - "\t\t\t쌀하\n", - "\t\t\t쌀ᄒ\n", - "\t\t\t쌀\r", - "\t\t\t쌀 --|> Exists in vocab_file.\n", - "\t\t\t하/VA_\r", - "\t\t\t하/VA_ --|> Exists in vocab_file.\n", - "\t ['ㄴ/ETM_']\n", - "\t\t\tㄴ/ETM_\r", - "\t\t\tㄴ/ETM_ --|> Exists in vocab_file.\n", - "\t ['10/SN_']\n", - 
"\t\t\t10/SN_\r", - "\t\t\t10/SN_ --|> Exists in vocab_file.\n", - "\t ['월/NNB_']\n", - "\t\t\t월/NNB_\r", - "\t\t\t월/NNB_ --|> Exists in vocab_file.\n", - "\t ['의/JKG_']\n", - "\t\t\t의/JKG_\r", - "\t\t\t의/JKG_ --|> Exists in vocab_file.\n", - "\t ['첫주/NNG_']\n", - "\t\t\t첫주/NNG_\n", - "\t\t\t첫주/NNG\n", - "\t\t\t첫주/NN\n", - "\t\t\t첫주/N\n", - "\t\t\t첫주/\n", - "\t\t\t첫주\n", - "\t\t\t첫ᄌ\n", - "\t\t\t첫\r", - "\t\t\t첫 --|> Exists in vocab_file.\n", - "\t\t\t주/NNG_\r", - "\t\t\t주/NNG_ --|> Exists in vocab_file.\n", - "\t ['이/VCP_']\n", - "\t\t\t이/VCP_\r", - "\t\t\t이/VCP_ --|> Exists in vocab_file.\n", - "\t ['ㅂ니다/EF_']\n", - "\t\t\tㅂ니다/EF_\r", - "\t\t\tㅂ니다/EF_ --|> Exists in vocab_file.\n", - "\t ['./SF_']\n", - "\t\t\t./SF_\r", - "\t\t\t./SF_ --|> Exists in vocab_file.\n", - "\t ['환절/NNG_']\n", - "\t\t\t환절/NNG_\n", - "\t\t\t환절/NNG\n", - "\t\t\t환절/NN\n", - "\t\t\t환절/N\n", - "\t\t\t환절/\n", - "\t\t\t환절\n", - "\t\t\t환저\n", - "\t\t\t환ᄌ\n", - "\t\t\t환\r", - "\t\t\t환 --|> Exists in vocab_file.\n", - "\t\t\t절/NNG_\r", - "\t\t\t절/NNG_ --|> Exists in vocab_file.\n", - "\t ['기/XSN_']\n", - "\t\t\t기/XSN_\r", - "\t\t\t기/XSN_ --|> Exists in vocab_file.\n", - "\t ['감기/NNG_']\n", - "\t\t\t감기/NNG_\r", - "\t\t\t감기/NNG_ --|> Exists in vocab_file.\n", - "\t ['조심/NNG_']\n", - "\t\t\t조심/NNG_\r", - "\t\t\t조심/NNG_ --|> Exists in vocab_file.\n", - "\t ['하/XSV_']\n", - "\t\t\t하/XSV_\r", - "\t\t\t하/XSV_ --|> Exists in vocab_file.\n", - "\t ['시/EP_']\n", - "\t\t\t시/EP_\r", - "\t\t\t시/EP_ --|> Exists in vocab_file.\n", - "\t ['고/EC_']\n", - "\t\t\t고/EC_\r", - "\t\t\t고/EC_ --|> Exists in vocab_file.\n", - "\t ['따듯하/VA_']\n", - "\t\t\t따듯하/VA_\n", - "\t\t\t따듯하/VA\n", - "\t\t\t따듯하/V\n", - "\t\t\t따듯하/\n", - "\t\t\t따듯하\n", - "\t\t\t따듯ᄒ\n", - "\t\t\t따듯\n", - "\t\t\t따드\n", - "\t\t\t따ᄃ\n", - "\t\t\t따\r", - "\t\t\t따 --|> Exists in vocab_file.\n", - "\t\t\t듯하/VA_\r", - "\t\t\t듯하/VA_ --|> Exists in vocab_file.\n", - "\t ['ㄴ/ETM_']\n", - "\t\t\tㄴ/ETM_\r", - "\t\t\tㄴ/ETM_ --|> Exists in vocab_file.\n", - "\t ['차/NNG_']\n", - "\t\t\t차/NNG_\r", - "\t\t\t차/NNG_ --|> Exists in vocab_file.\n", - "\t ['와/JC_']\n", - "\t\t\t와/JC_\r", - "\t\t\t와/JC_ --|> Exists in vocab_file.\n", - "\t ['함께/MAG_']\n", - "\t\t\t함께/MAG_\r", - "\t\t\t함께/MAG_ --|> Exists in vocab_file.\n", - "\t ['건강/NNG_']\n", - "\t\t\t건강/NNG_\r", - "\t\t\t건강/NNG_ --|> Exists in vocab_file.\n", - "\t ['하/XSA_']\n", - "\t\t\t하/XSA_\r", - "\t\t\t하/XSA_ --|> Exists in vocab_file.\n", - "\t ['ㄴ/ETM_']\n", - "\t\t\tㄴ/ETM_\r", - "\t\t\tㄴ/ETM_ --|> Exists in vocab_file.\n", - "\t ['한/MM_']\n", - "\t\t\t한/MM_\r", - "\t\t\t한/MM_ --|> Exists in vocab_file.\n", - "\t ['주/NNB_']\n", - "\t\t\t주/NNB_\r", - "\t\t\t주/NNB_ --|> Exists in vocab_file.\n", - "\t ['보내/VV_']\n", - "\t\t\t보내/VV_\r", - "\t\t\t보내/VV_ --|> Exists in vocab_file.\n", - "\t ['시/EP_']\n", - "\t\t\t시/EP_\r", - "\t\t\t시/EP_ --|> Exists in vocab_file.\n", - "\t ['기/ETN_']\n", - "\t\t\t기/ETN_\r", - "\t\t\t기/ETN_ --|> Exists in vocab_file.\n", - "\t ['바라/VV_']\n", - "\t\t\t바라/VV_\r", - "\t\t\t바라/VV_ --|> Exists in vocab_file.\n", - "\t ['ㅂ니다/EF_']\n", - "\t\t\tㅂ니다/EF_\r", - "\t\t\tㅂ니다/EF_ --|> Exists in vocab_file.\n", - "\t ['./SF_']\n", - "\t\t\t./SF_\r", - "\t\t\t./SF_ --|> Exists in vocab_file.\n", - "\t ['XXX/SL_']\n", - "\t\t\tXXX/SL_\n", - "\t\t\tXXX/SL\n", - "\t\t\tXXX/S\n", - "\t\t\tXXX/\n", - "\t\t\tXXX\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", - "\t\t\tXX/SL_\n", - "\t\t\tXX/SL\n", - "\t\t\tXX/S\n", - "\t\t\tXX/\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", 
- "\t\t\tX/SL_\r", - "\t\t\tX/SL_ --|> Exists in vocab_file.\n", - "\t ['은행/NNG_']\n", - "\t\t\t은행/NNG_\r", - "\t\t\t은행/NNG_ --|> Exists in vocab_file.\n", - "\t ['모란/NNG_']\n", - "\t\t\t모란/NNG_\n", - "\t\t\t모란/NNG\n", - "\t\t\t모란/NN\n", - "\t\t\t모란/N\n", - "\t\t\t모란/\n", - "\t\t\t모란\n", - "\t\t\t모라\n", - "\t\t\t모ᄅ\n", - "\t\t\t모\r", - "\t\t\t모 --|> Exists in vocab_file.\n", - "\t\t\t란/NNG_\r", - "\t\t\t란/NNG_ --|> Exists in vocab_file.\n", - "\t ['역/NNG_']\n", - "\t\t\t역/NNG_\r", - "\t\t\t역/NNG_ --|> Exists in vocab_file.\n", - "\t ['XXX/SL_']\n", - "\t\t\tXXX/SL_\n", - "\t\t\tXXX/SL\n", - "\t\t\tXXX/S\n", - "\t\t\tXXX/\n", - "\t\t\tXXX\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", - "\t\t\tXX/SL_\n", - "\t\t\tXX/SL\n", - "\t\t\tXX/S\n", - "\t\t\tXX/\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", - "\t\t\tX/SL_\r", - "\t\t\tX/SL_ --|> Exists in vocab_file.\n", - "\t ['올림/NNG_']\n", - "\t\t\t올림/NNG_\r", - "\t\t\t올림/NNG_ --|> Exists in vocab_file.\n", - "\n", - "Total_split_tokens : ['X', 'X', 'X/SL_', '고객/NNG_', '님/XSN_', '항상/MAG_', 'X', 'X', 'X/SL_', '은행/NNG_', '모', '란/NNG_', '역/NNG_', '지점/NNG_', '을/JKO_', '이용/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '는/ETM_', '고객/NNG_', '님/XSN_', '께/JKB_', '감사/NNG_', '의/JKG_', '마음/NNG_', '을/JKO_', '전하/VV_', 'ㅂ니다/EF_', './SF_', '혹시/MAG_', '업무/NNG_', '와/JKB_', '관련/NNG_', '하/XSV_', '어/EC_', '궁금하/VA_', 'ㄴ/ETM_', '점/NNG_', '이/JKS_', '있/VA_', '으시/EP_', '면/EC_', '이/MM_', '번호/NNG_', '로/JKB_', '연락/NNG_', '주/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', '성', '심', '껏/MAG_', '돕/VV_', '아/EC_', '드리/VX_', '겠/EP_', '습니다/EF_', './SF_', '또/MAG_', '혹시/MAG_', '고객/NNG_', '만족/NNG_', '도/NNG_', '조사/NNG_', '전화/NNG_', '를/JKO_', '받/VV_', '으시/EP_', '면/EC_', '매우/MAG_', '동의/NNG_', '하/XSV_', 'ㄴ다/EF_', '로/JKB_', '칭찬/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '어요/EF_', '조금/NNG_', '은/JX_', '쌀', '쌀', '하/VA_', 'ㄴ/ETM_', '10/SN_', '월/NNB_', '의/JKG_', '첫', '주/NNG_', '이/VCP_', 'ㅂ니다/EF_', './SF_', '환', '절/NNG_', '기/XSN_', '감기/NNG_', '조심/NNG_', '하/XSV_', '시/EP_', '고/EC_', '따', '듯하/VA_', 'ㄴ/ETM_', '차/NNG_', '와/JC_', '함께/MAG_', '건강/NNG_', '하/XSA_', 'ㄴ/ETM_', '한/MM_', '주/NNB_', '보내/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', 'X', 'X', 'X/SL_', '은행/NNG_', '모', '란/NNG_', '역/NNG_', 'X', 'X', 'X/SL_', '올림/NNG_']\n" - ] - } - ], - "source": [ - "# FullTokenizer.tokenize(); End2End Tokenizer\n", - "text = copy.copy(return_result) # text 초기화\n", - "print('************** START TOKENING MORPHLOGY **************')\n", - "\n", - "# BasicTokenizer.tokenize()\n", - "print('\\nOrigin Text: ', text)\n", - "# text = convert_to_unicode(text)\n", - "text = _clean_text(text)\n", - "print('\\nCleaned Text: ', text)\n", - "# fn: whitespace_tokenize()\n", - "orig_tokens = whitespace_tokenize(text)\n", - "print('\\nOrig. 
Tokens: ', orig_tokens, end='\\n\\n')\n", - "Basic_split_tokens = []\n", - "for token in orig_tokens:\n", - " if (do_lower_case) and (token not in never_split):\n", - "# token = token.lower()\n", - " # fn: _run_strip_accents\n", - " t = unicodedata.normalize(\"NFD\", token)\n", - " # https://gist.github.com/Pusnow/aa865fa21f9557fa58d691a8b79f8a6d\n", - " # 모든 음절을 정준 분해(Canonical Decomposition)시킴\n", - " # '각'을 'ㄱ+ㅏ+ㄱ'으로 저장(출력되는 값은 동일)\n", - " output = []\n", - " for char in t:\n", - " cat = unicodedata.category(char)\n", - " if cat == \"Mn\":\n", - " # unicode category가 \"Mark, Nonspacing\"일 경우 pass\n", - " continue\n", - " output.append(char)\n", - " token = \"\".join(output)\n", - " print('\\tstripped accent+norm(NFD) Token : '+t)\n", - " # fn: _run_split_on_punc()\n", - " if text in never_split:\n", - " token = [text]\n", - " else:\n", - " chars = list(token)\n", - " i, start_new_word = 0, True\n", - " output = []\n", - " print('\\tchars : '+str(chars))\n", - " while i < len(chars):\n", - " char = chars[i]\n", - " if _is_punctuation(char):\n", - " # 공백이면 [\" \"]를 추가하고 새로운 단어로 시작\n", - " output.append([char])\n", - " start_new_word = True\n", - " else:\n", - " # 공백이 아닐 경우,\n", - " if start_new_word:\n", - " # 새로운 문자로 시작할 경우 빈 리스트 추가.\n", - " output.append([])\n", - " # 해당 단어부터 시작하도록 start_new_word는 False로 setting.\n", - " start_new_word = False\n", - " # 위에 추가한 빈 리스트에 각각 character를 채워넣음\n", - " output[-1].append(char)\n", - " i += 1\n", - " token = [\"\".join(x) for x in output]\n", - " Basic_split_tokens.extend(token)\n", - "print('\\nBasic_split_tokens : ', Basic_split_tokens)\n", - "Basic_output_tokens = whitespace_tokenize((' '.join(Basic_split_tokens)).strip())\n", - "print('\\nBasic_output Tokens: ', Basic_output_tokens, end='\\n\\n')\n", - "\n", - "Total_split_tokens = [] # 최종 tokenize 결과 저장\n", - "print('************** START GREEDY LONGEST MATCH FIRST ALGORITHM **************')\n", - "for tokens in Basic_output_tokens:\n", - " tokens += '_' # adding '_'\n", - " # WordpieceTokenizer.tokenize()\n", - " unk_token = \"[UNK]\"\n", - " max_input_chars_per_word = 100\n", - " # greedy longest-match-first algorithm to perform tokenization\n", - " # using the given vocabulary\n", - " tokens = convert_to_unicode(tokens)\n", - " WP_output_tokens = []\n", - " # fn: whitespace_tokenize\n", - " tokens = whitespace_tokenize(tokens)\n", - " # start lmf algorithm!\n", - " print('\\t', tokens)\n", - " for token in tokens:\n", - " chars = list(token)\n", - " if len(chars) > max_input_chars_per_word: # 100\n", - " # max word로 설정한 글자 수를 넘길 경우, UNK 처리\n", - " WP_output_tokens.append(unk_token)\n", - " continue\n", - " is_bad = False\n", - " start = 0\n", - " sub_tokens = []\n", - " while start < len(chars):\n", - " end = len(chars)\n", - " cur_substr = None\n", - " # 첫번째 글자부터 천천히 vocab에 있는 단어인지 체크\n", - " while start < end:\n", - " substr = \"\".join(chars[start:end])\n", - " # do_lower_case == True일 경우에\n", - " # 위에서 Canonical Decomposition 과정을 거쳤기 때문에\n", - " # 이를 다시 Composition해줘야 vocab의 단어와 비교 가능하다.\n", - " substr = unicodedata.normalize('NFC', substr)\n", - " print('\\t\\t\\t'+substr, end='')\n", - " if substr in vocab:\n", - " # 만약 해당 단어가 vocab에 있다면 해당 단어로 break\n", - " cur_substr = substr\n", - " print('\\r\\t\\t\\t{:<15}{}'.format(\n", - " cur_substr, '--|> Exists in vocab_file.'))\n", - " break\n", - " end -= 1\n", - " print()\n", - " # 만일 못찾았을 경우, (1)로 가서 [UNK] 처리.\n", - " if cur_substr is None:\n", - " is_bad = True\n", - " break\n", - " sub_tokens.append(cur_substr)\n", - " # 어미를 추가하기 위해 
start에 end값을 할당\n", - " start = end\n", - " if is_bad: # --- (1)\n", - " WP_output_tokens.append(unk_token)\n", - " else:\n", - " # 정상적으로 끝났다면 sub_token을 결과값에 할당\n", - " WP_output_tokens.extend(sub_tokens)\n", - " for sub_token in WP_output_tokens:\n", - " Total_split_tokens.append(sub_token)\n", - "print('\\nTotal_split_tokens : ', Total_split_tokens)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "단어 사전에 있는대로 짤라버린다." - ] - }, - { - "cell_type": "code", - "execution_count": 178, - "metadata": {}, - "outputs": [], - "source": [ - "tokens_a = Total_split_tokens " - ] - }, - { - "cell_type": "code", - "execution_count": 181, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['X', 'X', 'X/SL_', '고객/NNG_', '님/XSN_', '항상/MAG_', 'X', 'X', 'X/SL_', '은행/NNG_', '모', '란/NNG_', '역/NNG_', '지점/NNG_', '을/JKO_', '이용/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '는/ETM_', '고객/NNG_', '님/XSN_', '께/JKB_', '감사/NNG_', '의/JKG_', '마음/NNG_', '을/JKO_', '전하/VV_', 'ㅂ니다/EF_', './SF_', '혹시/MAG_', '업무/NNG_', '와/JKB_', '관련/NNG_', '하/XSV_', '어/EC_', '궁금하/VA_', 'ㄴ/ETM_', '점/NNG_', '이/JKS_', '있/VA_', '으시/EP_', '면/EC_', '이/MM_', '번호/NNG_', '로/JKB_', '연락/NNG_', '주/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', '성', '심', '껏/MAG_', '돕/VV_', '아/EC_', '드리/VX_', '겠/EP_', '습니다/EF_', './SF_', '또/MAG_', '혹시/MAG_', '고객/NNG_', '만족/NNG_', '도/NNG_', '조사/NNG_', '전화/NNG_', '를/JKO_', '받/VV_', '으시/EP_', '면/EC_', '매우/MAG_', '동의/NNG_', '하/XSV_', 'ㄴ다/EF_', '로/JKB_', '칭찬/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '어요/EF_', '조금/NNG_', '은/JX_', '쌀', '쌀', '하/VA_', 'ㄴ/ETM_', '10/SN_', '월/NNB_', '의/JKG_', '첫', '주/NNG_', '이/VCP_', 'ㅂ니다/EF_', './SF_', '환', '절/NNG_', '기/XSN_', '감기/NNG_', '조심/NNG_', '하/XSV_', '시/EP_', '고/EC_', '따', '듯하/VA_', 'ㄴ/ETM_', '차/NNG_', '와/JC_', '함께/MAG_', '건강/NNG_', '하/XSA_', 'ㄴ/ETM_', '한/MM_', '주/NNB_', '보내/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', 'X', 'X', 'X/SL_', '은행/NNG_', '모', '란/NNG_', '역/NNG_', 'X', 'X', 'X/SL_', '올림/NNG_']\n", - "cutting\n", - "['X', 'X', 'X/SL_', '고객/NNG_', '님/XSN_', '항상/MAG_', 'X', 'X', 'X/SL_', '은행/NNG_', '모', '란/NNG_', '역/NNG_', '지점/NNG_', '을/JKO_', '이용/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '는/ETM_', '고객/NNG_', '님/XSN_', '께/JKB_', '감사/NNG_', '의/JKG_', '마음/NNG_', '을/JKO_', '전하/VV_', 'ㅂ니다/EF_', './SF_', '혹시/MAG_', '업무/NNG_', '와/JKB_', '관련/NNG_', '하/XSV_', '어/EC_', '궁금하/VA_', 'ㄴ/ETM_', '점/NNG_', '이/JKS_', '있/VA_', '으시/EP_', '면/EC_', '이/MM_', '번호/NNG_', '로/JKB_', '연락/NNG_', '주/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', '성', '심', '껏/MAG_', '돕/VV_', '아/EC_', '드리/VX_', '겠/EP_', '습니다/EF_', './SF_', '또/MAG_', '혹시/MAG_', '고객/NNG_', '만족/NNG_', '도/NNG_', '조사/NNG_', '전화/NNG_', '를/JKO_', '받/VV_', '으시/EP_', '면/EC_', '매우/MAG_', '동의/NNG_', '하/XSV_', 'ㄴ다/EF_', '로/JKB_', '칭찬/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '어요/EF_', '조금/NNG_', '은/JX_', '쌀', '쌀', '하/VA_', 'ㄴ/ETM_', '10/SN_', '월/NNB_', '의/JKG_', '첫', '주/NNG_', '이/VCP_', 'ㅂ니다/EF_', './SF_', '환', '절/NNG_', '기/XSN_', '감기/NNG_', '조심/NNG_', '하/XSV_', '시/EP_', '고/EC_', '따', '듯하/VA_', 'ㄴ/ETM_', '차/NNG_', '와/JC_', '함께/MAG_', '건강/NNG_', '하/XSA_', 'ㄴ/ETM_', '한/MM_', '주/NNB_', '보내/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', 'X', 'X']\n" - ] - } - ], - "source": [ - "print(tokens_a)\n", - "if len(tokens_a) > max_seq_length - 2:\n", - " print('cutting')\n", - " tokens_a = tokens_a[:max_seq_length-2]\n", - "print(tokens_a)" - ] - }, - { - "cell_type": "code", - "execution_count": 203, - "metadata": {}, - "outputs": [], - "source": [ - "tokens = 
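The long cell above re-implements the KorBERT `FullTokenizer` inline: basic cleaning, whitespace splitting and NFD accent stripping, then WordPiece's greedy longest-match-first lookup against the morpheme vocabulary (each morpheme gets a trailing `_` before matching). A compact, self-contained sketch of that second stage, assuming `vocab` is a set or dict of vocabulary entries:

```python
import unicodedata

def wordpiece_greedy(token, vocab, unk_token='[UNK]', max_chars=100):
    # Greedy longest-match-first: repeatedly take the longest prefix of the
    # remaining characters found in the vocab; if none is found, emit [UNK].
    chars = list(unicodedata.normalize('NFD', token))
    if len(chars) > max_chars:
        return [unk_token]
    sub_tokens, start = [], 0
    while start < len(chars):
        end, cur = len(chars), None
        while start < end:
            # Recompose to NFC so candidates match the precomposed vocab entries.
            substr = unicodedata.normalize('NFC', ''.join(chars[start:end]))
            if substr in vocab:
                cur = substr
                break
            end -= 1
        if cur is None:
            return [unk_token]
        sub_tokens.append(cur)
        start = end
    return sub_tokens

# e.g. wordpiece_greedy('첫주/NNG_', vocab) -> ['첫', '주/NNG_'] with the morp vocab above
```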
[]\n", - "segment_ids = []\n", - "tokens.append(\"[CLS]\")\n", - "segment_ids.append(0)\n", - "for token in tokens_a:\n", - " tokens.append(token)\n", - " segment_ids.append(0)\n", - "tokens.append(\"[SEP]\")\n", - "segment_ids.append(0)" - ] - }, - { - "cell_type": "code", - "execution_count": 204, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n" - ] - } - ], - "source": [ - "print(segment_ids)" - ] - }, - { - "cell_type": "code", - "execution_count": 206, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['[CLS]', 'X', 'X', 'X/SL_', '고객/NNG_', '님/XSN_', '항상/MAG_', 'X', 'X', 'X/SL_', '은행/NNG_', '모', '란/NNG_', '역/NNG_', '지점/NNG_', '을/JKO_', '이용/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '는/ETM_', '고객/NNG_', '님/XSN_', '께/JKB_', '감사/NNG_', '의/JKG_', '마음/NNG_', '을/JKO_', '전하/VV_', 'ㅂ니다/EF_', './SF_', '혹시/MAG_', '업무/NNG_', '와/JKB_', '관련/NNG_', '하/XSV_', '어/EC_', '궁금하/VA_', 'ㄴ/ETM_', '점/NNG_', '이/JKS_', '있/VA_', '으시/EP_', '면/EC_', '이/MM_', '번호/NNG_', '로/JKB_', '연락/NNG_', '주/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', '성', '심', '껏/MAG_', '돕/VV_', '아/EC_', '드리/VX_', '겠/EP_', '습니다/EF_', './SF_', '또/MAG_', '혹시/MAG_', '고객/NNG_', '만족/NNG_', '도/NNG_', '조사/NNG_', '전화/NNG_', '를/JKO_', '받/VV_', '으시/EP_', '면/EC_', '매우/MAG_', '동의/NNG_', '하/XSV_', 'ㄴ다/EF_', '로/JKB_', '칭찬/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '어요/EF_', '조금/NNG_', '은/JX_', '쌀', '쌀', '하/VA_', 'ㄴ/ETM_', '10/SN_', '월/NNB_', '의/JKG_', '첫', '주/NNG_', '이/VCP_', 'ㅂ니다/EF_', './SF_', '환', '절/NNG_', '기/XSN_', '감기/NNG_', '조심/NNG_', '하/XSV_', '시/EP_', '고/EC_', '따', '듯하/VA_', 'ㄴ/ETM_', '차/NNG_', '와/JC_', '함께/MAG_', '건강/NNG_', '하/XSA_', 'ㄴ/ETM_', '한/MM_', '주/NNB_', '보내/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', 'X', 'X', '[SEP]']\n" - ] - } - ], - "source": [ - "print(tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 207, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2, 3047, 3047, 1496, 1291, 1123, 2547, 3047, 3047, 1496, 994, 315, 1692, 375, 3277, 11, 456, 9, 20, 129, 388, 22, 1291, 1123, 3353, 1308, 13, 588, 11, 276, 158, 7, 5865, 1579, 101, 266, 9, 20, 4511, 10, 187, 16, 38, 4506, 71, 80, 1883, 31, 2597, 359, 388, 49, 2019, 158, 7, 270, 855, 5181, 2544, 62, 4971, 124, 116, 7, 179, 5865, 1291, 2379, 356, 268, 823, 19, 78, 4506, 71, 1210, 2680, 9, 41, 31, 3998, 9, 20, 129, 388, 526, 4380, 21, 9212, 9212, 248, 10, 113, 60, 13, 4508, 211, 15, 158, 7, 1807, 1509, 284, 9869, 4315, 9, 388, 23, 2577, 9115, 10, 208, 56, 162, 1394, 42, 10, 92, 2227, 561, 388, 49, 2019, 158, 7, 3047, 3047, 3]\n" - ] - } - ], - "source": [ - "# convert_tokens_to_ids\n", - "def convert_by_vocab(vocab, items):\n", - " output = []\n", - " for item in items:\n", - " output.append(vocab[item])\n", - " return output\n", - "\n", - "input_ids = convert_by_vocab(vocab, tokens)\n", - "print(input_ids)" - ] - }, - { - "cell_type": "code", - "execution_count": 208, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n" - ] - } - ], - "source": [ - "# The mask has 1 for real tokens and 0 for padding tokens. Only real\n", - "# tokens are attended to.\n", - "input_mask = [1] * len(input_ids)\n", - "print(input_mask)" - ] - }, - { - "cell_type": "code", - "execution_count": 209, - "metadata": {}, - "outputs": [], - "source": [ - "# Zero-pad up to the sequence length.\n", - "while len(input_ids) < max_seq_length:\n", - " input_ids.append(0)\n", - " input_mask.append(0)\n", - " segment_ids.append(0)" - ] - }, - { - "cell_type": "code", - "execution_count": 210, - "metadata": {}, - "outputs": [], - "source": [ - "assert len(input_ids) == max_seq_length\n", - "assert len(input_mask) == max_seq_length\n", - "assert len(segment_ids) == max_seq_length" - ] - }, - { - "cell_type": "code", - "execution_count": 213, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 213, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "label_id = label_map[example.label]\n", - "label_id" - ] - }, - { - "cell_type": "code", - "execution_count": 216, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:tensorflow:*** Example ***\n", - "INFO:tensorflow:guid: train-6\n", - "INFO:tensorflow:tokens\" [CLS] X X X/SL_ 고객/NNG_ 님/XSN_ 항상/MAG_ X X X/SL_ 은행/NNG_ 모 란/NNG_ 역/NNG_ 지점/NNG_ 을/JKO_ 이용/NNG_ 하/XSV_ 어/EC_ 주/VX_ 시/EP_ 는/ETM_ 고객/NNG_ 님/XSN_ 께/JKB_ 감사/NNG_ 의/JKG_ 마음/NNG_ 을/JKO_ 전하/VV_ ㅂ니다/EF_ ./SF_ 혹시/MAG_ 업무/NNG_ 와/JKB_ 관련/NNG_ 하/XSV_ 어/EC_ 궁금하/VA_ ㄴ/ETM_ 점/NNG_ 이/JKS_ 있/VA_ 으시/EP_ 면/EC_ 이/MM_ 번호/NNG_ 로/JKB_ 연락/NNG_ 주/VV_ 시/EP_ 기/ETN_ 바라/VV_ ㅂ니다/EF_ ./SF_ 성 심 껏/MAG_ 돕/VV_ 아/EC_ 드리/VX_ 겠/EP_ 습니다/EF_ ./SF_ 또/MAG_ 혹시/MAG_ 고객/NNG_ 만족/NNG_ 도/NNG_ 조사/NNG_ 전화/NNG_ 를/JKO_ 받/VV_ 으시/EP_ 면/EC_ 매우/MAG_ 동의/NNG_ 하/XSV_ ㄴ다/EF_ 로/JKB_ 칭찬/NNG_ 하/XSV_ 어/EC_ 주/VX_ 시/EP_ 어요/EF_ 조금/NNG_ 은/JX_ 쌀 쌀 하/VA_ ㄴ/ETM_ 10/SN_ 월/NNB_ 의/JKG_ 첫 주/NNG_ 이/VCP_ ㅂ니다/EF_ ./SF_ 환 절/NNG_ 기/XSN_ 감기/NNG_ 조심/NNG_ 하/XSV_ 시/EP_ 고/EC_ 따 듯하/VA_ ㄴ/ETM_ 차/NNG_ 와/JC_ 함께/MAG_ 건강/NNG_ 하/XSA_ ㄴ/ETM_ 한/MM_ 주/NNB_ 보내/VV_ 시/EP_ 기/ETN_ 바라/VV_ ㅂ니다/EF_ ./SF_ X X [SEP]\n", - "INFO:tensorflow:input_ids: 2 3047 3047 1496 1291 1123 2547 3047 3047 1496 994 315 1692 375 3277 11 456 9 20 129 388 22 1291 1123 3353 1308 13 588 11 276 158 7 5865 1579 101 266 9 20 4511 10 187 16 38 4506 71 80 1883 31 2597 359 388 49 2019 158 7 270 855 5181 2544 62 4971 124 116 7 179 5865 1291 2379 356 268 823 19 78 4506 71 1210 2680 9 41 31 3998 9 20 129 388 526 4380 21 9212 9212 248 10 113 60 13 4508 211 15 158 7 1807 1509 284 9869 4315 9 388 23 2577 9115 10 208 56 162 1394 42 10 92 2227 561 388 49 2019 158 7 3047 3047 3\n", - "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", - "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0\n", - "INFO:tensorflow:label: 0 (id = 0)\n" - ] - } - ], - "source": [ - "tf.logging.info('*** Example ***')\n", - "tf.logging.info('guid: %s' % (example.guid))\n", - "tf.logging.info('tokens\" %s' % \" \".join(\n", - " [printable_text(x) for x in tokens]))\n", - "tf.logging.info('input_ids: %s' % \" \".join([str(x) for x in input_ids]))\n", - "tf.logging.info('input_mask: %s' % \" \".join([str(x) for x in input_mask]))\n", - "tf.logging.info('segment_ids: %s' % \" \".join([str(x) for x in segment_ids]))\n", - "tf.logging.info('label: %s (id = %d)' % (example.label, label_id))" - ] - }, - { - "cell_type": "code", - "execution_count": 217, - "metadata": {}, - "outputs": [], - "source": [ - "feature = InputFeatures(\n", - " input_ids=input_ids,\n", - " input_mask=input_mask,\n", - " segment_ids=segment_ids,\n", - " label_id=label_id,\n", - " is_real_example=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 223, - "metadata": {}, - "outputs": [], - "source": [ - "def create_int_feature(values):\n", - " f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))\n", - " return f" - ] - }, - { - "cell_type": "code", - "execution_count": 227, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "OrderedDict([('input_ids', int64_list {\n", - " value: 2\n", - " value: 3047\n", - " value: 3047\n", - " value: 1496\n", - " value: 1291\n", - " value: 1123\n", - " value: 2547\n", - " value: 3047\n", - " value: 3047\n", - " value: 1496\n", - " value: 994\n", - " value: 315\n", - " value: 1692\n", - " value: 375\n", - " value: 3277\n", - " value: 11\n", - " value: 456\n", - " value: 9\n", - " value: 20\n", - " value: 129\n", - " value: 388\n", - " value: 22\n", - " value: 1291\n", - " value: 1123\n", - " value: 3353\n", - " value: 1308\n", - " value: 13\n", - " value: 588\n", - " value: 11\n", - " value: 276\n", - " value: 158\n", - " value: 7\n", - " value: 5865\n", - " value: 1579\n", - " value: 101\n", - " value: 266\n", - " value: 9\n", - " value: 20\n", - " value: 4511\n", - " value: 10\n", - " value: 187\n", - " value: 16\n", - " value: 38\n", - " value: 4506\n", - " value: 71\n", - " value: 80\n", - " value: 1883\n", - " value: 31\n", - " value: 2597\n", - " value: 359\n", - " value: 388\n", - " value: 49\n", - " value: 2019\n", - " value: 158\n", - " value: 7\n", - " value: 270\n", - " value: 855\n", - " value: 5181\n", - " value: 2544\n", - " value: 62\n", - " value: 4971\n", - " value: 124\n", - " value: 116\n", - " value: 7\n", - " value: 179\n", - " value: 5865\n", - " value: 1291\n", - " value: 2379\n", - " value: 356\n", - " value: 268\n", - " value: 823\n", - " value: 19\n", - " value: 78\n", - " value: 4506\n", - " value: 71\n", - " value: 1210\n", - " value: 2680\n", - " value: 9\n", - " value: 41\n", - " value: 31\n", - " value: 3998\n", - " value: 9\n", - " value: 20\n", - " value: 129\n", - " value: 388\n", - " value: 526\n", - " value: 4380\n", - " value: 21\n", - " value: 9212\n", - " value: 9212\n", - " value: 248\n", - " value: 10\n", - " value: 113\n", - " value: 60\n", - " value: 13\n", - " value: 4508\n", - " value: 211\n", - " value: 15\n", - " value: 158\n", - " value: 7\n", - " value: 1807\n", - " value: 1509\n", - " value: 284\n", - " value: 9869\n", - " value: 4315\n", - " value: 9\n", - " value: 388\n", - " value: 23\n", - " value: 2577\n", - " value: 9115\n", - " value: 10\n", - " value: 208\n", - " value: 56\n", - " value: 162\n", - " value: 1394\n", - " 
value: 42\n", - " value: 10\n", - " value: 92\n", - " value: 2227\n", - " value: 561\n", - " value: 388\n", - " value: 49\n", - " value: 2019\n", - " value: 158\n", - " value: 7\n", - " value: 3047\n", - " value: 3047\n", - " value: 3\n", - "}\n", - "), ('input_mask', int64_list {\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - "}\n", - "), ('segment_ids', int64_list {\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " 
value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - "}\n", - "), ('label_ids', int64_list {\n", - " value: 0\n", - "}\n", - "), ('is_real_example', int64_list {\n", - " value: 1\n", - "}\n", - ")])\n" - ] - } - ], - "source": [ - "features = collections.OrderedDict()\n", - "\n", - "features['input_ids'] = create_int_feature(feature.input_ids)\n", - "features['input_mask'] = create_int_feature(feature.input_mask)\n", - "features['input_ids'] = create_int_feature(feature.input_ids)\n", - "features['segment_ids'] = create_int_feature(feature.segment_ids)\n", - "features['label_ids'] = create_int_feature([feature.label_id])\n", - "features['is_real_example'] = create_int_feature([int(feature.is_real_example)])\n", - "\n", - "print(features)" - ] - }, - { - "cell_type": "code", - "execution_count": 228, - "metadata": {}, - "outputs": [], - "source": [ - "tf_example = tf.train.Example(features=tf.train.Features(feature=features))" - ] - }, - { - "cell_type": "code", - "execution_count": 229, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "features {\n", - " feature {\n", - " key: \"input_ids\"\n", - " value {\n", - " int64_list {\n", - " value: 2\n", - " value: 3047\n", - " value: 3047\n", - " value: 1496\n", - " value: 1291\n", - " value: 1123\n", - " value: 2547\n", - " value: 3047\n", - " value: 3047\n", - " value: 1496\n", - " value: 994\n", - " value: 315\n", - " value: 1692\n", - " value: 375\n", - " value: 3277\n", - " value: 11\n", - " value: 456\n", - " value: 9\n", - " value: 20\n", - " value: 129\n", - " value: 388\n", - " value: 22\n", - " value: 1291\n", - " value: 1123\n", - " value: 3353\n", - " value: 1308\n", - " value: 13\n", - " value: 588\n", - " value: 11\n", - " value: 276\n", - " value: 158\n", - " value: 7\n", - " value: 5865\n", - " value: 1579\n", - " value: 101\n", - " value: 266\n", - " value: 9\n", - " value: 20\n", - " value: 4511\n", - " value: 10\n", - " value: 187\n", - " value: 16\n", - " value: 38\n", - " value: 4506\n", - " value: 71\n", - " value: 80\n", - " value: 1883\n", - " value: 31\n", - " value: 2597\n", - " value: 359\n", - " value: 388\n", - " value: 49\n", - " value: 2019\n", - " value: 158\n", - " value: 7\n", - " value: 270\n", - " value: 855\n", - " value: 5181\n", - " value: 2544\n", - " value: 62\n", - " value: 4971\n", - " value: 124\n", - " value: 116\n", - " value: 7\n", - " value: 179\n", - " value: 5865\n", - " value: 1291\n", - " 
value: 2379\n", - " value: 356\n", - " value: 268\n", - " value: 823\n", - " value: 19\n", - " value: 78\n", - " value: 4506\n", - " value: 71\n", - " value: 1210\n", - " value: 2680\n", - " value: 9\n", - " value: 41\n", - " value: 31\n", - " value: 3998\n", - " value: 9\n", - " value: 20\n", - " value: 129\n", - " value: 388\n", - " value: 526\n", - " value: 4380\n", - " value: 21\n", - " value: 9212\n", - " value: 9212\n", - " value: 248\n", - " value: 10\n", - " value: 113\n", - " value: 60\n", - " value: 13\n", - " value: 4508\n", - " value: 211\n", - " value: 15\n", - " value: 158\n", - " value: 7\n", - " value: 1807\n", - " value: 1509\n", - " value: 284\n", - " value: 9869\n", - " value: 4315\n", - " value: 9\n", - " value: 388\n", - " value: 23\n", - " value: 2577\n", - " value: 9115\n", - " value: 10\n", - " value: 208\n", - " value: 56\n", - " value: 162\n", - " value: 1394\n", - " value: 42\n", - " value: 10\n", - " value: 92\n", - " value: 2227\n", - " value: 561\n", - " value: 388\n", - " value: 49\n", - " value: 2019\n", - " value: 158\n", - " value: 7\n", - " value: 3047\n", - " value: 3047\n", - " value: 3\n", - " }\n", - " }\n", - " }\n", - " feature {\n", - " key: \"input_mask\"\n", - " value {\n", - " int64_list {\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " }\n", - " }\n", - " }\n", - " feature {\n", - " key: \"is_real_example\"\n", - " value {\n", - " int64_list {\n", - 
" value: 1\n", - " }\n", - " }\n", - " }\n", - " feature {\n", - " key: \"label_ids\"\n", - " value {\n", - " int64_list {\n", - " value: 0\n", - " }\n", - " }\n", - " }\n", - " feature {\n", - " key: \"segment_ids\"\n", - " value {\n", - " int64_list {\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " }\n", - " }\n", - " }\n", - "}" - ] - }, - "execution_count": 229, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tf_example" - ] - }, - { - "cell_type": "code", - "execution_count": 230, - "metadata": {}, - "outputs": [], - "source": [ - "writer.write(tf_example.SerializeToString())" - ] - }, - { - "cell_type": "code", - "execution_count": 231, - "metadata": {}, - "outputs": [], - "source": [ - "writer.close()" - ] - }, - { - "cell_type": "code", - "execution_count": 232, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'./output_dir/smishing/train.tf_record'" - ] - }, - "execution_count": 232, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_file" - ] - }, - { - "cell_type": "code", - "execution_count": 233, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:tensorflow:***** Running training *****\n", - "INFO:tensorflow: Num examples = 100\n", - "INFO:tensorflow: Batch size = 32\n", - "INFO:tensorflow: Num steps = 9\n" - ] - } - ], - "source": [ - 
"tf.logging.info(\"***** Running training *****\")\n", - "tf.logging.info(\" Num examples = %d\", len(train_examples))\n", - "tf.logging.info(\" Batch size = %d\", FLAGS.train_batch_size)\n", - "tf.logging.info(\" Num steps = %d\", num_train_steps)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Spacing안한 전체 데이터로 돌려보기!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. 데이터 준비" - ] - }, - { - "cell_type": "code", - "execution_count": 236, - "metadata": {}, - "outputs": [], - "source": [ - "processor = SmishingProcessor()\n", - "label_list = processor.get_labels()\n", - "\n", - "# get train samples\n", - "train_examples = processor.get_train_examples(dacon_path, 'train.tsv')\n", - "num_train_steps = int(\n", - " len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)\n", - "num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)" - ] - }, - { - "cell_type": "code", - "execution_count": 237, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:Estimator's model_fn (.model_fn at 0x00000219A36F36A8>) includes params argument, but params are not passed to Estimator.\n", - "WARNING:tensorflow:Using temporary folder as model directory: C:\\Users\\jinma\\AppData\\Local\\Temp\\tmpafvhq326\n", - "INFO:tensorflow:Using config: {'_model_dir': 'C:\\\\Users\\\\jinma\\\\AppData\\\\Local\\\\Temp\\\\tmpafvhq326', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true\n", - "graph_options {\n", - " rewrite_options {\n", - " meta_optimizer_iterations: ONE\n", - " }\n", - "}\n", - ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': , '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None, eval_training_input_configuration=2, experimental_host_call_every_n_steps=1), '_cluster': None}\n", - "INFO:tensorflow:_TPUContext: eval_on_tpu True\n", - "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n" - ] - } - ], - "source": [ - "# record ETRI model weights\n", - "FLAGS.init_checkpoint = path + 'model.ckpt'\n", - "\n", - "model_fn = model_fn_builder(\n", - " bert_config=bert_config,\n", - " num_labels=len(label_list), # 2\n", - " init_checkpoint=FLAGS.init_checkpoint, # None\n", - " learning_rate=FLAGS.learning_rate, # 5e-05\n", - " num_train_steps=num_train_steps, # 22195\n", - " num_warmup_steps=num_warmup_steps, # 2219\n", - " use_tpu=FLAGS.use_tpu, # False\n", - " use_one_hot_embeddings=FLAGS.use_tpu) # False\n", - "\n", - "# If TPU is not available, this will fall back to normal Estimator on CPU\n", - "# or GPU\n", - "estimator = tf.contrib.tpu.TPUEstimator(\n", - " use_tpu=FLAGS.use_tpu, # False\n", - " model_fn=model_fn,\n", - " config=run_config,\n", - " train_batch_size=FLAGS.train_batch_size, # 32\n", - " 
eval_batch_size=FLAGS.eval_batch_size, # 8\n", - " predict_batch_size=FLAGS.predict_batch_size # 8\n", - ")\n", - "\n", - "FLAGS.output_dir = './output_dir/smishing/'\n", - "\n", - "tf.gfile.MakeDirs(FLAGS.output_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": 240, - "metadata": {}, - "outputs": [], - "source": [ - "def file_based_convert_examples_to_features(\n", - " examples, label_lsit, max_seq_length, tokenizer, output_file):\n", - " \n", - " writer = tf.python_io.TFRecordWriter(output_file)\n", - "\n", - " for (ex_index, example) in enumerate(examples):\n", - " if ex_index % 10000 == 0:\n", - " tf.logging.info(\"Writing example %d of %d\" % (ex_index, len(examples)))\n", - "\n", - " feature = convert_single_example(ex_index, example, label_list,\n", - " max_seq_length, tokenizer)\n", - "\n", - " def create_int_feature(values):\n", - " f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))\n", - " return f\n", - "\n", - " features = collections.OrderedDict()\n", - " features[\"input_ids\"] = create_int_feature(feature.input_ids)\n", - " features[\"input_mask\"] = create_int_feature(feature.input_mask)\n", - " features[\"segment_ids\"] = create_int_feature(feature.segment_ids)\n", - " features[\"label_ids\"] = create_int_feature([feature.label_id])\n", - " features[\"is_real_example\"] = create_int_feature(\n", - " [int(feature.is_real_example)])\n", - "\n", - " tf_example = tf.train.Example(features=tf.train.Features(feature=features))\n", - " writer.write(tf_example.SerializeToString())\n", - " writer.close()" - ] - }, - { - "cell_type": "code", - "execution_count": 244, - "metadata": {}, - "outputs": [], - "source": [ - "train_file = os.path.join(FLAGS.output_dir, 'train_non_spacing.tf_record')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "file_based_convert_examples_to_features" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "basic", - "language": "python", - "name": "basic" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/src/advanced_transformers/__init__.py b/src/advanced_transformers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/activations.py b/src/advanced_transformers/activations.py new file mode 100644 index 0000000..d9ba0c6 --- /dev/null +++ b/src/advanced_transformers/activations.py @@ -0,0 +1,416 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
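+# The module below collects the activation functions used by the models in this repository:
+# several GELU variants copied from `transformers`, the integer-only IntGELU / QuantAct pair
+# adapted from I-BERT, and the string-keyed ACT2FN registry defined at the end of the file.
+# A minimal usage sketch (illustrative only; it assumes `src/` is on the import path):
+#
+#     import torch
+#     from advanced_transformers.activations import ACT2FN
+#
+#     act = ACT2FN["gelu_new"]         # registry values are module instances, not classes
+#     hidden = act(torch.randn(2, 8))  # apply the activation to a batch of hidden states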
+ + +import math + +import torch +from packaging import version +from torch import Tensor, nn + +from .quant_modules import ( + symmetric_linear_quantization_params, + SymmetricQuantFunction, + floor_ste, + FixedPointMul, +) + + +# Copied from transformers.activations.NewGELUActivation +class NewGELUActivation(nn.Module): + """ + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) + + +# Copied from transformers.activations.GELUActivation +class GELUActivation(nn.Module): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, use_gelu_python: bool = False): + super().__init__() + if version.parse(torch.__version__) < version.parse("1.4") or use_gelu_python: + self.act = self._gelu_python + else: + self.act = nn.functional.gelu + + def _gelu_python(self, input: Tensor) -> Tensor: + return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0))) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +# Copied from transformers.activations.FastGELUActivation +class FastGELUActivation(nn.Module): + """ + Applies GELU approximation that is slower than QuickGELU but more accurate. + See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + torch.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) + + +# Copied from transformers.activations.QuickGELUActivation +class QuickGELUActivation(nn.Module): + """ + Applies GELU approximation that is fast but somewhat inaccurate. + See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return input * torch.sigmoid(1.702 * input) + + +# Copied from transformers.activations.ClippedGELUActivation +class ClippedGELUActivation(nn.Module): + """ + Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purposes, as + it allows mapping negative values in the GeLU spectrum. For more information on this trick, please refer to + https://arxiv.org/abs/2004.09602. + Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when + initially created. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, min: float, max: float): + if min > max: + raise ValueError(f"min should be < max (got min: {min}, max: {max})") + + super().__init__() + self.min = min + self.max = max + + def forward(self, x: Tensor) -> Tensor: + # no module-level `gelu` helper is defined in this file, so call nn.functional.gelu directly + return torch.clip(nn.functional.gelu(x), self.min, self.max) + + +# Copied from transformers.models.bloom.modeling_bloom.bloom_gelu_forward +def bloom_gelu_forward(x): + """ + Custom bias GELU function. Adapted from Megatron-DeepSpeed code.
Here we use a simple implementation (inference) to + make the model jitable. + Args: + x (`torch.tensor`, *required*): + input hidden states + """ + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + + +# Copied from transformers.models.bloom.modeling_bloom.bloom_gelu_back +def bloom_gelu_back(g, x): + """ + gradient of tanh approximation of gelu gradient of actual gelu is: 0.5 * (1. + torch.erf(x * 0.70710678)) + + 0.3989423 * x * torch.exp(-0.5 * x * x) + Args: + g (`torch.tensor`, *required*): + gradient output tensor + x (`torch.tensor`, *required*): + input tensor + """ + x = x[0] # x is a tuple of 1 element, needs to unpack it first + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) + return ff * g + + +# Copied from transformers.models.bloom.modeling_bloom.GeLUFunction +class GeLUFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, input): + ctx.save_for_backward(input) + return bloom_gelu_forward(input) + + @staticmethod + def backward(ctx, grad_output): + input = ctx.saved_tensors + tmp = bloom_gelu_back(grad_output, input) + return tmp + + +# Copied from transformers.models.bloom.modeling_bloom.BloomGelu +class BloomGELUActivation(nn.Module): + """ + BloomBiasGelu wrapper function that makes use of the simple function on inference mode to make the model + torchscriptable and use the autograd function in training mode to get the accurate results of the gradients Partly + copied from Megatron-DeepSpeed code and adapted for our needs + See here why autograd functions are not torchscriptable: https://github.com/pytorch/pytorch/issues/22329 + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + if self.training: + return GeLUFunction.apply(x) + else: + return bloom_gelu_forward(x) + + +# Inspired by transformers.models.ibert.quant_modules.IntGELU +class IntGELU(nn.Module): + """ + Quantized version of `torch.nn.GELU`. Adds quantization-specific arguments on top of `torch.nn.GELU`. + Args: + quant_mode (`bool`, *optional*, defaults to `True`): + Whether or not the layer is quantized.
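+        Following the I-BERT recipe, `erf` is approximated by the second-order polynomial
+        sign(x) * (a * (min(|x|, -b) + b)**2 + c) with a = -0.2888, b = -1.769 and c = 1
+        (stored as c / a in `self.coeff` so it can be added before the final rescale by a),
+        and GELU(x) = x * 0.5 * (1 + erf(x / sqrt(2))) is then evaluated on the integer
+        representation together with a floating-point scaling factor (`self.k` ~ sqrt(2)).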
+ """ + + def __init__(self, quant_mode=True): + super().__init__() + self.quant_mode = quant_mode + + if not self.quant_mode: + self.activation_fn = nn.GELU() + + self.k = 1.4142 + self.const = 14 # dummy integer constant + self.coeff = [-0.2888, -1.769, 1] # a(x+b)**2 + c + self.coeff[2] /= self.coeff[0] + + def int_erf(self, x_int, scaling_factor): + b_int = torch.floor(self.coeff[1] / scaling_factor) + c_int = torch.floor(self.coeff[2] / scaling_factor**2) + sign = torch.sign(x_int) + + abs_int = torch.min(torch.abs(x_int), -b_int) + y_int = sign * ((abs_int + b_int) ** 2 + c_int) + scaling_factor = scaling_factor**2 * self.coeff[0] + + # avoid overflow + y_int = floor_ste.apply(y_int / 2**self.const) + scaling_factor = scaling_factor * 2**self.const + + return y_int, scaling_factor + + def forward(self, x, scaling_factor=None): + if not self.quant_mode: + return self.activation_fn(x), None + + x_int = x / scaling_factor + sigmoid_int, sigmoid_scaling_factor = self.int_erf(x_int, scaling_factor / self.k) + + shift_int = 1.0 // sigmoid_scaling_factor + + x_int = x_int * (sigmoid_int + shift_int) + scaling_factor = scaling_factor * sigmoid_scaling_factor / 2 + + return x_int * scaling_factor, scaling_factor + + +# Copied from transformers.models.ibert.quant_modules.QuantAct +class QuantAct(nn.Module): + """ + Quantizes the given activation. + Args: + activation_bit (`int`): + Bitwidth for the quantized activation. + act_range_momentum (`float`, *optional*, defaults to `0.95`): + Momentum for updating the activation quantization range. + per_channel (`bool`, *optional*, defaults to `False`): + Whether to or not use channel-wise quantization. + channel_len (`int`, *optional*): + Specify the channel length when set the *per_channel* True. + quant_mode (`bool`, *optional*, defaults to `False`): + Whether or not the layer is quantized. + """ + + def __init__( + self, + activation_bit, + act_range_momentum=0.95, + per_channel=False, + channel_len=None, + quant_mode=False + ): + super().__init__() + + self.activation_bit = activation_bit + self.act_range_momentum = act_range_momentum + self.quant_mode = quant_mode + self.per_channel = per_channel + self.percentile = False + self.act_function = SymmetricQuantFunction.apply + + if not self.per_channel: + self.register_buffer("x_min", torch.zeros(1)) + self.register_buffer("x_max", torch.zeros(1)) + self.register_buffer("act_scaling_factor", torch.zeros(1)) + self.x_min -= 1e-5 + self.x_max += 1e-5 + else: + raise NotImplementedError("per-channel mode is not currently supported for activation.") + + def __repr__(self): + return ( + f"{self.__class__.__name__}(activation_bit={self.activation_bit}, " + f"quant_mode: {self.quant_mode}, Act_min: {self.x_min.item():.2f}, " + f"Act_max: {self.x_max.item():.2f})" + ) + + def forward( + self, + x, + pre_act_scaling_factor=None, + identity=None, + identity_scaling_factor=None, + specified_min=None, + specified_max=None, + ): + + x_act = x if identity is None else identity + x + # collect running stats if training + if self.training: + assert not self.percentile, "percentile mode is not currently supported for activation." + assert not self.per_channel, "per-channel mode is not currently supported for activation." 
+ x_min = x_act.data.min() + x_max = x_act.data.max() + + assert ( + x_max.isnan().sum() == 0 and x_min.isnan().sum() == 0 + ), "NaN detected when computing min/max of the activation" + + # Initialization + if self.x_min.min() > -1.1e-5 and self.x_max.max() < 1.1e-5: + self.x_min = self.x_min + x_min + self.x_max = self.x_max + x_max + + # exponential moving average (EMA) + # use momentum to prevent the quantized values change greatly every iteration + elif self.act_range_momentum == -1: + self.x_min = torch.min(self.x_min, x_min) + self.x_max = torch.max(self.x_max, x_max) + else: + self.x_min = self.x_min * self.act_range_momentum + x_min * (1 - self.act_range_momentum) + self.x_max = self.x_max * self.act_range_momentum + x_max * (1 - self.act_range_momentum) + + if not self.quant_mode: + return x_act, None + + x_min = self.x_min if specified_min is None else specified_min + x_max = self.x_max if specified_max is None else specified_max + + self.act_scaling_factor = symmetric_linear_quantization_params( + self.activation_bit, x_min, x_max, per_channel=self.per_channel + ) + + if pre_act_scaling_factor is None: + # this is for the input quantization + quant_act_int = self.act_function(x, self.activation_bit, self.percentile, self.act_scaling_factor) + else: + quant_act_int = FixedPointMul.apply( + x, + pre_act_scaling_factor, + self.activation_bit, + self.act_scaling_factor, + identity, + identity_scaling_factor, + ) + + correct_output_scale = self.act_scaling_factor.view(-1) + + return quant_act_int * correct_output_scale, self.act_scaling_factor + + +# Copied from transformers.activations.SiLUActivation +class SiLUActivation(nn.Module): + """ + See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear + Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function + Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated + Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with + later. + """ + + def __init__(self): + super().__init__() + if version.parse(torch.__version__) < version.parse("1.7"): + self.act = self._silu_python + else: + self.act = nn.functional.silu + + def _silu_python(self, input: Tensor) -> Tensor: + return input * torch.sigmoid(input) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +# Copied from transformers.activations.MishActivation +class MishActivation(nn.Module): + """ + See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also + visit the official repository for the paper: https://github.com/digantamisra98/Mish + """ + + def __init__(self): + super().__init__() + if version.parse(torch.__version__) < version.parse("1.9"): + self.act = self._mish_python + else: + self.act = nn.functional.mish + + def _mish_python(self, input: Tensor) -> Tensor: + return input * torch.tanh(nn.functional.softplus(input)) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +# Copied from transformers.activations.LinearActivation +class LinearActivation(nn.Module): + """ + Applies the linear activation function, i.e. forwarding input directly to output. 
+ """ + + def forward(self, input: Tensor) -> Tensor: + return input + + +ACT2FN = { + "gelu": GELUActivation(), + "gelu_10": ClippedGELUActivation(-10, 10), + "gelu_fast": FastGELUActivation(), + "gelu_new": NewGELUActivation(), + "gelu_python": GELUActivation(use_gelu_python=True), + "linear": LinearActivation(), + "mish": MishActivation(), + "quick_gelu": QuickGELUActivation(), + "gelu_bloom": BloomGELUActivation(), + "relu": nn.ReLU(), + "sigmoid": nn.Sigmoid(), + "silu": SiLUActivation(), + "swish": SiLUActivation(), + "tanh": nn.Tanh(), + "gelu_int": IntGELU(quant_mode=False), + "gelu_int_quant": IntGELU(), + "act_8": QuantAct(activation_bit=8), + "act_8_quant": QuantAct(activation_bit=8, quant_mode=True), + "act_16": QuantAct(activation_bit=16), + "act_16_quant": QuantAct(activation_bit=16, quant_mode=True), + "act_22": QuantAct(activation_bit=22), + "act_22_quant": QuantAct(activation_bit=22, quant_mode=True), +} \ No newline at end of file diff --git a/src/advanced_transformers/processors/__init__.py b/src/advanced_transformers/processors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/quant_modules.py b/src/advanced_transformers/quant_modules.py new file mode 100644 index 0000000..041ff2e --- /dev/null +++ b/src/advanced_transformers/quant_modules.py @@ -0,0 +1,281 @@ +# coding=utf-8 +# Copyright 2021 The I-BERT Authors (Sehoon Kim, Amir Gholami, Zhewei Yao, +# Michael Mahoney, Kurt Keutzer - UC Berkeley) and The HuggingFace Inc. team. +# Copyright (c) 20121, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import decimal + +import numpy as np +import torch +from torch import nn +from torch.autograd import Function + + +# Copied from transformers.models.ibert.quant_modules.linear_quantize +def linear_quantize(input, scale, zero_point, inplace=False): + """ + Quantize single-precision input tensor to integers with the given scaling factor and zeropoint. + Args: + input (`torch.Tensor`): + Single-precision input tensor to be quantized. + scale (`torch.Tensor`): + Scaling factor for quantization. + zero_pint (`torch.Tensor`): + Shift for quantization. + inplace (`bool`, *optional*, defaults to `False`): + Whether to compute inplace or not. + Returns: + `torch.Tensor`: Linearly quantized value of *input* according to *scale* and *zero_point*. 
+ """ + # reshape scale and zeropoint for convolutional weights and activation + if len(input.shape) == 4: + scale = scale.view(-1, 1, 1, 1) + zero_point = zero_point.view(-1, 1, 1, 1) + # reshape scale and zeropoint for linear weights + elif len(input.shape) == 2: + scale = scale.view(-1, 1) + zero_point = zero_point.view(-1, 1) + else: + scale = scale.view(-1) + zero_point = zero_point.view(-1) + # quantized = float / scale + zero_point + if inplace: + input.mul_(1.0 / scale).add_(zero_point).round_() + return input + return torch.round(1.0 / scale * input + zero_point) + + +# Copied from transformers.models.ibert.quant_modules.symmetric_linear_quantization_params +def symmetric_linear_quantization_params(num_bits, saturation_min, saturation_max, per_channel=False): + """ + Compute the scaling factor with the given quantization range for symmetric quantization. + Args: + saturation_min (`torch.Tensor`): + Lower bound for quantization range. + saturation_max (`torch.Tensor`): + Upper bound for quantization range. + per_channel (`bool`, *optional*, defaults to `False`): + Whether to or not use channel-wise quantization. + Returns: + `torch.Tensor`: Scaling factor that linearly quantizes the given range between *saturation_min* and + *saturation_max*. + """ + # in this part, we do not need any gradient computation, + # in order to enforce this, we put torch.no_grad() + with torch.no_grad(): + n = 2 ** (num_bits - 1) - 1 + + if per_channel: + scale, _ = torch.max(torch.stack([saturation_min.abs(), saturation_max.abs()], dim=1), dim=1) + scale = torch.clamp(scale, min=1e-8) / n + + else: + scale = max(saturation_min.abs(), saturation_max.abs()) + scale = torch.clamp(scale, min=1e-8) / n + + return scale + + +# Copied from transformers.models.ibert.quant_modules.SymmetricQuantFunction +class SymmetricQuantFunction(Function): + """ + Class to quantize the given floating-point values using symmetric quantization with given range and bitwidth. + """ + + @staticmethod + def forward(ctx, x, k, percentile_mode, scale): + """ + Args: + x (`torch.Tensor`): + Floating point tensor to be quantized. + k (`int`): + Quantization bitwidth. + percentile_mode (`bool`): + Whether or not to use percentile calibration. + scale (`torch.Tensor`): + Pre-calculated scaling factor for *x*. Note that the current implementation of SymmetricQuantFunction + requires pre-calculated scaling factor. + Returns: + `torch.Tensor`: Symmetric-quantized value of *input*. 
+ """ + zero_point = torch.tensor(0.0).to(scale.device) + + n = 2 ** (k - 1) - 1 + new_quant_x = linear_quantize(x, scale, zero_point, inplace=False) + new_quant_x = torch.clamp(new_quant_x, -n, n - 1) + + ctx.scale = scale + return new_quant_x + + @staticmethod + def backward(ctx, grad_output): + + scale = ctx.scale + if len(grad_output.shape) == 4: + scale = scale.view(-1, 1, 1, 1) + # reshape scale and zeropoint for linear weights + elif len(grad_output.shape) == 2: + scale = scale.view(-1, 1) + else: + scale = scale.view(-1) + + return grad_output.clone() / scale, None, None, None, None + + +# Copied from transformers.models.ibert.quant_modules.floor_ste +class floor_ste(Function): + """ + Straight-through Estimator(STE) for torch.floor() + """ + + @staticmethod + def forward(ctx, x): + return torch.floor(x) + + @staticmethod + def backward(ctx, grad_output): + return grad_output.clone() + + +# Copied from transformers.models.ibert.quant_modules.round_ste +class round_ste(Function): + """ + Straight-through Estimator(STE) for torch.round() + """ + + @staticmethod + def forward(ctx, x): + return torch.round(x) + + @staticmethod + def backward(ctx, grad_output): + return grad_output.clone() + + +# Copied from transformers.models.ibert.quant_modules.batch_frexp +def batch_frexp(inputs, max_bit=31): + """ + Decompose the scaling factor into mantissa and twos exponent. + Args: + scaling_factor (`torch.Tensor`): + Target scaling factor to decompose. + Returns: + ``Tuple(torch.Tensor, torch.Tensor)`: mantisa and exponent + """ + + shape_of_input = inputs.size() + + # trans the input to be a 1-d tensor + inputs = inputs.view(-1) + + output_m, output_e = np.frexp(inputs.cpu().numpy()) + tmp_m = [] + for m in output_m: + int_m_shifted = int( + decimal.Decimal(m * (2**max_bit)).quantize(decimal.Decimal("1"), rounding=decimal.ROUND_HALF_UP) + ) + tmp_m.append(int_m_shifted) + output_m = np.array(tmp_m) + + output_e = float(max_bit) - output_e + + return ( + torch.from_numpy(output_m).to(inputs.device).view(shape_of_input), + torch.from_numpy(output_e).to(inputs.device).view(shape_of_input), + ) + + +# Copied from transformers.models.ibert.quant_modules.FixedPointMul +class FixedPointMul(Function): + """ + Function to perform fixed-point arithmetic that can match integer arithmetic on hardware. + Args: + pre_act (`torch.Tensor`): + Input tensor. + pre_act_scaling_factor (`torch.Tensor`): + Scaling factor of the input tensor *pre_act*. + bit_num (`int`): + Quantization bitwidth. + z_scaling_factor (`torch.Tensor`): + Scaling factor of the output tensor. + identity (`torch.Tensor`, *optional*): + Identity tensor, if exists. + identity_scaling_factor (`torch.Tensor`, *optional*): + Scaling factor of the identity tensor *identity*, if exists. + Returns: + `torch.Tensor`: Output tensor(*pre_act* if *identity* is not given, otherwise the addition of *pre_act* and + *identity*), whose scale is rescaled to *z_scaling_factor*. 
+ """ + + @staticmethod + def forward( + ctx, + pre_act, + pre_act_scaling_factor, + bit_num, + z_scaling_factor, + identity=None, + identity_scaling_factor=None, + ): + + if len(pre_act_scaling_factor.shape) == 3: + reshape = lambda x: x # noqa: E731 + else: + reshape = lambda x: x.view(1, 1, -1) # noqa: E731 + ctx.identity = identity + + n = 2 ** (bit_num - 1) - 1 + + with torch.no_grad(): + pre_act_scaling_factor = reshape(pre_act_scaling_factor) + if identity is not None: + identity_scaling_factor = reshape(identity_scaling_factor) + + ctx.z_scaling_factor = z_scaling_factor + + z_int = torch.round(pre_act / pre_act_scaling_factor) + _A = pre_act_scaling_factor.type(torch.double) + _B = (z_scaling_factor.type(torch.float)).type(torch.double) + new_scale = _A / _B + new_scale = reshape(new_scale) + + m, e = batch_frexp(new_scale) + + output = z_int.type(torch.double) * m.type(torch.double) + output = torch.round(output / (2.0**e)) + + if identity is not None: + # needs addition of identity activation + wx_int = torch.round(identity / identity_scaling_factor) + + _A = identity_scaling_factor.type(torch.double) + _B = (z_scaling_factor.type(torch.float)).type(torch.double) + new_scale = _A / _B + new_scale = reshape(new_scale) + + m1, e1 = batch_frexp(new_scale) + output1 = wx_int.type(torch.double) * m1.type(torch.double) + output1 = torch.round(output1 / (2.0**e1)) + + output = output1 + output + + return torch.clamp(output.type(torch.float), -n - 1, n) + + @staticmethod + def backward(ctx, grad_output): + identity_grad = None + if ctx.identity is not None: + identity_grad = grad_output.clone() / ctx.z_scaling_factor + return grad_output.clone() / ctx.z_scaling_factor, None, None, None, None, identity_grad, None \ No newline at end of file diff --git a/src/advanced_transformers/retrievers/__init__.py b/src/advanced_transformers/retrievers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/__init__.py b/src/advanced_transformers/transformers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/__init__.py b/src/advanced_transformers/transformers/components/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/attentions/__init__.py b/src/advanced_transformers/transformers/components/attentions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/feed_forward_networks/__init__.py b/src/advanced_transformers/transformers/components/feed_forward_networks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/heads/__init__.py b/src/advanced_transformers/transformers/components/heads/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/layer_norms/__init__.py b/src/advanced_transformers/transformers/components/layer_norms/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/positional_embeddings/__init__.py b/src/advanced_transformers/transformers/components/positional_embeddings/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/positional_embeddings/attention_with_linear_biases.py 
b/src/advanced_transformers/transformers/components/positional_embeddings/attention_with_linear_biases.py new file mode 100644 index 0000000..5da5abe --- /dev/null +++ b/src/advanced_transformers/transformers/components/positional_embeddings/attention_with_linear_biases.py @@ -0,0 +1,60 @@ +# coding=utf-8 +# Copyright (c) Facebook, Inc. and its affiliates. + +import math +from typing import List + +import torch + + +def get_slopes(n: int) -> List[int]: + def get_slopes_power_of_2(n: int): + start = 2 ** (-(2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio ** i for i in range(n)] + + # In the paper, we only train models that have 2^a heads for some a. + # This function has some good properties that only occur when the input is a power of 2. + # To maintain that even when the number of heads is not a power of 2, we use this workaround. + if math.log2(n).is_integer(): + return get_slopes_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return ( + get_slopes_power_of_2(closest_power_of_2) + + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2] + ) + + +if __name__ == "__main__": + # __init__ + bsz = 32 + seq_len = 17 + max_tokens = 512 + maxpos = 512 # tokens_per_sample + attn_heads = 16 # decoder_attention_heads + slopes = torch.Tensor(get_slopes(attn_heads)) + # In the next line, the part after the * is what constructs the diagonal matrix + # (right matrix in Figure 3 in the paper). + # If you run it you'll see that it doesn't exactly print out the same matrix as we have in Figure 3, + # but one where all rows are identical. + # This works because the softmax operation is invariant to translation, + # and our bias functions are always linear. + m = slopes.unsqueeze(1).unsqueeze(1) # head-specific slope fixed + positions = ( + torch.arange(maxpos).unsqueeze(0).unsqueeze(0).expand(attn_heads, -1, -1) + ) + alibi = m * positions # non-learned bias + alibi = alibi.view(attn_heads, 1, maxpos) + alibi = alibi.repeat(max_tokens // maxpos, 1, 1) # batch_size, 1, 1 + # extract_features_scriptable + # we move the mask construction `before layer operation` because its slightly more efficient + # self_attn_mask = self.buffered_future_mask(x) + + def fill_with_neg_inf(t): + """FP16-compatible function that fills a tensor with -inf.""" + return t.float().fill_(float("-inf")).type_as(t) + + _future_mask = torch.triu(fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1) + _future_mask = _future_mask + alibi + _future_mask = _future_mask[: bsz * attn_heads, :seq_len, :seq_len] diff --git a/src/advanced_transformers/transformers/components/positional_embeddings/axial_positional_embedding.py b/src/advanced_transformers/transformers/components/positional_embeddings/axial_positional_embedding.py new file mode 100644 index 0000000..946d385 --- /dev/null +++ b/src/advanced_transformers/transformers/components/positional_embeddings/axial_positional_embedding.py @@ -0,0 +1,79 @@ +# coding=utf-8 +# Copyright @lucidrains +# ref. 
https://github.com/lucidrains/axial-positional-embedding + +import torch +from torch import nn +from operator import mul +from functools import reduce + + +class AxialPositionalEmbedding(nn.Module): + def __init__(self, dim, axial_shape, axial_dims = None): + super().__init__() + + self.dim = dim + self.shape = axial_shape + self.max_seq_len = reduce(mul, axial_shape, 1) + + self.summed = axial_dims is None + axial_dims = ((dim,) * len(axial_shape)) if self.summed else axial_dims + + assert len(self.shape) == len(axial_dims), 'number of axial dimensions must equal the number of dimensions in the shape' + assert self.summed or not self.summed and sum(axial_dims) == dim, f'axial dimensions must sum up to the target dimension {dim}' + + self.weights = ParameterList(self, 'weights', len(axial_shape)) + + for ind, (shape, axial_dim) in enumerate(zip(self.shape, axial_dims)): + ax_shape = [1] * len(self.shape) + ax_shape[ind] = shape + ax_shape = (1, *ax_shape, axial_dim) + ax_emb = nn.Parameter(torch.zeros(ax_shape).normal_(0, 1)) + self.weights.append(ax_emb) + + def forward(self, x): + b, t, e = x.shape + assert (t <= self.max_seq_len), f'Sequence length ({t}) must be less than the maximum sequence length allowed ({self.max_seq_len})' + embs = [] + + for ax_emb in self.weights.to_list(): + axial_dim = ax_emb.shape[-1] + expand_shape = (b, *self.shape, axial_dim) + emb = ax_emb.expand(expand_shape).reshape(b, self.max_seq_len, axial_dim) + embs.append(emb) + + pos_emb = sum(embs) if self.summed else torch.cat(embs, dim=-1) + return pos_emb[:, :t].to(x) + +# a mock parameter list object until below issue is resolved +# https://github.com/pytorch/pytorch/issues/36035 +class ParameterList(object): + def __init__(self, kls, prefix, length): + self.ind = 0 + self.kls = kls + self.prefix = prefix + self.length = length + + def _keyname(self, prefix, ind): + return f'{prefix}_{ind}' + + def append(self, x): + setattr(self.kls, self._keyname(self.prefix, self.ind), x) + self.ind += 1 + + def to_list(self): + return [getattr(self.kls, self._keyname(self.prefix, i)) for i in range(self.length)] + +# Axial Positional Embedding for Images + +class AxialPositionalEmbeddingImage(nn.Module): + def __init__(self, dim, axial_shape, axial_dims = None): + super().__init__() + assert len(axial_shape) == 2, 'Axial shape must have 2 dimensions for images' + self.pos_emb = AxialPositionalEmbedding(dim, axial_shape, axial_dims) + + def forward(self, img): + b, c, h, w = img.shape + img = img.permute(0, 2, 3, 1).reshape(b, h * w, c) + pos_emb = self.pos_emb(img) + return pos_emb.reshape(b, h, w, c).permute(0, 3, 1, 2) diff --git a/src/advanced_transformers/transformers/components/positional_embeddings/relative_position_embedding.py b/src/advanced_transformers/transformers/components/positional_embeddings/relative_position_embedding.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/positional_embeddings/rotary_embedding.py b/src/advanced_transformers/transformers/components/positional_embeddings/rotary_embedding.py new file mode 100644 index 0000000..537fe54 --- /dev/null +++ b/src/advanced_transformers/transformers/components/positional_embeddings/rotary_embedding.py @@ -0,0 +1,62 @@ +# coding=utf-8 +# Copyright @lucidrains +# ref. 
https://github.com/lucidrains/rotary-embedding-torch + +from inspect import isfunction + +import torch +from torch import nn, einsum +from einops import rearrange, repeat + + +class RotaryEmbedding(nn.Module): + def __init__(self, theta: int, dim: int, learned_freq: bool): + super().__init__() + self.theta = theta + self.dim = dim + self.learned_freq = learned_freq + # inverse frequencies: 1 / theta ** (2i / dim) + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) + self.cache = dict() + + if learned_freq: + self.freqs = nn.Parameter(freqs) + else: + self.register_buffer("freqs", freqs) + + def forward(self, t, cache_key=None): + if cache_key is not None and cache_key in self.cache: + return self.cache[cache_key] + + if isfunction(t): + t = t() + + freqs = self.freqs + + freqs = einsum("..., f -> ... f", t.type(freqs.dtype), freqs) + freqs = repeat(freqs, "... n -> ... (n r)", r=2) + + if cache_key is not None: + self.cache[cache_key] = freqs + + return freqs + + @staticmethod + def apply_rotary_emb(freqs, t, start_index=0): + rot_dim = freqs.shape[-1] + end_index = start_index + rot_dim + assert rot_dim <= t.shape[-1], ( + f"feature dimension {t.shape[-1]} is not of sufficient " + f"size to rotate in all the positions {rot_dim}" + ) + t_left = t[..., :start_index] + t = t[..., start_index:end_index] + t_right = t[..., end_index:] + + def rotary_half(x): + x = rearrange(x, "... (d r) -> ... d r", r=2) + x1, x2 = x.unbind(dim=-1) + x = torch.stack((-x2, x1), dim=-1) + return rearrange(x, "... d r -> ... (d r)") + + t = (t * freqs.cos()) + (rotary_half(t) * freqs.sin()) + return torch.cat((t_left, t, t_right), dim=-1) diff --git a/src/advanced_transformers/transformers/components/positional_embeddings/sinusoidal_positional_embedding.py b/src/advanced_transformers/transformers/components/positional_embeddings/sinusoidal_positional_embedding.py new file mode 100644 index 0000000..a33ea2e --- /dev/null +++ b/src/advanced_transformers/transformers/components/positional_embeddings/sinusoidal_positional_embedding.py @@ -0,0 +1,94 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. + +import math +from typing import Any, Optional + +import torch +from torch import Tensor, nn + + +class SinusoidalPositionalEmbedding(nn.Embedding): + """ + This module produces sinusoidal positional embeddings of any length. + We don't want to save the weight of this embedding since it's not trained (deterministic) + and it can be huge. Padding symbols are ignored. + These embeddings get automatically extended in forward if more positions are needed. + """ + + def __init__(self, num_positions, embedding_dim, padding_idx): + self.make_weight(num_positions, embedding_dim, padding_idx) + + def make_weight(self, num_positions, embedding_dim, padding_idx): + weight = self.get_embedding(num_positions, embedding_dim, padding_idx) + if not hasattr(self, "weight"): + # in __init__ + super().__init__(num_positions, embedding_dim, padding_idx, _weight=weight) + else: + # in forward put the weights on the correct dtype and device of the param + weight = weight.to(dtype=self.weight.dtype, device=self.weight.device) + self.weight = nn.Parameter(weight) + self.weight.detach_() + self.weight.requires_grad = False + + @staticmethod + def get_embedding(num_embeddings, embedding_dim, padding_idx): + """ + Build sinusoidal embeddings. + This matches the implementation in tensor2tensor, + but differs slightly from the description in Section 3.5 of + "Attention Is All You Need". 
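+ Concretely, as implemented below: with half_dim = embedding_dim // 2 and inv_freq_i = 10000 ** (-i / (half_dim - 1)), + position pos is encoded as sin(pos * inv_freq_i) in the first half_dim channels and cos(pos * inv_freq_i) + in the last half_dim channels (the sin and cos blocks are concatenated rather than interleaved, a zero column + is appended when embedding_dim is odd, and the row at padding_idx is zeroed out).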
+ """ + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) + emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze( + 1 + ) * emb.unsqueeze(0) + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view( + num_embeddings, -1 + ) + if embedding_dim % 2 == 1: + # zero pad + emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) + if padding_idx is not None: + emb[padding_idx, :] = 0 + return emb + + @staticmethod + def make_positions(tensor, padding_idx: int): + """ + Replace non-padding symbols with their position numbers. + Position numbers begin at padding_idx+1. Padding symbols are ignored. + """ + # The series of casts and type-conversions here are carefully + # balanced to both work with ONNX export and XLA. In particular XLA + # prefers ints, cumsum defaults to output longs, and ONNX doesn't know + # how to handle the dtype kwarg in cumsum. + mask = tensor.ne(padding_idx).int() + return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx + + def forward( + self, + input, + incremental_state: Optional[Any] = None, + timestep: Optional[Tensor] = None, + ): + """Input is expected to be of size [bsz x seqlen].""" + bsz, seq_len = input.shape[:2] + max_pos = self.padding_idx + 1 + seq_len + + if max_pos > self.weight.size(0): + # expand embeddings if needed + self.make_weight(max_pos, self.embedding_dim, self.padding_idx) + + if incremental_state is not None: + # positions is the same for every token when decoding a single step + pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len + return self.weight[self.padding_idx + pos, :].expand(bsz, 1, -1) + + positions = self.make_positions(input, self.padding_idx) + # `super().forward` is + # (self.weight.index_select(0, positions.view(-1)) + # .view(bsz, seq_len, -1).detach()) + return super().forward(positions) diff --git a/src/advanced_transformers/transformers/components/residual_connections/__init__.py b/src/advanced_transformers/transformers/components/residual_connections/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tokenization.py b/tokenization.py deleted file mode 100644 index 2e75e3a..0000000 --- a/tokenization.py +++ /dev/null @@ -1,312 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# -# 형태소분석 기반 BERT를 위한 Tokenization Class -# 수정: joonho.lim -# 일자: 2019-05-23 -# -# 주석 및 새롭게 코드 수정 -# 작성자: MyungHoon Jin - -import collections -import re -import unicodedata -import six -import tensorflow as tf - -def convert_to_unicode(text): - # Python version이 3.x일 때, - # type(text)이 `bytes`일 경우, utf-8로 변환 - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - # Python version이 2.x일 때, - # type(text)이 `str`일 경우, utf-8로 변환 - elif six.PY2: - if isinstance(text, str): - return text.decode("utf-8", "ignore") - elif isinstance(text, unicode): - return text - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - # Python 3.x, 2.x만 허용! - else: - raise ValueError("Not running on Python2 or Python 3?") - -def printable_text(text): - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text - elif isinstance(text, unicode): - return text.encode("utf-8") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - else: - raise ValueError("Not running on Python2 or Python 3?") - -class BERTTokenizer: - """End 2 End Tokenizing NLU Embedding!""" - # from_pretrained method는 향후 추가! - def __init__(self, vocab_file, do_lower_case=False, max_len=None): - # ETRI에서 제공한 vocab file을 읽어오고 - # 역 방향의 사전을 정의한다. - self.vocab = self._load_vocab(vocab_file) - self.inv_vocab = {v: k for k, v in self.vocab.items()} - # End to End Tokenizer를 구축하기 위해 아래 두 Tokenizer를 할당한다. - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - self.max_len = max_len if max_len is not None else int(1e12) - - def tokenize(self, text): - split_tokens = [] - # End to End Tokenizing. - for token in self.basic_tokenizer.tokenize(text): - # ETRI Vocab 양식에 맞게 token 끝에 '_'를 붙여준다. - token += '_' - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - return split_tokens - - def convert_tokens_to_ids(self, tokens): - ids = _convert_by_vocab(self.vocab, tokens) - if len(ids) > self.max_len: - raise ValueError( - "Token indices sequence length is longer than the specified maximum " - " sequence length for this BERT model ({} > {}). Running this" - " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)) - return ids - - def convert_ids_to_tokens(self, ids): - return _convert_by_vocab(self.inv_vocab, ids) - - @staticmethod - def _load_vocab(vocab_file): - # 단어 사전을 저장할 OrderedDict 객체 생성 - vocab = collections.OrderedDict() - index = 0 - with tf.io.gfile.GFile(vocab_file, 'r') as reader: - while True: - # Binary Text를 unicode(utf-8)로 decode. 
- token = convert_to_unicode(reader.readline()) - if not token: break - if ((token.find('n_iters=') == 0) or - (token.find('max_length=') == 0)): - continue - token = token.split('\t')[0] - token = token.strip() - # 토큰과 해당 index를 기록 - vocab[token] = index - index += 1 - return vocab - - @staticmethod - def _convert_by_vocab(vocab, items): - """Converts a sequence of [tokens|ids] using the vocab.""" - output = [] - for item in items: - output.append(vocab[item]) - return output - -class BasicTokenizer: - - def __init__(self, do_lower_case=True): - self.do_lower_case = do_lower_case - - def tokenize(self, text): - text = convert_to_unicode(text) - text = self._clean_text(text) - - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case: - # 현재 input으로 '고객/NNG'와 같이 Part-of-speech가 이미 - # tagging되어있고 vocab은 '고객/NNG_'로 단어를 기록하고 있음. - # 여기서 `lower` 메서드를 사용하면 뒤의 tagging이 소문자로 - # 변환되어 값의 비교를 못하게 되므로 이를 주석처리. - - # token.lower() - - # 모든 음절을 정준 분해시키는 함수 - token = self._run_strip_accents(token) - # whitespacing이랑 다를게 무엇인지? - split_tokens.extend(self._run_split_on_punc(token)) - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, token): - """Strips accents from a piece of text.""" - token = unicodedata.normalize("NFD", token) - # https://gist.github.com/Pusnow/aa865fa21f9557fa58d691a8b79f8a6d - # 모든 음절을 정준 분해(Canonical Decomposition)시킴 - # '각'을 'ㄱ+ㅏ+ㄱ'으로 저장(출력되는 값은 동일) - output = [] - for char in token: - cat = unicodedata.category(char) - if cat == "Mn": - # unicode category가 "Mark, Nonspacing"일 경우 pass - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, token): - """Splits punctuation on a piece of text.""" - chars = list(token) - i, start_new_word = 0, True - output = [] - while i < len(chars): - char = chars[i] - if self._is_punctuation(char): - # 공백이면 [" "]을 추가하고 새로운 단어로 시작 - output.append([char]) - start_new_word = True - else: - # 공백이 아닐 경우, - if start_new_word: - # 새로운 단어로 시작할 경우에 빈 리스트 추가 - output.append([]) - # 해당 문자부터 시작하도록 start_new_word는 False로 setting. - start_new_word = False - # 위에 추가한 빈 리스트에 각각 character를 채워넣음 - output[-1].append(char) - i += 1 - return ["".join(x) for x in output] - - - def _clean_text(self, text): - output = [] # char을 저장할 list 생성 - for char in text: - # 텍스트에서 Char 단위로 출력 - cp = ord(char) - if cp == 0 or cp == 0xfffd or self._is_control(char): - # \x00이거나 �이거나 unicode cat.이 C로 시작할 경우 - # (개행문자 제외) output에 추가하지 않는다. 
- continue - if self._is_whitespace(char): - # 공백일 경우 " "으로 output에 추가 - output.append(" ") - else: - # 이 외의 경우 전부 output에 추가 - output.append(char) - # cleaning 작업을 거친 Text를 후처리하여 반환 - return "".join(output) - - # char 단위 함수들 - @staticmethod - def _is_whitespace(char): - if char == " " or char == '\t' or char == '\n' or char == '\r': - # 개행문자이거나 띄어쓰기면 True 반환 - return True - cat = unicodedata.category(char) - if cat == 'Zs': - # unicode category가 Space Seperator면 True 반환 - # https://www.compart.com/en/unicode/category/Zs - return True - # 이 외의 경우 전부 False 반환 - return False - - @staticmethod - def _is_control(char): - if char == "\t" or char == "\n" or char == "\r": - # 개행문자이면 False 반환 - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - # unicode category가 - # Cc(Control) - # Cf(format) - # Co(Private Use, is 0) - # Cs(Surrrogate, is 0)일 경우, True 반환 - # https://en.wikipedia.org/wiki/Control_character - return True - # 이 외의 경우 전부 False 반환 - return False - - @staticmethod - def _is_punctuation(char): - # 한국어 형태소 분석기이기 때문에 공백과 같은지 여부만 반환 - return char == ' ' - -class WordpieceTokenizer: - - def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """ - This uses a greedy longest-match-first algorithm to perform - tokenization using the given vocabulary. - """ - text = convert_to_unicode(text) - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - # max word로 설정한 글자 수를 넘길 경우, UNK 처리 - output_tokens.append(self.unk_token) - continue - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - # 첫번째 글자부터 천천히 vocab에 있는 단어인지 체크 - while start < end: - substr = "".join(chars[start:end]) - # do_lower_case == True일 경우에 - # 위에서 Canonical Decomposition 과정을 거쳤기 때문에 - # 이를 다시 Composition해줘야 vocab의 단어와 비교 가능하다. - substr = unicodedata.normalize("NFC", substr) - if substr in self.vocab: - # 만일 해당 단어가 vocab에 있다면 해당 단어로 break - cur_substr = substr - break - end -= 1 - # 만일 어떠한 단어랑도 매칭되지 않았다면, (1)로 가서 [UNK] 처리 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - # 어미, 혹은 다른 사전에 있는 단어를 찾기위해 start에 end값을 할당 - start = end - if is_bad: # --- (1) - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - -# text 단위 공백 처리 -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() # 양 사이드의 공백을 제거 - if not text: # 어떠한 값도 없을 시, 빈 list를 반환 - return [] - tokens = text.split() # 공백 단위로 쪼갠 list를 반환 - return tokens diff --git a/torch_bert/README.md b/torch_bert/README.md deleted file mode 100644 index 663bfe5..0000000 --- a/torch_bert/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# ETRI Pytorch version BERT code - -#### 20.04.20 (월) -- `huggingface.tokenizers`는 `Rust`로 작성 -- 때문에 Etri에서 제공한 Wordpiece Tokenizer는 직접 구현한 것으로 추정됨 -- 아니면 rust code를 python으로 포팅했거나 -- 혹은 tensorflow에서 사용한 version의 코드이거나 -- 아니네 이미 있네! fast version이냐 python이냐 차인가? 살펴보자 -- 추가적인 코드 작성할 필요 있음!! 없는 token! 
-- GeLU 구현 중 ERF 함수 - $\mathrm{erf}(x) = \frac{2}{\sqrt{\pi}} \int_{0}^{x} e^{-t^2} dt$ diff --git a/torch_bert/__init__.py b/torch_bert/__init__.py deleted file mode 100644 index 57825ab..0000000 --- a/torch_bert/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__all__ = ['tokenization_bert', 'configuration_bert', 'tokenization_utils'] diff --git a/torch_bert/__pycache__/file_utils.cpython-36.pyc b/torch_bert/__pycache__/file_utils.cpython-36.pyc deleted file mode 100644 index 4c30c04..0000000 Binary files a/torch_bert/__pycache__/file_utils.cpython-36.pyc and /dev/null differ diff --git a/torch_bert/__pycache__/tokenization_bert.cpython-36.pyc b/torch_bert/__pycache__/tokenization_bert.cpython-36.pyc deleted file mode 100644 index 7ea8396..0000000 Binary files a/torch_bert/__pycache__/tokenization_bert.cpython-36.pyc and /dev/null differ diff --git a/torch_bert/__pycache__/tokenization_utils.cpython-36.pyc b/torch_bert/__pycache__/tokenization_utils.cpython-36.pyc deleted file mode 100644 index 7fc7993..0000000 Binary files a/torch_bert/__pycache__/tokenization_utils.cpython-36.pyc and /dev/null differ diff --git a/torch_bert/activations.py b/torch_bert/activations.py deleted file mode 100644 index b95fc05..0000000 --- a/torch_bert/activations.py +++ /dev/null @@ -1,64 +0,0 @@ -# https://subinium.github.io/introduction-to-activation/ - -import logging -import math - -import torch -import torch.nn.functional as F - -logger = logging.getLogger(__name__) - - -def swish(x): - """https://arxiv.org/pdf/1710.05941v1.pdf""" - return x * torch.sigmoid(x) - - -def _gelu_python(x): - """ Original Implementation of the gelu activation function in Google Bert repo when initially created. - For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): - 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - This is now written in C in torch.nn.functional - Also see https://arxiv.org/abs/1606.08415 - """ - # torch.erf(input, out=None) -> Tensor - # Computes the error function of each element. - return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) - - -def gelu_new(x): - """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). - Also see https://arxiv.org/abs/1606.08415 - """ - return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - - -if torch.__version__ < "1.4.0": - gelu = _gelu_python -else: - gelu = F.gelu # 얘가 제일 빠름! - try: - import torch_xla - - logger.warning( - "The torch_xla package was detected in the python environment. PyTorch/XLA and JIT is untested," - " no activation function will be traced with JIT." - ) - except ImportError: - gelu_new = torch.jit.script(gelu_new) - -ACT2FN = { - 'relu': F.relu, - 'swish': swish, - 'gelu': gelu, - 'tanh': torch.tanh, - 'gelu_new': gelu_new -} - - -def get_activation(activation_string): - activation = ACT2FN.get(activation_string, None) - if activation is None: - raise KeyError(f"function {activation_string} not found " - "in ACT2FN mapping {list(ACT2FN.keys())}") - return activation diff --git a/torch_bert/configuration_bert.py b/torch_bert/configuration_bert.py deleted file mode 100644 index 51e89a8..0000000 --- a/torch_bert/configuration_bert.py +++ /dev/null @@ -1,518 +0,0 @@ - -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" BERT model configuration """ - -import copy -import json -import logging -import os -from typing import Dict, Optional, Tuple - -from file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_remote_url - -logger = logging.getLogger(__name__) - -BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", - "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", - "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", - "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", - "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", - "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", - "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", - "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", - "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", - "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", - "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", - "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", - "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", - "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", - "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", - "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json", - "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json", - "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json", - "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json", - "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json", - 
"bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json", - "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json", -} - -class PretrainedConfig(object): - r""" Base class for all configuration classes. - Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. - Note: - A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights. - It only affects the model's configuration. - Class attributes (overridden by derived classes): - - ``pretrained_config_archive_map``: a python ``dict`` with `shortcut names` (string) as keys and `url` (string) of associated pretrained model configurations as values. - - ``model_type``: a string that identifies the model type, that we serialize into the JSON file, and that we use to recreate the correct object in :class:`~transformers.AutoConfig`. - Args: - finetuning_task (:obj:`string` or :obj:`None`, `optional`, defaults to :obj:`None`): - Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint. - num_labels (:obj:`int`, `optional`, defaults to `2`): - Number of classes to use when the model is a classification model (sequences/tokens) - output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`): - Should the model returns attentions weights. - output_hidden_states (:obj:`string`, `optional`, defaults to :obj:`False`): - Should the model returns all hidden-states. - torchscript (:obj:`bool`, `optional`, defaults to :obj:`False`): - Is the model used with Torchscript (for PyTorch models). 
- """ - pretrained_config_archive_map = {} # type: Dict[str, str] - model_type = "" # type: str - - def __init__(self, **kwargs): - # Attributes with defaults - self.output_attentions = kwargs.pop("output_attentions", False) - self.output_hidden_states = kwargs.pop("output_hidden_states", False) - self.use_cache = kwargs.pop("use_cache", True) # Not used by all models - self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models - self.use_bfloat16 = kwargs.pop("use_bfloat16", False) - self.pruned_heads = kwargs.pop("pruned_heads", {}) - - # Is decoder is used in encoder-decoder models to differentiate encoder from decoder - self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False) - self.is_decoder = kwargs.pop("is_decoder", False) - - # Parameters for sequence generation - self.max_length = kwargs.pop("max_length", 20) - self.min_length = kwargs.pop("min_length", 0) - self.do_sample = kwargs.pop("do_sample", False) - self.early_stopping = kwargs.pop("early_stopping", False) - self.num_beams = kwargs.pop("num_beams", 1) - self.temperature = kwargs.pop("temperature", 1.0) - self.top_k = kwargs.pop("top_k", 50) - self.top_p = kwargs.pop("top_p", 1.0) - self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) - self.length_penalty = kwargs.pop("length_penalty", 1.0) - self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0) - self.bad_words_ids = kwargs.pop("bad_words_ids", None) - self.num_return_sequences = kwargs.pop("num_return_sequences", 1) - - # Fine-tuning task arguments - self.architectures = kwargs.pop("architectures", None) - self.finetuning_task = kwargs.pop("finetuning_task", None) - self.num_labels = kwargs.pop("num_labels", 2) - self.id2label = kwargs.pop("id2label", {i: "LABEL_{}".format(i) for i in range(self.num_labels)}) - self.id2label = dict((int(key), value) for key, value in self.id2label.items()) - self.label2id = kwargs.pop("label2id", dict(zip(self.id2label.values(), self.id2label.keys()))) - self.label2id = dict((key, int(value)) for key, value in self.label2id.items()) - - # Tokenizer arguments TODO: eventually tokenizer and models should share the same config - self.prefix = kwargs.pop("prefix", None) - self.bos_token_id = kwargs.pop("bos_token_id", None) - self.pad_token_id = kwargs.pop("pad_token_id", None) - self.eos_token_id = kwargs.pop("eos_token_id", None) - self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None) - - # task specific arguments - self.task_specific_params = kwargs.pop("task_specific_params", None) - - # TPU arguments - self.xla_device = kwargs.pop("xla_device", None) - - # Additional attributes without default values - for key, value in kwargs.items(): - try: - setattr(self, key, value) - except AttributeError as err: - logger.error("Can't set {} with value {} for {}".format(key, value, self)) - raise err - - @property - def num_labels(self): - return self._num_labels - - @num_labels.setter - def num_labels(self, num_labels): - self._num_labels = num_labels - self.id2label = {i: "LABEL_{}".format(i) for i in range(self.num_labels)} - self.id2label = dict((int(key), value) for key, value in self.id2label.items()) - self.label2id = dict(zip(self.id2label.values(), self.id2label.keys())) - self.label2id = dict((key, int(value)) for key, value in self.label2id.items()) - - def save_pretrained(self, save_directory): - """ - Save a configuration object to the directory `save_directory`, so that it - can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class 
method. - Args: - save_directory (:obj:`string`): - Directory where the configuration JSON file will be saved. - """ - assert os.path.isdir( - save_directory - ), "Saving path should be a directory where the model and configuration can be saved" - - # If we save using the predefined names, we can load using `from_pretrained` - output_config_file = os.path.join(save_directory, CONFIG_NAME) - - self.to_json_file(output_config_file, use_diff=True) - logger.info("Configuration saved in {}".format(output_config_file)) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig": - r""" - Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration. - Args: - pretrained_model_name_or_path (:obj:`string`): - either: - - a string with the `shortcut name` of a pre-trained model configuration to load from cache or - download, e.g.: ``bert-base-uncased``. - - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to - our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing a configuration file saved using the - :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - - a path or url to a saved configuration JSON `file`, e.g.: - ``./my_model_directory/configuration.json``. - cache_dir (:obj:`string`, `optional`): - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - kwargs (:obj:`Dict[str, any]`, `optional`): - The values in kwargs of any keys which are configuration attributes will be used to override the loaded - values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is - controlled by the `return_unused_kwargs` keyword parameter. - force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): - Force to (re-)download the model weights and configuration files and override the cached versions if they exist. - resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): - Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. - proxies (:obj:`Dict`, `optional`): - A dictionary of proxy servers to use by protocol or endpoint, e.g.: - :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` - The proxies are used on each request. - return_unused_kwargs: (`optional`) bool: - If False, then this function returns just the final configuration object. - If True, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs` is a - dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part - of kwargs which has not been used to update `config` and is otherwise ignored. - Returns: - :class:`PretrainedConfig`: An instance of a configuration object - Examples:: - # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a - # derived class: BertConfig - config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. - config = BertConfig.from_pretrained('./test/saved_model/') # E.g. 
config (or model) was saved using `save_pretrained('./test/saved_model/')` - config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') - config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) - assert config.output_attention == True - config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, - foo=False, return_unused_kwargs=True) - assert config.output_attention == True - assert unused_kwargs == {'foo': False} - """ - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - return cls.from_dict(config_dict, **kwargs) - - @classmethod - def get_config_dict( - cls, pretrained_model_name_or_path: str, pretrained_config_archive_map: Optional[Dict] = None, **kwargs - ) -> Tuple[Dict, Dict]: - """ - From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used - for instantiating a Config using `from_dict`. - Parameters: - pretrained_model_name_or_path (:obj:`string`): - The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. - pretrained_config_archive_map: (:obj:`Dict[str, str]`, `optional`) Dict: - A map of `shortcut names` to `url`. By default, will use the current class attribute. - Returns: - :obj:`Tuple[Dict, Dict]`: The dictionary that will be used to instantiate the configuration object. - """ - cache_dir = kwargs.pop("cache_dir", None) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", False) - - if pretrained_config_archive_map is None: - pretrained_config_archive_map = cls.pretrained_config_archive_map - - if pretrained_model_name_or_path in pretrained_config_archive_map: - config_file = pretrained_config_archive_map[pretrained_model_name_or_path] - elif os.path.isdir(pretrained_model_name_or_path): - config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) - elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): - config_file = pretrained_model_name_or_path - else: - config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME) - - try: - # Load from URL or cache if already cached - resolved_config_file = cached_path( - config_file, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - ) - # Load config dict - if resolved_config_file is None: - raise EnvironmentError - config_dict = cls._dict_from_json_file(resolved_config_file) - - except EnvironmentError: - if pretrained_model_name_or_path in pretrained_config_archive_map: - msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format( - config_file - ) - else: - msg = ( - "Can't load '{}'. Make sure that:\n\n" - "- '{}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" - "- or '{}' is the correct path to a directory containing a '{}' file\n\n".format( - pretrained_model_name_or_path, - pretrained_model_name_or_path, - pretrained_model_name_or_path, - CONFIG_NAME, - ) - ) - raise EnvironmentError(msg) - - except json.JSONDecodeError: - msg = ( - "Couldn't reach server at '{}' to download configuration file or " - "configuration file is not a valid JSON file. 
" - "Please check network or file content here: {}.".format(config_file, resolved_config_file) - ) - raise EnvironmentError(msg) - - if resolved_config_file == config_file: - logger.info("loading configuration file {}".format(config_file)) - else: - logger.info("loading configuration file {} from cache at {}".format(config_file, resolved_config_file)) - - return config_dict, kwargs - - @classmethod - def from_dict(cls, config_dict: Dict, **kwargs) -> "PretrainedConfig": - """ - Constructs a `Config` from a Python dictionary of parameters. - Args: - config_dict (:obj:`Dict[str, any]`): - Dictionary that will be used to instantiate the configuration object. Such a dictionary can be retrieved - from a pre-trained checkpoint by leveraging the :func:`~transformers.PretrainedConfig.get_config_dict` - method. - kwargs (:obj:`Dict[str, any]`): - Additional parameters from which to initialize the configuration object. - Returns: - :class:`PretrainedConfig`: An instance of a configuration object - """ - return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) - - config = cls(**config_dict) - - if hasattr(config, "pruned_heads"): - config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) - - # Update config with kwargs if needed - to_remove = [] - for key, value in kwargs.items(): - if hasattr(config, key): - setattr(config, key, value) - to_remove.append(key) - for key in to_remove: - kwargs.pop(key, None) - - logger.info("Model config %s", str(config)) - if return_unused_kwargs: - return config, kwargs - else: - return config - - @classmethod - def from_json_file(cls, json_file: str) -> "PretrainedConfig": - """ - Constructs a `Config` from the path to a json file of parameters. - Args: - json_file (:obj:`string`): - Path to the JSON file containing the parameters. - Returns: - :class:`PretrainedConfig`: An instance of a configuration object - """ - config_dict = cls._dict_from_json_file(json_file) - return cls(**config_dict) - - @classmethod - def _dict_from_json_file(cls, json_file: str): - with open(json_file, "r", encoding="utf-8") as reader: - text = reader.read() - return json.loads(text) - - def __eq__(self, other): - return self.__dict__ == other.__dict__ - - def __repr__(self): - return "{} {}".format(self.__class__.__name__, self.to_json_string()) - - def to_diff_dict(self): - """ - Removes all attributes from config which correspond to the default - config attributes for better readability and serializes to a Python - dictionary. - Returns: - :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, - """ - config_dict = self.to_dict() - - # get the default config dict - default_config_dict = PretrainedConfig().to_dict() - - serializable_config_dict = {} - - # only serialize values that differ from the default config - for key, value in config_dict.items(): - if key not in default_config_dict or value != default_config_dict[key]: - serializable_config_dict[key] = value - - return serializable_config_dict - - def to_dict(self): - """ - Serializes this instance to a Python dictionary. - Returns: - :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, - """ - output = copy.deepcopy(self.__dict__) - if hasattr(self.__class__, "model_type"): - output["model_type"] = self.__class__.model_type - return output - - def to_json_string(self, use_diff=True): - """ - Serializes this instance to a JSON string. 
- Args: - use_diff (:obj:`bool`): - If set to True, only the difference between the config instance and the default PretrainedConfig() is serialized to JSON string. - Returns: - :obj:`string`: String containing all the attributes that make up this configuration instance in JSON format. - """ - if use_diff is True: - config_dict = self.to_diff_dict() - else: - config_dict = self.to_dict() - return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" - - def to_json_file(self, json_file_path, use_diff=True): - """ - Save this instance to a json file. - Args: - json_file_path (:obj:`string`): - Path to the JSON file in which this configuration instance's parameters will be saved. - use_diff (:obj:`bool`): - If set to True, only the difference between the config instance and the default PretrainedConfig() is serialized to JSON file. - """ - with open(json_file_path, "w", encoding="utf-8") as writer: - writer.write(self.to_json_string(use_diff=use_diff)) - - def update(self, config_dict: Dict): - """ - Updates attributes of this class - with attributes from `config_dict`. - Args: - :obj:`Dict[str, any]`: Dictionary of attributes that shall be updated for this class. - """ - for key, value in config_dict.items(): - setattr(self, key, value) - - -class BertConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a :class:`~transformers.BertModel`. - It is used to instantiate an BERT model according to the specified arguments, defining the model - architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of - the BERT `bert-base-uncased `__ architecture. - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used - to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` - for more information. - Args: - vocab_size (:obj:`int`, optional, defaults to 30522): - Vocabulary size of the BERT model. Defines the different tokens that - can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`. - hidden_size (:obj:`int`, optional, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (:obj:`int`, optional, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (:obj:`int`, optional, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (:obj:`int`, optional, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): - The non-linear activation function (function or string) in the encoder and pooler. - If string, "gelu", "relu", "swish" and "gelu_new" are supported. - hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1): - The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): - The dropout ratio for the attention probabilities. - max_position_embeddings (:obj:`int`, optional, defaults to 512): - The maximum sequence length that this model might ever be used with. - Typically set this to something large just in case (e.g., 512 or 1024 or 2048). - type_vocab_size (:obj:`int`, optional, defaults to 2): - The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`. 
- initializer_range (:obj:`float`, optional, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): - The epsilon used by the layer normalization layers. - Example:: - from transformers import BertModel, BertConfig - # Initializing a BERT bert-base-uncased style configuration - configuration = BertConfig() - # Initializing a model from the bert-base-uncased style configuration - model = BertModel(configuration) - # Accessing the model configuration - configuration = model.config - Attributes: - pretrained_config_archive_map (Dict[str, str]): - A dictionary containing all the available pre-trained checkpoints. - """ - pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP - model_type = "bert" - - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - **kwargs - ): - super().__init__(pad_token_id=pad_token_id, **kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - - -if __name__ == '__main__': - bergconfig = BertConfig() diff --git a/torch_bert/file_utils.py b/torch_bert/file_utils.py deleted file mode 100644 index 159e81a..0000000 --- a/torch_bert/file_utils.py +++ /dev/null @@ -1,496 +0,0 @@ -""" -Utilities for working with the local dataset cache. -This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp -Copyright by the AllenNLP authors. -""" - -import fnmatch -import json -import logging -import os -import shutil -import sys -import tarfile -import tempfile -from contextlib import contextmanager -from functools import partial, wraps -from hashlib import sha256 -from typing import Optional -from urllib.parse import urlparse -from zipfile import ZipFile, is_zipfile - -import boto3 -import requests -from botocore.config import Config -from botocore.exceptions import ClientError -from filelock import FileLock -from tqdm.auto import tqdm - -# from . 
import __version__ - - -logger = logging.getLogger(__name__) # pylint: disable=invalid-name - -try: - USE_TF = os.environ.get("USE_TF", "AUTO").upper() - USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() - if USE_TORCH in ("1", "ON", "YES", "AUTO") and USE_TF not in ("1", "ON", "YES"): - import torch - - _torch_available = True # pylint: disable=invalid-name - logger.info("PyTorch version {} available.".format(torch.__version__)) - else: - logger.info("Disabling PyTorch because USE_TF is set") - _torch_available = False -except ImportError: - _torch_available = False # pylint: disable=invalid-name - -# TensorFlow 안써용 -# try: -# USE_TF = os.environ.get("USE_TF", "AUTO").upper() -# USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() -# -# if USE_TF in ("1", "ON", "YES", "AUTO") and USE_TORCH not in ("1", "ON", "YES"): -# import tensorflow as tf -# -# assert hasattr(tf, "__version__") and int(tf.__version__[0]) >= 2 -# _tf_available = True # pylint: disable=invalid-name -# logger.info("TensorFlow version {} available.".format(tf.__version__)) -# else: -# logger.info("Disabling Tensorflow because USE_TORCH is set") -# _tf_available = False -# except (ImportError, AssertionError): -# _tf_available = False # pylint: disable=invalid-name - -try: - from torch.hub import _get_torch_home - - torch_cache_home = _get_torch_home() -except ImportError: - torch_cache_home = os.path.expanduser( - os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch")) - ) -default_cache_path = os.path.join(torch_cache_home, "transformers") - -try: - from pathlib import Path - - PYTORCH_PRETRAINED_BERT_CACHE = Path( - os.getenv("PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)) - ) -except (AttributeError, ImportError): - PYTORCH_PRETRAINED_BERT_CACHE = os.getenv( - "PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path) - ) - -PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility -TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility - -WEIGHTS_NAME = "pytorch_model.bin" -TF2_WEIGHTS_NAME = "tf_model.h5" -TF_WEIGHTS_NAME = "model.ckpt" -CONFIG_NAME = "config.json" -MODEL_CARD_NAME = "modelcard.json" - - -MULTIPLE_CHOICE_DUMMY_INPUTS = [[[0], [1]], [[0], [1]]] -DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] -DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] - -S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert" -CLOUDFRONT_DISTRIB_PREFIX = "https://d2ws9o8vfrpkyk.cloudfront.net" - - -def is_torch_available(): - return _torch_available - - -def is_tf_available(): - return _tf_available - - -def add_start_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") - return fn - - return docstring_decorator - - -def add_start_docstrings_to_callable(*docstr): - def docstring_decorator(fn): - class_name = ":class:`~transformers.{}`".format(fn.__qualname__.split(".")[0]) - intro = " The {} forward method, overrides the :func:`__call__` special method.".format(class_name) - note = r""" - .. note:: - Although the recipe for forward pass needs to be defined within - this function, one should call the :class:`Module` instance afterwards - instead of this since the former takes care of running the - pre and post processing steps while the latter silently ignores them. 
- """ - fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") - return fn - - return docstring_decorator - - -def add_end_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + "".join(docstr) - return fn - - return docstring_decorator - - -def is_remote_url(url_or_filename): - parsed = urlparse(url_or_filename) - return parsed.scheme in ("http", "https", "s3") - - -def hf_bucket_url(identifier, postfix=None, cdn=False) -> str: - endpoint = CLOUDFRONT_DISTRIB_PREFIX if cdn else S3_BUCKET_PREFIX - if postfix is None: - return "/".join((endpoint, identifier)) - else: - return "/".join((endpoint, identifier, postfix)) - - -def url_to_filename(url, etag=None): - """ - Convert `url` into a hashed filename in a repeatable way. - If `etag` is specified, append its hash to the url's, delimited - by a period. - If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name - so that TF 2.0 can identify it as a HDF5 file - (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) - """ - url_bytes = url.encode("utf-8") - url_hash = sha256(url_bytes) - filename = url_hash.hexdigest() - - if etag: - etag_bytes = etag.encode("utf-8") - etag_hash = sha256(etag_bytes) - filename += "." + etag_hash.hexdigest() - - if url.endswith(".h5"): - filename += ".h5" - - return filename - - -def filename_to_url(filename, cache_dir=None): - """ - Return the url and etag (which may be ``None``) stored for `filename`. - Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. - """ - if cache_dir is None: - cache_dir = TRANSFORMERS_CACHE - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - cache_path = os.path.join(cache_dir, filename) - if not os.path.exists(cache_path): - raise EnvironmentError("file {} not found".format(cache_path)) - - meta_path = cache_path + ".json" - if not os.path.exists(meta_path): - raise EnvironmentError("file {} not found".format(meta_path)) - - with open(meta_path, encoding="utf-8") as meta_file: - metadata = json.load(meta_file) - url = metadata["url"] - etag = metadata["etag"] - - return url, etag - - -def cached_path( - url_or_filename, - cache_dir=None, - force_download=False, - proxies=None, - resume_download=False, - user_agent=None, - extract_compressed_file=False, - force_extract=False, - local_files_only=False, -) -> Optional[str]: - """ - Given something that might be a URL (or might be a local path), - determine which. If it's a URL, download the file and cache it, and - return the path to the cached file. If it's already a local path, - make sure the file exists and then return the path. - Args: - cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). - force_download: if True, re-dowload the file even if it's already cached in the cache dir. - resume_download: if True, resume the download if incompletly recieved file is found. - user_agent: Optional string or dict that will be appended to the user-agent on remote requests. - extract_compressed_file: if True and the path point to a zip or tar file, extract the compressed - file in a folder along the archive. - force_extract: if True when extract_compressed_file is True and the archive was already extracted, - re-extract the archive and overide the folder where it was extracted. - Return: - None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). 
- Local path (string) otherwise - """ - if cache_dir is None: - cache_dir = TRANSFORMERS_CACHE - if isinstance(url_or_filename, Path): - url_or_filename = str(url_or_filename) - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - if is_remote_url(url_or_filename): - # URL, so get it from the cache (downloading if necessary) - output_path = get_from_cache( - url_or_filename, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - user_agent=user_agent, - local_files_only=local_files_only, - ) - elif os.path.exists(url_or_filename): - # File, and it exists. - output_path = url_or_filename - elif urlparse(url_or_filename).scheme == "": - # File, but it doesn't exist. - raise EnvironmentError("file {} not found".format(url_or_filename)) - else: - # Something unknown - raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) - - if extract_compressed_file: - if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path): - return output_path - - # Path where we extract compressed archives - # We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/" - output_dir, output_file = os.path.split(output_path) - output_extract_dir_name = output_file.replace(".", "-") + "-extracted" - output_path_extracted = os.path.join(output_dir, output_extract_dir_name) - - if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract: - return output_path_extracted - - # Prevent parallel extractions - lock_path = output_path + ".lock" - with FileLock(lock_path): - shutil.rmtree(output_path_extracted, ignore_errors=True) - os.makedirs(output_path_extracted) - if is_zipfile(output_path): - with ZipFile(output_path, "r") as zip_file: - zip_file.extractall(output_path_extracted) - zip_file.close() - elif tarfile.is_tarfile(output_path): - tar_file = tarfile.open(output_path) - tar_file.extractall(output_path_extracted) - tar_file.close() - else: - raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) - - return output_path_extracted - - return output_path - - -def split_s3_path(url): - """Split a full s3 path into the bucket name and path.""" - parsed = urlparse(url) - if not parsed.netloc or not parsed.path: - raise ValueError("bad s3 path {}".format(url)) - bucket_name = parsed.netloc - s3_path = parsed.path - # Remove '/' at beginning of path. - if s3_path.startswith("/"): - s3_path = s3_path[1:] - return bucket_name, s3_path - - -def s3_request(func): - """ - Wrapper function for s3 requests in order to create more helpful error - messages. 
- """ - - @wraps(func) - def wrapper(url, *args, **kwargs): - try: - return func(url, *args, **kwargs) - except ClientError as exc: - if int(exc.response["Error"]["Code"]) == 404: - raise EnvironmentError("file {} not found".format(url)) - else: - raise - - return wrapper - - -@s3_request -def s3_etag(url, proxies=None): - """Check ETag on S3 object.""" - s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) - bucket_name, s3_path = split_s3_path(url) - s3_object = s3_resource.Object(bucket_name, s3_path) - return s3_object.e_tag - - -@s3_request -def s3_get(url, temp_file, proxies=None): - """Pull a file directly from S3.""" - s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) - bucket_name, s3_path = split_s3_path(url) - s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) - - -def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None): - ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0]) - if is_torch_available(): - ua += "; torch/{}".format(torch.__version__) - if is_tf_available(): - ua += "; tensorflow/{}".format(tf.__version__) - if isinstance(user_agent, dict): - ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items()) - elif isinstance(user_agent, str): - ua += "; " + user_agent - headers = {"user-agent": ua} - if resume_size > 0: - headers["Range"] = "bytes=%d-" % (resume_size,) - response = requests.get(url, stream=True, proxies=proxies, headers=headers) - if response.status_code == 416: # Range not satisfiable - return - content_length = response.headers.get("Content-Length") - total = resume_size + int(content_length) if content_length is not None else None - progress = tqdm( - unit="B", - unit_scale=True, - total=total, - initial=resume_size, - desc="Downloading", - disable=bool(logger.getEffectiveLevel() == logging.NOTSET), - ) - for chunk in response.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - progress.update(len(chunk)) - temp_file.write(chunk) - progress.close() - - -def get_from_cache( - url, - cache_dir=None, - force_download=False, - proxies=None, - etag_timeout=10, - resume_download=False, - user_agent=None, - local_files_only=False, -) -> Optional[str]: - """ - Given a URL, look for the corresponding file in the local cache. - If it's not there, download it. Then return the path to the cached file. - Return: - None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). - Local path (string) otherwise - """ - if cache_dir is None: - cache_dir = TRANSFORMERS_CACHE - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - os.makedirs(cache_dir, exist_ok=True) - - etag = None - if not local_files_only: - # Get eTag to add to filename, if it exists. - if url.startswith("s3://"): - etag = s3_etag(url, proxies=proxies) - else: - try: - response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout) - if response.status_code == 200: - etag = response.headers.get("ETag") - except (EnvironmentError, requests.exceptions.Timeout): - # etag is already None - pass - - filename = url_to_filename(url, etag) - - # get cache path to put the file - cache_path = os.path.join(cache_dir, filename) - - # etag is None = we don't have a connection, or url doesn't exist, or is otherwise inaccessible. 
- # try to get the last downloaded one - if etag is None: - if os.path.exists(cache_path): - return cache_path - else: - matching_files = [ - file - for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*") - if not file.endswith(".json") and not file.endswith(".lock") - ] - if len(matching_files) > 0: - return os.path.join(cache_dir, matching_files[-1]) - else: - # If files cannot be found and local_files_only=True, - # the models might've been found if local_files_only=False - # Notify the user about that - if local_files_only: - raise ValueError( - "Cannot find the requested files in the cached path and outgoing traffic has been" - " disabled. To enable model look-ups and downloads online, set 'local_files_only'" - " to False." - ) - return None - - # From now on, etag is not None. - if os.path.exists(cache_path) and not force_download: - return cache_path - - # Prevent parallel downloads of the same file with a lock. - lock_path = cache_path + ".lock" - with FileLock(lock_path): - - if resume_download: - incomplete_path = cache_path + ".incomplete" - - @contextmanager - def _resumable_file_manager(): - with open(incomplete_path, "a+b") as f: - yield f - - temp_file_manager = _resumable_file_manager - if os.path.exists(incomplete_path): - resume_size = os.stat(incomplete_path).st_size - else: - resume_size = 0 - else: - temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False) - resume_size = 0 - - # Download to temporary file, then copy to cache dir once finished. - # Otherwise you get corrupt cache entries if the download gets interrupted. - with temp_file_manager() as temp_file: - logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) - - # GET file object - if url.startswith("s3://"): - if resume_download: - logger.warn('Warning: resumable downloads are not implemented for "s3://" urls') - s3_get(url, temp_file, proxies=proxies) - else: - http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent) - - logger.info("storing %s in cache at %s", url, cache_path) - os.replace(temp_file.name, cache_path) - - logger.info("creating metadata file for %s", cache_path) - meta = {"url": url, "etag": etag} - meta_path = cache_path + ".json" - with open(meta_path, "w") as meta_file: - json.dump(meta, meta_file) - - return cache_path diff --git a/torch_bert/modeling_bert.py b/torch_bert/modeling_bert.py deleted file mode 100644 index 4a74b72..0000000 --- a/torch_bert/modeling_bert.py +++ /dev/null @@ -1,1369 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch BERT model. 
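A minimal sketch of the commit step used by get_from_cache above: download into a temporary file, atomically move it into place, then write the JSON metadata sidecar. Paths and the downloaded payload are placeholders.

import json
import os
import tempfile

def commit_to_cache(cache_path, url, etag):
    with tempfile.NamedTemporaryFile(dir=os.path.dirname(cache_path) or ".", delete=False) as temp_file:
        temp_file.write(b"...downloaded bytes...")  # placeholder for the real HTTP/S3 download
    os.replace(temp_file.name, cache_path)          # atomic rename, so no corrupt cache entries
    with open(cache_path + ".json", "w") as meta_file:
        json.dump({"url": url, "etag": etag}, meta_file)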
""" - - -import logging -import math -import os - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss - -from .activations import gelu, gelu_new, swish -from .configuration_bert import BertConfig -from .file_utils import add_start_docstrings, add_start_docstrings_to_callable -from .modeling_utils import PreTrainedModel, prune_linear_layer - - -logger = logging.getLogger(__name__) - -BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", - "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", - "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", - "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", - "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", - "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", - "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", - "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", - "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", - "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", - "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", - "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", - "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", - "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", - "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", - "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin", - "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin", - "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin", - "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin", - "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin", - "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin", - "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/pytorch_model.bin", -} - - -def 
load_tf_weights_in_bert(model, config, tf_checkpoint_path): - """ Load tf checkpoints in a pytorch model. - """ - try: - import re - import numpy as np - import tensorflow as tf - except ImportError: - logger.error( - "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions." - ) - raise - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - for name, shape in init_vars: - logger.info("Loading TF weight {} with shape {}".format(name, shape)) - array = tf.train.load_variable(tf_path, name) - names.append(name) - arrays.append(array) - - for name, array in zip(names, arrays): - name = name.split("/") - # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v - # which are not required for using pretrained model - if any( - n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] - for n in name - ): - logger.info("Skipping {}".format("/".join(name))) - continue - pointer = model - for m_name in name: - if re.fullmatch(r"[A-Za-z]+_\d+", m_name): - scope_names = re.split(r"_(\d+)", m_name) - else: - scope_names = [m_name] - if scope_names[0] == "kernel" or scope_names[0] == "gamma": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "output_bias" or scope_names[0] == "beta": - pointer = getattr(pointer, "bias") - elif scope_names[0] == "output_weights": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "squad": - pointer = getattr(pointer, "classifier") - else: - try: - pointer = getattr(pointer, scope_names[0]) - except AttributeError: - logger.info("Skipping {}".format("/".join(name))) - continue - if len(scope_names) >= 2: - num = int(scope_names[1]) - pointer = pointer[num] - if m_name[-11:] == "_embeddings": - pointer = getattr(pointer, "weight") - elif m_name == "kernel": - array = np.transpose(array) - try: - assert pointer.shape == array.shape - except AssertionError as e: - e.args += (pointer.shape, array.shape) - raise - logger.info("Initialize PyTorch weight {}".format(name)) - pointer.data = torch.from_numpy(array) - return model - - -def mish(x): - return x * torch.tanh(nn.functional.softplus(x)) - - -ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "mish": mish} - - -BertLayerNorm = torch.nn.LayerNorm - - -class BertEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings. 
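A small sketch of the name-or-callable activation dispatch that ACT2FN above supports; the hidden_act value here is illustrative, not taken from a real config.

import torch
import torch.nn as nn

def mish(x):
    return x * torch.tanh(nn.functional.softplus(x))

ACT2FN = {"relu": nn.functional.relu, "mish": mish}

hidden_act = "mish"  # config.hidden_act may be a string key or a callable
act_fn = ACT2FN[hidden_act] if isinstance(hidden_act, str) else hidden_act
print(act_fn(torch.tensor([-1.0, 0.0, 1.0])))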
- """ - - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - seq_length = input_shape[1] - device = input_ids.device if input_ids is not None else inputs_embeds.device - if position_ids is None: - position_ids = torch.arange(seq_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0).expand(input_shape) - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class BertSelfAttention(nn.Module): - def __init__(self, config): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads) - ) - self.output_attentions = config.output_attentions - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - ): - mixed_query_layer = self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. 
- if encoder_hidden_states is not None: - mixed_key_layer = self.key(encoder_hidden_states) - mixed_value_layer = self.value(encoder_hidden_states) - attention_mask = encoder_attention_mask - else: - mixed_key_layer = self.key(hidden_states) - mixed_value_layer = self.value(hidden_states) - - query_layer = self.transpose_for_scores(mixed_query_layer) - key_layer = self.transpose_for_scores(mixed_key_layer) - value_layer = self.transpose_for_scores(mixed_value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) - return outputs - - -class BertSelfOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertAttention(nn.Module): - def __init__(self, config): - super().__init__() - self.self = BertSelfAttention(config) - self.output = BertSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) - heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads - for head in heads: - # Compute how many pruned heads are before the head and move the index accordingly - head = head - sum(1 if h < head else 0 for h in self.pruned_heads) - mask[head] = 0 - mask = mask.view(-1).contiguous().eq(1) - index = torch.arange(len(mask))[mask].long() - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len(heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - 
head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - ): - self_outputs = self.self( - hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them - return outputs - - -class BertIntermediate(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class BertOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertLayer(nn.Module): - def __init__(self, config): - super().__init__() - self.attention = BertAttention(config) - self.is_decoder = config.is_decoder - if self.is_decoder: - self.crossattention = BertAttention(config) - self.intermediate = BertIntermediate(config) - self.output = BertOutput(config) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - ): - self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask) - attention_output = self_attention_outputs[0] - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights - - if self.is_decoder and encoder_hidden_states is not None: - cross_attention_outputs = self.crossattention( - attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights - - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - outputs = (layer_output,) + outputs - return outputs - - -class BertEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.output_attentions = config.output_attentions - self.output_hidden_states = config.output_hidden_states - self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - ): - all_hidden_states = () - all_attentions = () - for i, layer_module in enumerate(self.layer): - if self.output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_outputs = layer_module( - hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask - ) - hidden_states = layer_outputs[0] - - if self.output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - # Add last layer - if self.output_hidden_states: - all_hidden_states = 
all_hidden_states + (hidden_states,) - - outputs = (hidden_states,) - if self.output_hidden_states: - outputs = outputs + (all_hidden_states,) - if self.output_attentions: - outputs = outputs + (all_attentions,) - return outputs # last-layer hidden state, (all hidden states), (all attentions) - - -class BertPooler(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class BertPredictionHeadTransform(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class BertLMPredictionHead(nn.Module): - def __init__(self, config): - super().__init__() - self.transform = BertPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` - self.decoder.bias = self.bias - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class BertOnlyMLMHead(nn.Module): - def __init__(self, config): - super().__init__() - self.predictions = BertLMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class BertOnlyNSPHead(nn.Module): - def __init__(self, config): - super().__init__() - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score - - -class BertPreTrainingHeads(nn.Module): - def __init__(self, config): - super().__init__() - self.predictions = BertLMPredictionHead(config) - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, sequence_output, pooled_output): - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -class BertPreTrainedModel(PreTrainedModel): - """ An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. 
- """ - - config_class = BertConfig - pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP - load_tf_weights = load_tf_weights_in_bert - base_model_prefix = "bert" - - def _init_weights(self, module): - """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - elif isinstance(module, BertLayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - -BERT_START_DOCSTRING = r""" - This model is a PyTorch `torch.nn.Module `_ sub-class. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general - usage and behavior. - Parameters: - config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. -""" - -BERT_INPUTS_DOCSTRING = r""" - Args: - input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. - Indices can be obtained using :class:`transformers.BertTokenizer`. - See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Segment token indices to indicate first and second portions of the inputs. - Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` - corresponds to a `sentence B` token - `What are token type IDs? <../glossary.html#token-type-ids>`_ - position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1]``. - `What are position IDs? <../glossary.html#position-ids>`_ - head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): - Mask to nullify selected heads of the self-attention modules. - Mask values selected in ``[0, 1]``: - :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. 
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - if the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask - is used in the cross-attention if the model is configured as a decoder. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. -""" - - -@add_start_docstrings( - "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", - BERT_START_DOCSTRING, -) -class BertModel(BertPreTrainedModel): - """ - The model can behave as an encoder (with only self-attention) as well - as a decoder, in which case a layer of cross-attention is added between - the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, - Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - To behave as an decoder the model needs to be initialized with the - :obj:`is_decoder` argument of the configuration set to :obj:`True`; an - :obj:`encoder_hidden_states` is expected as an input to the forward pass. - .. _`Attention is all you need`: - https://arxiv.org/abs/1706.03762 - """ - - def __init__(self, config): - super().__init__(config) - self.config = config - - self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) - self.pooler = BertPooler(config) - - self.init_weights() - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ Prunes heads of the model. - heads_to_prune: dict of {layer_num: list of heads to prune in this layer} - See base class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - ): - r""" - Return: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during pre-training. - This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. 
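The pooler_output caveat above suggests averaging the token states instead; a minimal masked mean-pooling sketch, with purely illustrative tensor shapes:

import torch

last_hidden_state = torch.randn(2, 7, 768)                   # [bsz, seq_len, hidden_size]
attention_mask = torch.tensor([[1] * 7, [1] * 4 + [0] * 3])  # [bsz, seq_len], 0 marks padding

mask = attention_mask.unsqueeze(-1).float()     # [bsz, seq_len, 1]
summed = (last_hidden_state * mask).sum(dim=1)  # zero out padded positions, then sum over tokens
counts = mask.sum(dim=1).clamp(min=1.0)         # number of real tokens, guarded against zero
sentence_embeddings = summed / counts           # [bsz, hidden_size]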
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - Examples:: - from transformers import BertModel, BertTokenizer - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertModel.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids) - last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple - """ - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if attention_mask is None: - attention_mask = torch.ones(input_shape, device=device) - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
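A hedged sketch of what the extended-mask conversion typically does (the actual helper, get_extended_attention_mask, lives in modeling_utils): the [batch_size, seq_length] 0/1 padding mask is broadcast to [batch_size, 1, 1, seq_length] and turned into an additive mask of zeros and large negative values.

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])               # [bsz, seq_len]
extended = attention_mask[:, None, None, :].to(torch.float32)  # [bsz, 1, 1, seq_len]
extended = (1.0 - extended) * -10000.0                         # padded positions become -10000
print(extended)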
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, self.device - ) - - # If a 2D ou 3D attention mask is provided for the cross-attention - # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - embedding_output = self.embeddings( - input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) - - outputs = (sequence_output, pooled_output,) + encoder_outputs[ - 1: - ] # add hidden_states and attentions if they are here - return outputs # sequence_output, pooled_output, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and - a `next sentence prediction (classification)` head. """, - BERT_START_DOCSTRING, -) -class BertForPreTraining(BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - masked_lm_labels=None, - next_sentence_label=None, - ): - r""" - masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): - Labels for computing the masked language modeling loss. - Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` - next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring) - Indices should be in ``[0, 1]``. - ``0`` indicates sequence B is a continuation of sequence A, - ``1`` indicates sequence B is a random sequence. 
- Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. - prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False - continuation before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - Examples:: - from transformers import BertTokenizer, BertForPreTraining - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForPreTraining.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids) - prediction_scores, seq_relationship_scores = outputs[:2] - """ - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - ) - - sequence_output, pooled_output = outputs[:2] - prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) - - outputs = (prediction_scores, seq_relationship_score,) + outputs[ - 2: - ] # add hidden states and attention if they are here - - if masked_lm_labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - outputs = (total_loss,) + outputs - - return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) - - -@add_start_docstrings("""Bert Model with a `language modeling` head on top. 
""", BERT_START_DOCSTRING) -class BertForMaskedLM(BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - self.cls = BertOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - masked_lm_labels=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - lm_labels=None, - ): - r""" - masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Labels for computing the masked language modeling loss. - Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` - lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Labels for computing the left-to-right language modeling loss (next word prediction). - Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Masked language modeling loss. - ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided): - Next token prediction loss. - prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- Examples:: - from transformers import BertTokenizer, BertForMaskedLM - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForMaskedLM.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, masked_lm_labels=input_ids) - loss, prediction_scores = outputs[:2] - """ - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here - - # Although this may seem awkward, BertForMaskedLM supports two scenarios: - # 1. If a tensor that contains the indices of masked labels is provided, - # the cross-entropy is the MLM cross-entropy that measures the likelihood - # of predictions for masked words. - # 2. If `lm_labels` is provided we are in a causal scenario where we - # try to predict the next token for each input in the decoder. - if masked_lm_labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) - outputs = (masked_lm_loss,) + outputs - - if lm_labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - prediction_scores = prediction_scores[:, :-1, :].contiguous() - lm_labels = lm_labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1)) - outputs = (ltr_lm_loss,) + outputs - - return outputs # (masked_lm_loss), (ltr_lm_loss), prediction_scores, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, -) -class BertForNextSentencePrediction(BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - self.cls = BertOnlyNSPHead(config) - - self.init_weights() - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - next_sentence_label=None, - ): - r""" - next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) - Indices should be in ``[0, 1]``. - ``0`` indicates sequence B is a continuation of sequence A, - ``1`` indicates sequence B is a random sequence. - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): - Next sequence prediction (classification) loss. 
- seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - Examples:: - from transformers import BertTokenizer, BertForNextSentencePrediction - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids) - seq_relationship_scores = outputs[0] - """ - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - ) - - pooled_output = outputs[1] - - seq_relationship_score = self.cls(pooled_output) - - outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here - if next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - outputs = (next_sentence_loss,) + outputs - - return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of - the pooled output) e.g. for GLUE tasks. """, - BERT_START_DOCSTRING, -) -class BertForSequenceClassification(BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for computing the sequence classification/regression loss. - Indices should be in :obj:`[0, ..., config.num_labels - 1]`. - If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): - Classification (or regression if config.num_labels==1) loss. - logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - Examples:: - from transformers import BertTokenizer, BertForSequenceClassification - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForSequenceClassification.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, labels=labels) - loss, logits = outputs[:2] - """ - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - - if labels is not None: - if self.num_labels == 1: - # We are doing regression - loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - outputs = (loss,) + outputs - - return outputs # (loss), logits, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with a multiple choice classification head on top (a linear layer on top of - the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - BERT_START_DOCSTRING, -) -class BertForMultipleChoice(BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, 1) - - self.init_weights() - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for computing the multiple choice classification loss. - Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension - of the input tensors. 
(see `input_ids` above) - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): - Classification loss. - classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): - `num_choices` is the second dimension of the input tensors. (see `input_ids` above). - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - Examples:: - from transformers import BertTokenizer, BertForMultipleChoice - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForMultipleChoice.from_pretrained('bert-base-uncased') - choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] - input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices - labels = torch.tensor(1).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, labels=labels) - loss, classification_scores = outputs[:2] - """ - num_choices = input_ids.shape[1] - - input_ids = input_ids.view(-1, input_ids.size(-1)) - attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None - position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, num_choices) - - outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here - - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - outputs = (loss,) + outputs - - return outputs # (loss), reshaped_logits, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with a token classification head on top (a linear layer on top of - the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
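A shape sketch of the choice-flattening trick used by BertForMultipleChoice above: choices are folded into the batch dimension before the encoder and unfolded again when scoring. Sizes are illustrative.

import torch

bsz, num_choices, seq_len = 2, 4, 16
input_ids = torch.randint(0, 100, (bsz, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))  # [bsz * num_choices, seq_len]
logits = torch.randn(bsz * num_choices, 1)               # stand-in for the classifier output
reshaped_logits = logits.view(-1, num_choices)           # back to [bsz, num_choices]
print(flat_input_ids.shape, reshaped_logits.shape)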
""", - BERT_START_DOCSTRING, -) -class BertForTokenClassification(BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Labels for computing the token classification loss. - Indices should be in ``[0, ..., config.num_labels - 1]``. - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : - Classification loss. - scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- Examples:: - from transformers import BertTokenizer, BertForTokenClassification - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForTokenClassification.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, labels=labels) - loss, scores = outputs[:2] - """ - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) - ) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - outputs = (loss,) + outputs - - return outputs # (loss), scores, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - BERT_START_DOCSTRING, -) -class BertForQuestionAnswering(BertPreTrainedModel): - def __init__(self, config): - super(BertForQuestionAnswering, self).__init__(config) - self.num_labels = config.num_labels - - self.bert = BertModel(config) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - ): - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. 
- start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - Examples:: - from transformers import BertTokenizer, BertForQuestionAnswering - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') - question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" - encoding = tokenizer.encode_plus(question, text) - input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] - start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) - all_tokens = tokenizer.convert_ids_to_tokens(input_ids) - answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) - assert answer == "a nice puppet" - """ - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - - outputs = (start_logits, end_logits,) + outputs[2:] - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - outputs = (total_loss,) + outputs - - return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) diff --git a/torch_bert/modeling_utils.py b/torch_bert/modeling_utils.py deleted file mode 100644 index d88e2a7..0000000 --- a/torch_bert/modeling_utils.py +++ /dev/null @@ -1,2022 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# When am I ever going to get through all 2000 lines of the code below ^^ -# hahaha -# -"""PyTorch BERT model.""" - -import logging -import os -from typing import Callable, Tuple - -import torch -from torch import Tensor, device, dtype, nn -from torch.nn import CrossEntropyLoss -from torch.nn import functional as F - -from activations import get_activation -from configuration_bert import PretrainedConfig -from file_utils import ( - DUMMY_INPUTS, - TF2_WEIGHTS_NAME, - TF_WEIGHTS_NAME, - WEIGHTS_NAME, - cached_path, - hf_bucket_url, - is_remote_url, -) - - -logger = logging.getLogger(__name__) - - -try: - from torch.nn import Identity -except ImportError: - # Older PyTorch compatibility - class Identity(nn.Module): - r"""A placeholder identity operator that is argument-insensitive. - """ - - def __init__(self, *args, **kwargs): - super().__init__() - - def forward(self, input): - return input - - -class ModuleUtilsMixin: - """ - A few utilities for torch.nn.Modules, to be used as a mixin. - """ - - def num_parameters(self, only_trainable: bool = False) -> int: - """ - Get number of (optionally, trainable) parameters in the module. - """ - params = filter(lambda x: x.requires_grad, self.parameters()) if only_trainable else self.parameters() - return sum(p.numel() for p in params) - - @staticmethod - def _hook_rss_memory_pre_forward(module, *args, **kwargs): - try: - import psutil - except (ImportError): - raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") - - process = psutil.Process(os.getpid()) - mem = process.memory_info() - module.mem_rss_pre_forward = mem.rss - return None - - @staticmethod - def _hook_rss_memory_post_forward(module, *args, **kwargs): - try: - import psutil - except (ImportError): - raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") - - process = psutil.Process(os.getpid()) - mem = process.memory_info() - module.mem_rss_post_forward = mem.rss - mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward - module.mem_rss_diff = mem_rss_diff + (module.mem_rss_diff if hasattr(module, "mem_rss_diff") else 0) - return None - - def add_memory_hooks(self): - """ Add a memory hook before and after each sub-module forward pass to record increase in memory consumption.
- Increase in memory consumption is stored in a `mem_rss_diff` attribute for each module and can be reset to zero with `model.reset_memory_hooks_state()` - """ - for module in self.modules(): - module.register_forward_pre_hook(self._hook_rss_memory_pre_forward) - module.register_forward_hook(self._hook_rss_memory_post_forward) - self.reset_memory_hooks_state() - - def reset_memory_hooks_state(self): - for module in self.modules(): - module.mem_rss_diff = 0 - module.mem_rss_post_forward = 0 - module.mem_rss_pre_forward = 0 - - @property - def device(self) -> device: - return next(self.parameters()).device - - @property - def dtype(self) -> dtype: - return next(self.parameters()).dtype - - def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: - """type: torch.Tensor -> torch.Tensor""" - if encoder_attention_mask.dim() == 3: - encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] - if encoder_attention_mask.dim() == 2: - encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] - # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition - # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow - # /transformer/transformer_layers.py#L270 - # encoder_extended_attention_mask = (encoder_extended_attention_mask == - # encoder_extended_attention_mask.transpose(-1, -2)) - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility - encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 - return encoder_extended_attention_mask - - def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: tuple, device: device): - """Makes broadcastable attention mask and causal mask so that future and maked tokens are ignored. - Arguments: - attention_mask: torch.Tensor with 1 indicating tokens to ATTEND to - input_shape: tuple, shape of input_ids - device: torch.Device, usually self.device - Returns: - torch.Tensor with dtype of attention_mask.dtype - """ - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - if attention_mask.dim() == 3: - extended_attention_mask = attention_mask[:, None, :, :] - elif attention_mask.dim() == 2: - # Provided a padding mask of dimensions [batch_size, seq_length] - # - if the model is a decoder, apply a causal mask in addition to the padding mask - # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder: - batch_size, seq_length = input_shape - seq_ids = torch.arange(seq_length, device=device) - causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] - # causal and attention masks must have same type with pytorch version < 1.3 - causal_mask = causal_mask.to(attention_mask.dtype) - extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] - else: - extended_attention_mask = attention_mask[:, None, None, :] - else: - raise ValueError( - "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( - input_shape, attention_mask.shape - ) - ) - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. 
- # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 - return extended_attention_mask - - def get_head_mask(self, head_mask, num_hidden_layers): - """ - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - attention_probs has shape bsz x n_heads x N x N - Arguments: - head_mask: torch.Tensor or None: has shape [num_heads] or [num_hidden_layers x num_heads] - num_hidden_layers: int - Returns: - Tensor of shape shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - or list with [None] for each layer - """ - if head_mask is not None: - head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) - else: - head_mask = [None] * num_hidden_layers - - return head_mask - - def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): - """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" - if head_mask.dim() == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) - head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) - elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" - head_mask = head_mask.to(dtype=self.dtype) # switch to fload if need + fp16 compatibility - return head_mask - - -class PreTrainedModel(nn.Module, ModuleUtilsMixin): - r""" Base class for all models. - :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models - as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. - Class attributes (overridden by derived classes): - - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. - - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values. - - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: - - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`, - - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`, - - ``path``: a path (string) to the TensorFlow checkpoint. - - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. - """ - config_class = None - pretrained_model_archive_map = {} - base_model_prefix = "" - - @property - def dummy_inputs(self): - """ Dummy inputs to do a forward pass in the network. - Returns: - torch.Tensor with dummy inputs - """ - return {"input_ids": torch.tensor(DUMMY_INPUTS)} - - def __init__(self, config, *inputs, **kwargs): - super().__init__() - if not isinstance(config, PretrainedConfig): - raise ValueError( - "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. 
" - "To create a model from a pretrained model use " - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( - self.__class__.__name__, self.__class__.__name__ - ) - ) - # Save config in model - self.config = config - - @property - def base_model(self): - return getattr(self, self.base_model_prefix, self) - - def get_input_embeddings(self): - """ - Returns the model's input embeddings. - Returns: - :obj:`nn.Module`: - A torch module mapping vocabulary to hidden states. - """ - base_model = getattr(self, self.base_model_prefix, self) - if base_model is not self: - return base_model.get_input_embeddings() - else: - raise NotImplementedError - - def set_input_embeddings(self, value): - """ - Set model's input embeddings - Args: - value (:obj:`nn.Module`): - A module mapping vocabulary to hidden states. - """ - base_model = getattr(self, self.base_model_prefix, self) - if base_model is not self: - base_model.set_input_embeddings(value) - else: - raise NotImplementedError - - def get_output_embeddings(self): - """ - Returns the model's output embeddings. - Returns: - :obj:`nn.Module`: - A torch module mapping hidden states to vocabulary. - """ - return None # Overwrite for models with output embeddings - - def tie_weights(self): - """ - Tie the weights between the input embeddings and the output embeddings. - If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning - the weights instead. - """ - output_embeddings = self.get_output_embeddings() - if output_embeddings is not None: - self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) - - def _tie_or_clone_weights(self, output_embeddings, input_embeddings): - """ Tie or clone module weights depending of weither we are using TorchScript or not - """ - if self.config.torchscript: - output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone()) - else: - output_embeddings.weight = input_embeddings.weight - - if getattr(output_embeddings, "bias", None) is not None: - output_embeddings.bias.data = torch.nn.functional.pad( - output_embeddings.bias.data, - (0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],), - "constant", - 0, - ) - if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"): - output_embeddings.out_features = input_embeddings.num_embeddings - - def resize_token_embeddings(self, new_num_tokens=None): - """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. - Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. - Arguments: - new_num_tokens: (`optional`) int: - New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. - If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model. 
- Return: ``torch.nn.Embeddings`` - Pointer to the input tokens Embeddings Module of the model - """ - base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed - model_embeds = base_model._resize_token_embeddings(new_num_tokens) - if new_num_tokens is None: - return model_embeds - - # Update base model and current model config - self.config.vocab_size = new_num_tokens - base_model.vocab_size = new_num_tokens - - # Tie weights again if needed - self.tie_weights() - - return model_embeds - - def _resize_token_embeddings(self, new_num_tokens): - old_embeddings = self.get_input_embeddings() - new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) - self.set_input_embeddings(new_embeddings) - return self.get_input_embeddings() - - def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): - """ Build a resized Embedding Module from a provided token Embedding Module. - Increasing the size will add newly initialized vectors at the end - Reducing the size will remove vectors from the end - Args: - new_num_tokens: (`optional`) int - New number of tokens in the embedding matrix. - Increasing the size will add newly initialized vectors at the end - Reducing the size will remove vectors from the end - If not provided or None: return the provided token Embedding Module. - Return: ``torch.nn.Embeddings`` - Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None - """ - if new_num_tokens is None: - return old_embeddings - - old_num_tokens, old_embedding_dim = old_embeddings.weight.size() - if old_num_tokens == new_num_tokens: - return old_embeddings - - # Build new embeddings - new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) - new_embeddings.to(old_embeddings.weight.device) - - # initialize all new embeddings (in particular added tokens) - self._init_weights(new_embeddings) - - # Copy token embeddings from the previous weights - num_tokens_to_copy = min(old_num_tokens, new_num_tokens) - new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] - - return new_embeddings - - def init_weights(self): - """ Initialize and prunes weights if needed. """ - # Initialize weights - self.apply(self._init_weights) - - # Prune heads if needed - if self.config.pruned_heads: - self.prune_heads(self.config.pruned_heads) - - # Tie weights if needed - self.tie_weights() - - def prune_heads(self, heads_to_prune): - """ Prunes heads of the base model. - Arguments: - heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). - E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. - """ - # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads - for layer, heads in heads_to_prune.items(): - union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) - self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON - - self.base_model._prune_heads(heads_to_prune) - - def save_pretrained(self, save_directory): - """ Save a model and its configuration file to a directory, so that it - can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method. - Arguments: - save_directory: directory to which to save. 
- """ - assert os.path.isdir( - save_directory - ), "Saving path should be a directory where the model and configuration can be saved" - - # Only save the model itself if we are using distributed training - model_to_save = self.module if hasattr(self, "module") else self - - # Attach architecture to the config - model_to_save.config.architectures = [model_to_save.__class__.__name__] - - # If we save using the predefined names, we can load using `from_pretrained` - output_model_file = os.path.join(save_directory, WEIGHTS_NAME) - - if getattr(self.config, "xla_device", False): - import torch_xla.core.xla_model as xm - - if xm.is_master_ordinal(): - # Save configuration file - model_to_save.config.save_pretrained(save_directory) - # xm.save takes care of saving only from master - xm.save(model_to_save.state_dict(), output_model_file) - else: - model_to_save.config.save_pretrained(save_directory) - torch.save(model_to_save.state_dict(), output_model_file) - - logger.info("Model weights saved in {}".format(output_model_file)) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - r"""Instantiate a pretrained pytorch model from a pre-trained model configuration. - The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated) - To train the model, you should first set it back in training mode with ``model.train()`` - The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. - It is up to you to train those weights with a downstream fine-tuning task. - The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. - Parameters: - pretrained_model_name_or_path: either: - - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. - - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``) - model_args: (`optional`) Sequence of positional arguments: - All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) one of: - - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or - - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()` - Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. 
- - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. - state_dict: (`optional`) dict: - an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. - This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. - cache_dir: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - force_download: (`optional`) boolean, default False: - Force to (re-)download the model weights and configuration files and override the cached versions if they exists. - resume_download: (`optional`) boolean, default False: - Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - output_loading_info: (`optional`) boolean: - Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. - kwargs: (`optional`) Remaining dictionary of keyword arguments: - Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. - Examples:: - # For example purposes. Not runnable. - model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. - model = BertModel.from_pretrained('./test/saved_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` - model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading - assert model.config.output_attention == True - # Loading from a TF checkpoint file instead of a PyTorch model (slower) - config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') - model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) - """ - config = kwargs.pop("config", None) - state_dict = kwargs.pop("state_dict", None) - cache_dir = kwargs.pop("cache_dir", None) - from_tf = kwargs.pop("from_tf", False) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - output_loading_info = kwargs.pop("output_loading_info", False) - local_files_only = kwargs.pop("local_files_only", False) - - # Load config if we don't provide a configuration - if not isinstance(config, PretrainedConfig): - config_path = config if config is not None else pretrained_model_name_or_path - config, model_kwargs = cls.config_class.from_pretrained( - config_path, - *model_args, - cache_dir=cache_dir, - return_unused_kwargs=True, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - **kwargs, - ) - else: - model_kwargs = kwargs - - # Load model - if pretrained_model_name_or_path is not None: - if pretrained_model_name_or_path in cls.pretrained_model_archive_map: - archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path] - elif os.path.isdir(pretrained_model_name_or_path): - if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")): - # Load from a TF 1.0 checkpoint - archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index") - elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): - # Load from a TF 2.0 checkpoint - archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) - elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): - # Load from a PyTorch checkpoint - archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) - else: - raise EnvironmentError( - "Error no file named {} found in directory {} or `from_tf` set to False".format( - [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], - pretrained_model_name_or_path, - ) - ) - elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): - archive_file = pretrained_model_name_or_path - elif os.path.isfile(pretrained_model_name_or_path + ".index"): - assert ( - from_tf - ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( - pretrained_model_name_or_path + ".index" - ) - archive_file = pretrained_model_name_or_path + ".index" - else: - archive_file = hf_bucket_url( - pretrained_model_name_or_path, postfix=(TF2_WEIGHTS_NAME if from_tf else WEIGHTS_NAME), - ) - - # redirect to the cache, if necessary - try: - resolved_archive_file = cached_path( - archive_file, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - ) - except EnvironmentError: - if pretrained_model_name_or_path in cls.pretrained_model_archive_map: - msg = "Couldn't reach server at '{}' to download pretrained 
weights.".format(archive_file) - else: - msg = ( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url to model weight files named one of {} but " - "couldn't find any such file at this path or url.".format( - pretrained_model_name_or_path, - ", ".join(cls.pretrained_model_archive_map.keys()), - archive_file, - [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME], - ) - ) - raise EnvironmentError(msg) - - if resolved_archive_file == archive_file: - logger.info("loading weights file {}".format(archive_file)) - else: - logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file)) - else: - resolved_archive_file = None - - # Instantiate model. - model = cls(config, *model_args, **model_kwargs) - - if state_dict is None and not from_tf: - try: - state_dict = torch.load(resolved_archive_file, map_location="cpu") - except Exception: - raise OSError( - "Unable to load weights from pytorch checkpoint file. " - "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. " - ) - - missing_keys = [] - unexpected_keys = [] - error_msgs = [] - - if from_tf: - if resolved_archive_file.endswith(".index"): - # Load from a TensorFlow 1.X checkpoint - provided by original authors - model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index' - else: - # Load from our TensorFlow 2.0 checkpoints - try: - from transformers import load_tf2_checkpoint_in_pytorch_model - - model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True) - except ImportError: - logger.error( - "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " - "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." - ) - raise - else: - # Convert old format to new format if needed from a PyTorch state_dict - old_keys = [] - new_keys = [] - for key in state_dict.keys(): - new_key = None - if "gamma" in key: - new_key = key.replace("gamma", "weight") - if "beta" in key: - new_key = key.replace("beta", "bias") - if new_key: - old_keys.append(key) - new_keys.append(new_key) - for old_key, new_key in zip(old_keys, new_keys): - state_dict[new_key] = state_dict.pop(old_key) - - # copy state_dict so _load_from_state_dict can modify it - metadata = getattr(state_dict, "_metadata", None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants - # so we need to apply the function recursively. - def load(module: nn.Module, prefix=""): - local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict( - state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs, - ) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + ".") - - # Make sure we are able to load base models as well as derived models (with heads) - start_prefix = "" - model_to_load = model - has_prefix_module = any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()) - if not hasattr(model, cls.base_model_prefix) and has_prefix_module: - start_prefix = cls.base_model_prefix + "." 
- if hasattr(model, cls.base_model_prefix) and not has_prefix_module: - model_to_load = getattr(model, cls.base_model_prefix) - - load(model_to_load, prefix=start_prefix) - - if model.__class__.__name__ != model_to_load.__class__.__name__: - base_model_state_dict = model_to_load.state_dict().keys() - head_model_state_dict_without_base_prefix = [ - key.split(cls.base_model_prefix + ".")[-1] for key in model.state_dict().keys() - ] - - missing_keys.extend(head_model_state_dict_without_base_prefix - base_model_state_dict) - - if len(missing_keys) > 0: - logger.info( - "Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, missing_keys - ) - ) - if len(unexpected_keys) > 0: - logger.info( - "Weights from pretrained model not used in {}: {}".format( - model.__class__.__name__, unexpected_keys - ) - ) - if len(error_msgs) > 0: - raise RuntimeError( - "Error(s) in loading state_dict for {}:\n\t{}".format( - model.__class__.__name__, "\n\t".join(error_msgs) - ) - ) - model.tie_weights() # make sure token embedding weights are still tied if needed - - # Set model in evaluation mode to deactivate DropOut modules by default - model.eval() - - if output_loading_info: - loading_info = { - "missing_keys": missing_keys, - "unexpected_keys": unexpected_keys, - "error_msgs": error_msgs, - } - return model, loading_info - - if hasattr(config, "xla_device") and config.xla_device: - import torch_xla.core.xla_model as xm - - model = xm.send_cpu_data_to_device(model, xm.xla_device()) - model = model.to(xm.xla_device()) - - return model - - def prepare_inputs_for_generation(self, input_ids, **kwargs): - return {"input_ids": input_ids} - - def prepare_scores_for_generation(self, scores, **kwargs): - return scores - - def _use_cache(self, outputs, use_cache): - """During generation, decide whether to pass the `past` variable to the next forward pass.""" - if len(outputs) <= 1 or use_cache is False: - return False - if hasattr(self.config, "mem_len") and self.config.mem_len == 0: - return False - return True - - def enforce_repetition_penalty_(self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty): - """repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858). """ - for i in range(batch_size * num_beams): - for previous_token in set(prev_output_tokens[i].tolist()): - # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if lprobs[i, previous_token] < 0: - lprobs[i, previous_token] *= repetition_penalty - else: - lprobs[i, previous_token] /= repetition_penalty - - @torch.no_grad() - def generate( - self, - input_ids=None, - max_length=None, - min_length=None, - do_sample=None, - early_stopping=None, - num_beams=None, - temperature=None, - top_k=None, - top_p=None, - repetition_penalty=None, - bad_words_ids=None, - bos_token_id=None, - pad_token_id=None, - eos_token_id=None, - length_penalty=None, - no_repeat_ngram_size=None, - num_return_sequences=None, - attention_mask=None, - decoder_start_token_id=None, - use_cache=None, - ): - r""" Generates sequences for models with a LM head. The method currently supports greedy decoding, beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling. - Adapted in part from `Facebook's XLM beam search code`_. - .. 
_`Facebook's XLM beam search code`: - https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 - Parameters: - input_ids: (`optional`) `torch.LongTensor` of shape `(batch_size, sequence_length)` - The sequence used as a prompt for the generation. If `None` the method initializes - it as an empty `torch.LongTensor` of shape `(1,)`. - max_length: (`optional`) int - The max length of the sequence to be generated. Between `min_length` and infinity. Default to 20. - min_length: (`optional`) int - The min length of the sequence to be generated. Between 0 and infinity. Default to 0. - do_sample: (`optional`) bool - If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. - early_stopping: (`optional`) bool - if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. - num_beams: (`optional`) int - Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. - temperature: (`optional`) float - The value used to module the next token probabilities. Must be strictly positive. Default to 1.0. - top_k: (`optional`) int - The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. - top_p: (`optional`) float - The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. - repetition_penalty: (`optional`) float - The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. - pad_token_id: (`optional`) int - Padding token. Default to specicic model pad_token_id or None if it does not exist. - bos_token_id: (`optional`) int - BOS token. Defaults to `bos_token_id` as defined in the models config. - eos_token_id: (`optional`) int - EOS token. Defaults to `eos_token_id` as defined in the models config. - length_penalty: (`optional`) float - Exponential penalty to the length. Default to 1. - no_repeat_ngram_size: (`optional`) int - If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. - bad_words_ids: (`optional`) list of lists of int - `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. - num_return_sequences: (`optional`) int - The number of independently computed returned sequences for each element in the batch. Default to 1. - attention_mask (`optional`) obj: `torch.LongTensor` of same shape as `input_ids` - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - Defaults to `None`. - `What are attention masks? <../glossary.html#attention-mask>`__ - decoder_start_token_id=None: (`optional`) int - If an encoder-decoder model starts decoding with a different token than BOS. - Defaults to `None` and is changed to `BOS` later. - use_cache: (`optional`) bool - If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`. 
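Note: the `top_k` and `top_p` parameters documented above are applied through a `top_k_top_p_filtering` helper that this diff references later but does not show. A minimal sketch of the usual top-k / nucleus filtering logic, with function name and shapes assumed here for illustration:

    import torch
    import torch.nn.functional as F

    def top_k_top_p_filter(logits, top_k=0, top_p=1.0, filter_value=-float("inf")):
        """Mask logits outside the top-k / nucleus (top-p) set with filter_value."""
        logits = logits.clone()
        if top_k > 0:
            top_k = min(top_k, logits.size(-1))
            # Drop every token whose logit is below the k-th largest logit.
            kth_value = torch.topk(logits, top_k)[0][..., -1, None]
            logits[logits < kth_value] = filter_value
        if top_p < 1.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
            # Tokens past the top_p cumulative mass are dropped, but the first
            # token that crosses the threshold is always kept.
            sorted_mask = cumulative_probs > top_p
            sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
            sorted_mask[..., 0] = False
            mask = sorted_mask.scatter(dim=-1, index=sorted_indices, src=sorted_mask)
            logits[mask] = filter_value
        return logits

    # Usage: keep the 5 most likely tokens within the 0.9 nucleus, then sample.
    logits = torch.randn(2, 50)
    probs = F.softmax(top_k_top_p_filter(logits, top_k=5, top_p=0.9), dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)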
- Return: - output: `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)` - sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id` - Examples:: - tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer - model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. - outputs = model.generate(max_length=40) # do greedy decoding - print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) - tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer - model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. - input_context = 'The dog' - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context - outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' - for i in range(3): # 3 output sequences were generated - print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) - tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer - model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. - input_context = 'The dog' - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context - outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling - for i in range(3): # 3 output sequences were generated - print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) - tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer - model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. - input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context - outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences - print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) - tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer - model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. - input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl - bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context - outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated - """ - - # We cannot generate if the model does not have a LM head - if self.get_output_embeddings() is None: - raise AttributeError( - "You tried to generate sequences with a model that does not have a LM Head." - "Please use another model class (e.g. 
`OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`, `XLMWithLMHeadModel`, `BartForConditionalGeneration` )" - ) - - max_length = max_length if max_length is not None else self.config.max_length - min_length = min_length if min_length is not None else self.config.min_length - do_sample = do_sample if do_sample is not None else self.config.do_sample - early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping - use_cache = use_cache if use_cache is not None else self.config.use_cache - num_beams = num_beams if num_beams is not None else self.config.num_beams - temperature = temperature if temperature is not None else self.config.temperature - top_k = top_k if top_k is not None else self.config.top_k - top_p = top_p if top_p is not None else self.config.top_p - repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty - bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id - pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id - length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty - no_repeat_ngram_size = ( - no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size - ) - bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids - num_return_sequences = ( - num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences - ) - decoder_start_token_id = ( - decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id - ) - - if input_ids is not None: - batch_size = input_ids.shape[0] # overriden by the input batch_size - else: - batch_size = 1 - - assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer." - assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." - assert isinstance(do_sample, bool), "`do_sample` should be a boolean." - assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." - assert isinstance(use_cache, bool), "`use_cache` should be a boolean." - assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer." - assert temperature > 0, "`temperature` should be strictly positive." - assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." - assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." - assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." - assert input_ids is not None or ( - isinstance(bos_token_id, int) and bos_token_id >= 0 - ), "If input_ids is not defined, `bos_token_id` should be a positive integer." - assert pad_token_id is None or ( - isinstance(pad_token_id, int) and (pad_token_id >= 0) - ), "`pad_token_id` should be a positive integer." - assert (eos_token_id is None) or ( - isinstance(eos_token_id, int) and (eos_token_id >= 0) - ), "`eos_token_id` should be a positive integer." - assert length_penalty > 0, "`length_penalty` should be strictly positive." - assert ( - isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0 - ), "`no_repeat_ngram_size` should be a positive integer." 
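Note: the `enforce_repetition_penalty_` method defined earlier in this file multiplies negative scores and divides positive ones, so a previously generated token becomes less likely in either case. A small numeric illustration of that asymmetry (standalone sketch with made-up values, not the method itself):

    import torch

    def apply_repetition_penalty(logits, generated_ids, penalty):
        """Discourage already-generated tokens (CTRL-style penalty)."""
        logits = logits.clone()
        for i in range(logits.size(0)):
            for token in set(generated_ids[i].tolist()):
                if logits[i, token] < 0:
                    logits[i, token] *= penalty   # more negative -> less likely
                else:
                    logits[i, token] /= penalty   # smaller positive -> less likely
        return logits

    # With penalty=1.2: a score of 2.0 becomes ~1.67, a score of -2.0 becomes -2.4.
    scores = torch.tensor([[2.0, -2.0, 0.5]])
    print(apply_repetition_penalty(scores, torch.tensor([[0, 1]]), 1.2))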
- assert ( - isinstance(num_return_sequences, int) and num_return_sequences > 0 - ), "`num_return_sequences` should be a strictly positive integer." - assert ( - bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) - ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" - - if input_ids is None: - assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( - "you should either supply a context to complete as `input_ids` input " - "or a `bos_token_id` (integer >= 0) as a first token to start the generation." - ) - input_ids = torch.full( - (batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device, - ) - else: - assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)." - - # not allow to duplicate outputs when greedy decoding - if do_sample is False: - if num_beams == 1: - # no_beam_search greedy generation conditions - assert ( - num_return_sequences == 1 - ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" - - else: - # beam_search greedy generation conditions - assert ( - num_beams >= num_return_sequences - ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" - - # create attention mask if necessary - # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 - if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids): - attention_mask = input_ids.ne(pad_token_id).long() - elif attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - # set pad_token_id to eos_token_id if not set. 
Important that this is done after - # attention_mask is created - if pad_token_id is None and eos_token_id is not None: - logger.warning( - "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) - ) - pad_token_id = eos_token_id - - # current position and vocab size - vocab_size = self.config.vocab_size - - # set effective batch size and effective batch multiplier according to do_sample - if do_sample: - effective_batch_size = batch_size * num_return_sequences - effective_batch_mult = num_return_sequences - else: - effective_batch_size = batch_size - effective_batch_mult = 1 - - if self.config.is_encoder_decoder: - if decoder_start_token_id is None: - decoder_start_token_id = bos_token_id - - assert ( - decoder_start_token_id is not None - ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" - assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) - assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) - - # get encoder and store encoder outputs - encoder = self.get_encoder() - - encoder_outputs: tuple = encoder(input_ids, attention_mask=attention_mask) - - # Expand input ids if num_beams > 1 or num_return_sequences > 1 - if num_return_sequences > 1 or num_beams > 1: - input_ids_len = input_ids.shape[-1] - input_ids = input_ids.unsqueeze(1).expand(batch_size, effective_batch_mult * num_beams, input_ids_len) - attention_mask = attention_mask.unsqueeze(1).expand( - batch_size, effective_batch_mult * num_beams, input_ids_len - ) - - input_ids = input_ids.contiguous().view( - effective_batch_size * num_beams, input_ids_len - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) - attention_mask = attention_mask.contiguous().view( - effective_batch_size * num_beams, input_ids_len - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) - - if self.config.is_encoder_decoder: - # create empty decoder_input_ids - input_ids = torch.full( - (effective_batch_size * num_beams, 1), - decoder_start_token_id, - dtype=torch.long, - device=next(self.parameters()).device, - ) - cur_len = 1 - - assert ( - batch_size == encoder_outputs[0].shape[0] - ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " - - # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) - expanded_batch_idxs = ( - torch.arange(batch_size) - .view(-1, 1) - .repeat(1, num_beams * effective_batch_mult) - .view(-1) - .to(input_ids.device) - ) - # expand encoder_outputs - encoder_outputs = (encoder_outputs[0].index_select(0, expanded_batch_idxs), *encoder_outputs[1:]) - - else: - encoder_outputs = None - cur_len = input_ids.shape[-1] - - if num_beams > 1: - output = self._generate_beam_search( - input_ids, - cur_len=cur_len, - max_length=max_length, - min_length=min_length, - do_sample=do_sample, - early_stopping=early_stopping, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - bos_token_id=bos_token_id, - pad_token_id=pad_token_id, - decoder_start_token_id=decoder_start_token_id, - eos_token_id=eos_token_id, - batch_size=effective_batch_size, - num_return_sequences=num_return_sequences, - length_penalty=length_penalty, - num_beams=num_beams, - vocab_size=vocab_size, - encoder_outputs=encoder_outputs, - 
attention_mask=attention_mask, - use_cache=use_cache, - ) - else: - output = self._generate_no_beam_search( - input_ids, - cur_len=cur_len, - max_length=max_length, - min_length=min_length, - do_sample=do_sample, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - bos_token_id=bos_token_id, - pad_token_id=pad_token_id, - decoder_start_token_id=decoder_start_token_id, - eos_token_id=eos_token_id, - batch_size=effective_batch_size, - encoder_outputs=encoder_outputs, - attention_mask=attention_mask, - use_cache=use_cache, - ) - - return output - - def _generate_no_beam_search( - self, - input_ids, - cur_len, - max_length, - min_length, - do_sample, - temperature, - top_k, - top_p, - repetition_penalty, - no_repeat_ngram_size, - bad_words_ids, - bos_token_id, - pad_token_id, - eos_token_id, - decoder_start_token_id, - batch_size, - encoder_outputs, - attention_mask, - use_cache, - ): - """ Generate sequences for each example without beam search (num_beams == 1). - All returned sequence are generated independantly. - """ - # length of generated sentences / unfinished sentences - unfinished_sents = input_ids.new(batch_size).fill_(1) - sent_lengths = input_ids.new(batch_size).fill_(max_length) - - past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models - - while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation( - input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache - ) - - outputs = self(**model_inputs) - next_token_logits = outputs[0][:, -1, :] - - # if model has past, then set the past variable to speed up decoding - if self._use_cache(outputs, use_cache): - past = outputs[1] - - # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) - if repetition_penalty != 1.0: - self.enforce_repetition_penalty_(next_token_logits, batch_size, 1, input_ids, repetition_penalty) - - if no_repeat_ngram_size > 0: - # calculate a list of banned tokens to prevent repetitively generating the same ngrams - # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) - for batch_idx in range(batch_size): - next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float("inf") - - if bad_words_ids is not None: - # calculate a list of banned tokens according to bad words - banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) - - for batch_idx in range(batch_size): - next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float("inf") - - # set eos token prob to zero if min_length is not reached - if eos_token_id is not None and cur_len < min_length: - next_token_logits[:, eos_token_id] = -float("inf") - - if do_sample: - # Temperature (higher temperature => more likely to sample low probability tokens) - if temperature != 1.0: - next_token_logits = next_token_logits / temperature - # Top-p/top-k filtering - next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) - # Sample - probs = F.softmax(next_token_logits, dim=-1) - next_token = torch.multinomial(probs, num_samples=1).squeeze(1) - else: - # Greedy decoding - next_token = torch.argmax(next_token_logits, dim=-1) - - # update generations and finished sentences - if eos_token_id is not None: - # pad finished sentences if eos_token_id exist 
- tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) - else: - tokens_to_add = next_token - - input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1) - - if eos_token_id is not None: - eos_in_sents = tokens_to_add == eos_token_id - # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length - is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul(eos_in_sents.long()).bool() - sent_lengths.masked_fill_(is_sents_unfinished_and_token_to_add_is_eos, cur_len + 1) - # unfinished_sents is set to zero if eos in sentence - unfinished_sents.mul_((~eos_in_sents).long()) - - # stop when there is a in each sentence, or if we exceed the maximul length - if unfinished_sents.max() == 0: - break - - # extend attention_mask for new generated input if only decoder - if self.config.is_encoder_decoder is False: - attention_mask = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - cur_len = cur_len + 1 - - # if there are different sentences lengths in the batch, some batches have to be padded - if sent_lengths.min().item() != sent_lengths.max().item(): - assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths" - # finished sents are filled with pad_token - decoded = input_ids.new(batch_size, sent_lengths.max().item()).fill_(pad_token_id) - else: - decoded = input_ids - - for hypo_idx, hypo in enumerate(input_ids): - decoded[hypo_idx, : sent_lengths[hypo_idx]] = hypo[: sent_lengths[hypo_idx]] - - return decoded - - def _generate_beam_search( - self, - input_ids, - cur_len, - max_length, - min_length, - do_sample, - early_stopping, - temperature, - top_k, - top_p, - repetition_penalty, - no_repeat_ngram_size, - bad_words_ids, - bos_token_id, - pad_token_id, - eos_token_id, - decoder_start_token_id, - batch_size, - num_return_sequences, - length_penalty, - num_beams, - vocab_size, - encoder_outputs, - attention_mask, - use_cache, - ): - """ Generate sequences for each example with beam search. 
- """ - - # generated hypotheses - generated_hyps = [ - BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) - for _ in range(batch_size) - ] - - # scores for each sentence in the beam - beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) - - # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times - if do_sample is False: - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) - - # cache compute states - past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models - - # done sentences - done = [False for _ in range(batch_size)] - - while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation( - input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache - ) - outputs = self(**model_inputs) # (batch_size * num_beams, cur_len, vocab_size) - next_token_logits = outputs[0][:, -1, :] # (batch_size * num_beams, vocab_size) - - # if model has past, then set the past variable to speed up decoding - if self._use_cache(outputs, use_cache): - past = outputs[1] - - # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) - if repetition_penalty != 1.0: - self.enforce_repetition_penalty_( - next_token_logits, batch_size, num_beams, input_ids, repetition_penalty, - ) - - if temperature != 1.0: - next_token_logits = next_token_logits / temperature - - scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size) - if self.config.is_encoder_decoder and do_sample is False: - # TODO (PVP) still a bit hacky here - there might be a better solutino - scores = self.prepare_scores_for_generation(scores, cur_len=cur_len, max_length=max_length) - - # set eos token prob to zero if min_length is not reached - if eos_token_id is not None and cur_len < min_length: - scores[:, eos_token_id] = -float("inf") - - if no_repeat_ngram_size > 0: - # calculate a list of banned tokens to prevent repetitively generating the same ngrams - num_batch_hypotheses = batch_size * num_beams - # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - banned_batch_tokens = calc_banned_ngram_tokens( - input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len - ) - for i, banned_tokens in enumerate(banned_batch_tokens): - scores[i, banned_tokens] = -float("inf") - - if bad_words_ids is not None: - # calculate a list of banned tokens according to bad words - banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) - - for i, banned_tokens in enumerate(banned_tokens): - scores[i, banned_tokens] = -float("inf") - - assert scores.shape == (batch_size * num_beams, vocab_size), "Shapes of scores: {} != {}".format( - scores.shape, (batch_size * num_beams, vocab_size) - ) - - if do_sample: - _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) - # Top-p/top-k filtering - _scores = top_k_top_p_filtering( - _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 - ) # (batch_size * num_beams, vocab_size) - # re-organize to group the beam together to sample from all beam_idxs - _scores = _scores.contiguous().view( - batch_size, num_beams * vocab_size - ) # (batch_size, num_beams * vocab_size) - - # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) - probs = 
F.softmax(_scores, dim=-1) - next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) # (batch_size, num_beams * 2) - # Compute next scores - next_scores = torch.gather(_scores, -1, next_tokens) # (batch_size, num_beams * 2) - # sort the sampled vector to make sure that the first num_beams samples are the best - next_scores, next_scores_indices = torch.sort(next_scores, descending=True, dim=1) - next_tokens = torch.gather(next_tokens, -1, next_scores_indices) # (batch_size, num_beams * 2) - - else: - next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) - - # re-organize to group the beam together (we are keeping top hypothesis accross beams) - next_scores = next_scores.view( - batch_size, num_beams * vocab_size - ) # (batch_size, num_beams * vocab_size) - - next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True) - - assert next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams) - - # next batch beam content - next_batch_beam = [] - - # for each sentence - for batch_idx in range(batch_size): - - # if we are done with this sentence - if done[batch_idx]: - assert ( - len(generated_hyps[batch_idx]) >= num_beams - ), "Batch can only be done if at least {} beams have been generated".format(num_beams) - assert ( - eos_token_id is not None and pad_token_id is not None - ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" - next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch - continue - - # next sentence beam content - next_sent_beam = [] - - # next tokens for this sentence - for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( - zip(next_tokens[batch_idx], next_scores[batch_idx]) - ): - # get beam and token IDs - beam_id = beam_token_id // vocab_size - token_id = beam_token_id % vocab_size - - effective_beam_id = batch_idx * num_beams + beam_id - # add to generated hypotheses if end of sentence or last iteration - if (eos_token_id is not None) and (token_id.item() == eos_token_id): - # if beam_token does not belong to top num_beams tokens, it should not be added - is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams - if is_beam_token_worse_than_top_num_beams: - continue - generated_hyps[batch_idx].add( - input_ids[effective_beam_id].clone(), beam_token_score.item(), - ) - else: - # add next predicted token if it is not eos_token - next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) - - # the beam for next step is full - if len(next_sent_beam) == num_beams: - break - - # Check if were done so that we can save a pad step if all(done) - done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( - next_scores[batch_idx].max().item(), cur_len=cur_len - ) - - # update next beam content - assert len(next_sent_beam) == num_beams, "Beam should always be full" - next_batch_beam.extend(next_sent_beam) - assert len(next_batch_beam) == num_beams * (batch_idx + 1) - - # stop when we are done with each sentence - if all(done): - break - - # sanity check / prepare next batch - assert len(next_batch_beam) == batch_size * num_beams - beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) - beam_tokens = input_ids.new([x[1] for x in next_batch_beam]) - beam_idx = input_ids.new([x[2] for x in next_batch_beam]) - - # re-order batch - input_ids = input_ids[beam_idx, :] - input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1) - # re-order internal states - if past 
is not None: - past = self._reorder_cache(past, beam_idx) - - # extend attention_mask for new generated input if only decoder - if self.config.is_encoder_decoder is False: - attention_mask = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - # update current length - cur_len = cur_len + 1 - - # finalize all open beam hypotheses and end to generated hypotheses - for batch_idx in range(batch_size): - if done[batch_idx]: - continue - - # test that beam scores match previously calculated scores if not eos and batch_idx not done - if eos_token_id is not None and all( - (token_id % vocab_size).item() is not eos_token_id for token_id in next_tokens[batch_idx] - ): - assert torch.all( - next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx] - ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( - next_scores[:, :num_beams][batch_idx], beam_scores.view(batch_size, num_beams)[batch_idx], - ) - - # need to add best num_beams hypotheses to generated hyps - for beam_id in range(num_beams): - effective_beam_id = batch_idx * num_beams + beam_id - final_score = beam_scores[effective_beam_id].item() - final_tokens = input_ids[effective_beam_id] - generated_hyps[batch_idx].add(final_tokens, final_score) - - # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch - output_batch_size = batch_size if do_sample else batch_size * num_return_sequences - output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences - - # select the best hypotheses - sent_lengths = input_ids.new(output_batch_size) - best = [] - - # retrieve best hypotheses - for i, hypotheses in enumerate(generated_hyps): - sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) - for j in range(output_num_return_sequences_per_batch): - effective_batch_idx = output_num_return_sequences_per_batch * i + j - best_hyp = sorted_hyps.pop()[1] - sent_lengths[effective_batch_idx] = len(best_hyp) - best.append(best_hyp) - - # shorter batches are filled with pad_token - if sent_lengths.min().item() != sent_lengths.max().item(): - assert pad_token_id is not None, "`Pad_token_id` has to be defined" - sent_max_len = min(sent_lengths.max().item() + 1, max_length) - decoded = input_ids.new(output_batch_size, sent_max_len).fill_(pad_token_id) - - # fill with hypothesis and eos_token_id if necessary - for i, hypo in enumerate(best): - decoded[i, : sent_lengths[i]] = hypo - if sent_lengths[i] < max_length: - decoded[i, sent_lengths[i]] = eos_token_id - else: - # none of the hypotheses have an eos_token - assert (len(hypo) == max_length for hypo in best) - decoded = torch.stack(best).type(torch.long).to(next(self.parameters()).device) - - return decoded - - # force one of token_ids to be generated by setting prob of all other tokens to 0. 
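A minimal usage sketch (not part of the diffed file): the public generate() entry point is what dispatches into the beam-search routine above once num_beams > 1. The checkpoint name and hyper-parameter values below are illustrative assumptions, not values taken from this code.

import torch
from transformers import BartForConditionalGeneration, BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")  # assumed available
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

text = "PyTorch is an open source machine learning framework."
input_ids = tokenizer.encode(text, return_tensors="pt")

with torch.no_grad():
    summary_ids = model.generate(
        input_ids,
        num_beams=4,             # num_beams > 1 selects the beam-search branch above
        max_length=40,
        min_length=5,
        no_repeat_ngram_size=3,  # enforced through calc_banned_ngram_tokens
        early_stopping=True,
    )
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))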
- def _force_token_ids_generation(self, scores, token_ids): - if isinstance(token_ids, int): - token_ids = [token_ids] - all_but_token_ids_mask = torch.tensor( - [x for x in range(self.config.vocab_size) if x not in token_ids], - dtype=torch.long, - device=next(self.parameters()).device, - ) - assert len(scores.shape) == 2, "scores should be of rank 2 with shape: [batch_size, vocab_size]" - scores[:, all_but_token_ids_mask] = -float("inf") - - @staticmethod - def _reorder_cache(past: Tuple, beam_idx: Tensor) -> Tuple[Tensor]: - return tuple(layer_past.index_select(1, beam_idx) for layer_past in past) - - -def calc_banned_ngram_tokens(prev_input_ids: Tensor, num_hypos: int, no_repeat_ngram_size: int, cur_len: int) -> None: - """Copied from fairseq for no_repeat_ngram in beam_search""" - if cur_len + 1 < no_repeat_ngram_size: - # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet - return [[] for _ in range(num_hypos)] - generated_ngrams = [{} for _ in range(num_hypos)] - for idx in range(num_hypos): - gen_tokens = prev_input_ids[idx].tolist() - generated_ngram = generated_ngrams[idx] - for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): - prev_ngram_tuple = tuple(ngram[:-1]) - generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] - - def _get_generated_ngrams(hypo_idx): - # Before decoding the next token, prevent decoding of ngrams that have already appeared - start_idx = cur_len + 1 - no_repeat_ngram_size - ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].tolist()) - return generated_ngrams[hypo_idx].get(ngram_idx, []) - - banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] - return banned_tokens - - -def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids): - banned_tokens = [] - - def _tokens_match(prev_tokens, tokens): - if len(tokens) == 0: - # if bad word tokens is just one token always ban it - return True - if len(tokens) > len(prev_input_ids): - # if bad word tokens are longer then prev input_ids they can't be equal - return False - - if prev_tokens[-len(tokens) :] == tokens: - # if tokens match - return True - else: - return False - - for prev_input_ids_slice in prev_input_ids: - banned_tokens_slice = [] - - for banned_token_seq in bad_words_ids: - assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( - bad_words_ids - ) - - if _tokens_match(prev_input_ids_slice.tolist(), banned_token_seq[:-1]) is False: - # if tokens do not match continue - continue - - banned_tokens_slice.append(banned_token_seq[-1]) - - banned_tokens.append(banned_tokens_slice) - - return banned_tokens - - -def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1): - """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering - Args: - logits: logits distribution shape (batch size, vocabulary size) - if top_k > 0: keep only top k tokens with highest probability (top-k filtering). - if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). - Nucleus filtering is described in Holtzman et al. 
(http://arxiv.org/abs/1904.09751) - Make sure we keep at least min_tokens_to_keep per batch example in the output - From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - """ - if top_k > 0: - top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check - # Remove all tokens with a probability less than the last token of the top-k - indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] - logits[indices_to_remove] = filter_value - - if top_p < 1.0: - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) - - # Remove tokens with cumulative probability above the threshold (token with 0 are kept) - sorted_indices_to_remove = cumulative_probs > top_p - if min_tokens_to_keep > 1: - # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) - sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 - # Shift the indices to the right to keep also the first token above the threshold - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - - # scatter sorted tensors to original indexing - indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) - logits[indices_to_remove] = filter_value - return logits - - -class BeamHypotheses(object): - def __init__(self, num_beams, max_length, length_penalty, early_stopping): - """ - Initialize n-best list of hypotheses. - """ - self.max_length = max_length - 1 # ignoring bos_token - self.length_penalty = length_penalty - self.early_stopping = early_stopping - self.num_beams = num_beams - self.beams = [] - self.worst_score = 1e9 - - def __len__(self): - """ - Number of hypotheses in the list. - """ - return len(self.beams) - - def add(self, hyp, sum_logprobs): - """ - Add a new hypothesis to the list. - """ - score = sum_logprobs / len(hyp) ** self.length_penalty - if len(self) < self.num_beams or score > self.worst_score: - self.beams.append((score, hyp)) - if len(self) > self.num_beams: - sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) - del self.beams[sorted_scores[0][1]] - self.worst_score = sorted_scores[1][0] - else: - self.worst_score = min(score, self.worst_score) - - def is_done(self, best_sum_logprobs, cur_len=None): - """ - If there are enough hypotheses and that none of the hypotheses being generated - can become better than the worst one in the heap, then we are done with this sentence. - """ - - if len(self) < self.num_beams: - return False - elif self.early_stopping: - return True - else: - if cur_len is None: - cur_len = self.max_length - cur_score = best_sum_logprobs / cur_len ** self.length_penalty - ret = self.worst_score >= cur_score - return ret - - -class Conv1D(nn.Module): - def __init__(self, nf, nx): - """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) - Basically works like a Linear layer but the weights are transposed - """ - super().__init__() - self.nf = nf - w = torch.empty(nx, nf) - nn.init.normal_(w, std=0.02) - self.weight = nn.Parameter(w) - self.bias = nn.Parameter(torch.zeros(nf)) - - def forward(self, x): - size_out = x.size()[:-1] + (self.nf,) - x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) - x = x.view(*size_out) - return x - - -class PoolerStartLogits(nn.Module): - """ Compute SQuAD start_logits from sequence hidden states. 
""" - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, 1) - - def forward(self, hidden_states, p_mask=None): - """ Args: - **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)` - invalid position mask such as query and special symbols (PAD, SEP, CLS) - 1.0 means token should be masked. - """ - x = self.dense(hidden_states).squeeze(-1) - - if p_mask is not None: - if next(self.parameters()).dtype == torch.float16: - x = x * (1 - p_mask) - 65500 * p_mask - else: - x = x * (1 - p_mask) - 1e30 * p_mask - - return x - - -class PoolerEndLogits(nn.Module): - """ Compute SQuAD end_logits from sequence hidden states and start token hidden state. - """ - - def __init__(self, config): - super().__init__() - self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) - self.activation = nn.Tanh() - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dense_1 = nn.Linear(config.hidden_size, 1) - - def forward(self, hidden_states, start_states=None, start_positions=None, p_mask=None): - """ Args: - One of ``start_states``, ``start_positions`` should be not None. - If both are set, ``start_positions`` overrides ``start_states``. - **start_states**: ``torch.LongTensor`` of shape identical to hidden_states - hidden states of the first tokens for the labeled span. - **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` - position of the first token for the labeled span: - **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` - Mask of invalid position such as query and special symbols (PAD, SEP, CLS) - 1.0 means token should be masked. - """ - assert ( - start_states is not None or start_positions is not None - ), "One of start_states, start_positions should be not None" - if start_positions is not None: - slen, hsz = hidden_states.shape[-2:] - start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) - start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) - start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) - - x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) - x = self.activation(x) - x = self.LayerNorm(x) - x = self.dense_1(x).squeeze(-1) - - if p_mask is not None: - if next(self.parameters()).dtype == torch.float16: - x = x * (1 - p_mask) - 65500 * p_mask - else: - x = x * (1 - p_mask) - 1e30 * p_mask - - return x - - -class PoolerAnswerClass(nn.Module): - """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """ - - def __init__(self, config): - super().__init__() - self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) - self.activation = nn.Tanh() - self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) - - def forward(self, hidden_states, start_states=None, start_positions=None, cls_index=None): - """ - Args: - One of ``start_states``, ``start_positions`` should be not None. - If both are set, ``start_positions`` overrides ``start_states``. - **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``. - hidden states of the first tokens for the labeled span. - **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` - position of the first token for the labeled span. - **cls_index**: torch.LongTensor of shape ``(batch_size,)`` - position of the CLS token. If None, take the last token. 
- note(Original repo): - no dependency on end_feature so that we can obtain one single `cls_logits` - for each sample - """ - hsz = hidden_states.shape[-1] - assert ( - start_states is not None or start_positions is not None - ), "One of start_states, start_positions should be not None" - if start_positions is not None: - start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) - start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) - - if cls_index is not None: - cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) - cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) - else: - cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) - - x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) - x = self.activation(x) - x = self.dense_1(x).squeeze(-1) - - return x - - -class SQuADHead(nn.Module): - r""" A SQuAD head inspired by XLNet. - Parameters: - config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model. - Inputs: - **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)`` - hidden states of sequence tokens - **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` - position of the first token for the labeled span. - **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` - position of the last token for the labeled span. - **cls_index**: torch.LongTensor of shape ``(batch_size,)`` - position of the CLS token. If None, take the last token. - **is_impossible**: ``torch.LongTensor`` of shape ``(batch_size,)`` - Whether the question has a possible answer in the paragraph or not. - **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` - Mask of invalid position such as query and special symbols (PAD, SEP, CLS) - 1.0 means token should be masked. - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: - Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. - **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)`` - Log probabilities for the top config.start_n_top start token possibilities (beam-search). - **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)`` - Indices for the top config.start_n_top start token possibilities (beam-search). - **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` - Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). - **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` - Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). 
- **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.FloatTensor`` of shape ``(batch_size,)`` - Log probabilities for the ``is_impossible`` label of the answers. - """ - - def __init__(self, config): - super().__init__() - self.start_n_top = config.start_n_top - self.end_n_top = config.end_n_top - - self.start_logits = PoolerStartLogits(config) - self.end_logits = PoolerEndLogits(config) - self.answer_class = PoolerAnswerClass(config) - - def forward( - self, hidden_states, start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None, - ): - outputs = () - - start_logits = self.start_logits(hidden_states, p_mask=p_mask) - - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, let's remove the dimension added by batch splitting - for x in (start_positions, end_positions, cls_index, is_impossible): - if x is not None and x.dim() > 1: - x.squeeze_(-1) - - # during training, compute the end logits based on the ground truth of the start position - end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) - - loss_fct = CrossEntropyLoss() - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if cls_index is not None and is_impossible is not None: - # Predict answerability from the representation of CLS and START - cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) - loss_fct_cls = nn.BCEWithLogitsLoss() - cls_loss = loss_fct_cls(cls_logits, is_impossible) - - # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss - total_loss += cls_loss * 0.5 - - outputs = (total_loss,) + outputs - - else: - # during inference, compute the end logits based on beam search - bsz, slen, hsz = hidden_states.size() - start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) - - start_top_log_probs, start_top_index = torch.topk( - start_log_probs, self.start_n_top, dim=-1 - ) # shape (bsz, start_n_top) - start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) - start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) - start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) - - hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( - start_states - ) # shape (bsz, slen, start_n_top, hsz) - p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None - end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) - end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) - - end_top_log_probs, end_top_index = torch.topk( - end_log_probs, self.end_n_top, dim=1 - ) # shape (bsz, end_n_top, start_n_top) - end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) - end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) - - start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) - cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) - - outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits,) + outputs - - # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits - # or (if labels are 
provided) (total_loss,) - return outputs - - -class SequenceSummary(nn.Module): - r""" Compute a single vector summary of a sequence hidden states according to various possibilities: - Args of the config class: - summary_type: - - 'last' => [default] take the last token hidden state (like XLNet) - - 'first' => take the first token hidden state (like Bert) - - 'mean' => take the mean of all tokens hidden states - - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - - 'attn' => Not implemented now, use multi-head attention - summary_use_proj: Add a projection after the vector extraction - summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. - summary_activation: 'tanh' or another string => add an activation to the output, Other => no activation. Default - summary_first_dropout: Add a dropout before the projection and activation - summary_last_dropout: Add a dropout after the projection and activation - """ - - def __init__(self, config: PretrainedConfig): - super().__init__() - - self.summary_type = getattr(config, "summary_type", "last") - if self.summary_type == "attn": - # We should use a standard multi-head attention module with absolute positional embedding for that. - # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 - # We can probably just use the multi-head attention module of PyTorch >=1.1.0 - raise NotImplementedError - - self.summary = Identity() - if hasattr(config, "summary_use_proj") and config.summary_use_proj: - if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: - num_classes = config.num_labels - else: - num_classes = config.hidden_size - self.summary = nn.Linear(config.hidden_size, num_classes) - - activation_string = getattr(config, "summary_activation", None) - self.activation: Callable = (get_activation(activation_string) if activation_string else Identity()) - - self.first_dropout = Identity() - if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: - self.first_dropout = nn.Dropout(config.summary_first_dropout) - - self.last_dropout = Identity() - if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: - self.last_dropout = nn.Dropout(config.summary_last_dropout) - - def forward(self, hidden_states, cls_index=None): - """ hidden_states: float Tensor in shape [bsz, ..., seq_len, hidden_size], the hidden-states of the last layer. - cls_index: [optional] position of the classification token if summary_type == 'cls_index', - shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states. 
- if summary_type == 'cls_index' and cls_index is None: - we take the last token of the sequence as classification token - """ - if self.summary_type == "last": - output = hidden_states[:, -1] - elif self.summary_type == "first": - output = hidden_states[:, 0] - elif self.summary_type == "mean": - output = hidden_states.mean(dim=1) - elif self.summary_type == "cls_index": - if cls_index is None: - cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2] - 1, dtype=torch.long,) - else: - cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) - cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) - # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states - output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) - elif self.summary_type == "attn": - raise NotImplementedError - - output = self.first_dropout(output) - output = self.summary(output) - output = self.activation(output) - output = self.last_dropout(output) - - return output - - -def create_position_ids_from_input_ids(input_ids, padding_idx): - """ Replace non-padding symbols with their position numbers. Position numbers begin at - padding_idx+1. Padding symbols are ignored. This is modified from fairseq's - `utils.make_positions`. - :param torch.Tensor x: - :return torch.Tensor: - """ - # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. - mask = input_ids.ne(padding_idx).int() - incremental_indicies = torch.cumsum(mask, dim=1).type_as(mask) * mask - return incremental_indicies.long() + padding_idx - - -def prune_linear_layer(layer, index, dim=0): - """ Prune a linear layer (a model parameters) to keep only entries in index. - Return the pruned layer as a new layer with requires_grad=True. - Used to remove heads. - """ - index = index.to(layer.weight.device) - W = layer.weight.index_select(dim, index).clone().detach() - if layer.bias is not None: - if dim == 1: - b = layer.bias.clone().detach() - else: - b = layer.bias[index].clone().detach() - new_size = list(layer.weight.size()) - new_size[dim] = len(index) - new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) - new_layer.weight.requires_grad = False - new_layer.weight.copy_(W.contiguous()) - new_layer.weight.requires_grad = True - if layer.bias is not None: - new_layer.bias.requires_grad = False - new_layer.bias.copy_(b.contiguous()) - new_layer.bias.requires_grad = True - return new_layer - - -def prune_conv1d_layer(layer, index, dim=1): - """ Prune a Conv1D layer (a model parameters) to keep only entries in index. - A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed. - Return the pruned layer as a new layer with requires_grad=True. - Used to remove heads. 
- """ - index = index.to(layer.weight.device) - W = layer.weight.index_select(dim, index).clone().detach() - if dim == 0: - b = layer.bias.clone().detach() - else: - b = layer.bias[index].clone().detach() - new_size = list(layer.weight.size()) - new_size[dim] = len(index) - new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device) - new_layer.weight.requires_grad = False - new_layer.weight.copy_(W.contiguous()) - new_layer.weight.requires_grad = True - new_layer.bias.requires_grad = False - new_layer.bias.copy_(b.contiguous()) - new_layer.bias.requires_grad = True - return new_layer - - -def prune_layer(layer, index, dim=None): - """ Prune a Conv1D or nn.Linear layer (a model parameters) to keep only entries in index. - Return the pruned layer as a new layer with requires_grad=True. - Used to remove heads. - """ - if isinstance(layer, nn.Linear): - return prune_linear_layer(layer, index, dim=0 if dim is None else dim) - elif isinstance(layer, Conv1D): - return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim) - else: - raise ValueError("Can't prune layer of class {}".format(layer.__class__)) diff --git a/torch_bert/test.py b/torch_bert/test.py deleted file mode 100644 index 98ea690..0000000 --- a/torch_bert/test.py +++ /dev/null @@ -1,31 +0,0 @@ -class Test: - # kwargs pop test - 200420 - def __init__(self, **kwargs): - print(kwargs) - self.pos = kwargs.pop('pos', 5) - self.image = kwargs.pop('image', 'i love you') - - # class_method argument feeding test - 200420 - @classmethod - def from_pretrained(cls, *input, **kwargs): - return cls._from_pretrained(*input, **kwargs) - - @classmethod - def _from_pretrained(cls, pretrained_model_name, cache_dir=None, *input, **kwargs): - print(cls.prep) - print(pretrained_model_name) - print(cache_dir) - return None - -class BertTest(Test): - - prep = ['lol lol lol'] - - def __init__(self, **kwargs): - super().__init__() - - -if __name__ == '__main__': - B = BertTest() - B.from_pretrained('a') - print(5 * [0]) diff --git a/torch_bert/tokenization_bert.py b/torch_bert/tokenization_bert.py deleted file mode 100644 index f8f40ec..0000000 --- a/torch_bert/tokenization_bert.py +++ /dev/null @@ -1,495 +0,0 @@ -# https://mrcoding.tistory.com/entry/아톰에서-파이썬-스크립트-실행시-한글-깨짐현상-잡는-꿀팁 -import sys -import io -sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8') -sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8') - -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# -# 형태소분석 기반 BERT를 위한 Tokenization Class -# 수정: joonho.lim -# 일자: 2019-05-23 -# -# -# Morph와 Eojeol 버전 통합 -# 수정: MyungHoon.jin -# 일자: 2020-04-20 - -import collections -import logging -import os -import unicodedata -from typing import List, Optional - -from tokenization_utils import PretrainedTokenizer - -# Huggingface 소스 파일 -# VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} -# -# PRETRAINED_VOCAB_FILES_MAP = { -# "vocab_file": { -# "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", -# "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", -# "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", -# "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", -# "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", -# "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", -# "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", -# "bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", -# "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", -# "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", -# "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", -# "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", -# "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", -# "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", -# "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", -# "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", -# "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt", -# "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt", -# } -# } -# -# PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { -# "bert-base-uncased": 512, -# "bert-large-uncased": 512, -# "bert-base-cased": 512, -# "bert-large-cased": 512, -# "bert-base-multilingual-uncased": 512, -# "bert-base-multilingual-cased": 512, -# "bert-base-chinese": 512, -# "bert-base-german-cased": 512, -# "bert-large-uncased-whole-word-masking": 512, -# "bert-large-cased-whole-word-masking": 512, -# "bert-large-uncased-whole-word-masking-finetuned-squad": 512, -# "bert-large-cased-whole-word-masking-finetuned-squad": 512, -# "bert-base-cased-finetuned-mrpc": 512, -# "bert-base-german-dbmdz-cased": 512, -# "bert-base-german-dbmdz-uncased": 512, -# "bert-base-finnish-cased-v1": 512, -# "bert-base-finnish-uncased-v1": 512, -# 
"bert-base-dutch-cased": 512, -# } -# -# PRETRAINED_INIT_CONFIGURATION = { -# "bert-base-uncased": {"do_lower_case": True}, -# "bert-large-uncased": {"do_lower_case": True}, -# "bert-base-cased": {"do_lower_case": False}, -# "bert-large-cased": {"do_lower_case": False}, -# "bert-base-multilingual-uncased": {"do_lower_case": True}, -# "bert-base-multilingual-cased": {"do_lower_case": False}, -# "bert-base-chinese": {"do_lower_case": False}, -# "bert-base-german-cased": {"do_lower_case": False}, -# "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, -# "bert-large-cased-whole-word-masking": {"do_lower_case": False}, -# "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, -# "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, -# "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, -# "bert-base-german-dbmdz-cased": {"do_lower_case": False}, -# "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, -# "bert-base-finnish-cased-v1": {"do_lower_case": False}, -# "bert-base-finnish-uncased-v1": {"do_lower_case": True}, -# "bert-base-dutch-cased": {"do_lower_case": False}, -# } - -logger = logging.getLogger(__name__) - -PRETRAINED_VOCAB_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", -} -PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { - 'bert-base-uncased': 512, - 'bert-large-uncased': 512, - 'bert-base-cased': 512, - 'bert-large-cased': 512, - 'bert-base-multilingual-uncased': 512, - 'bert-base-multilingual-cased': 512, - 'bert-base-chinese': 512, -} -VOCAB_NAME = 'vocab.txt' - - -def load_vocab(vocab_file, encoding="utf-8"): - vocab = collections.OrderedDict() - index = 0 - # huggingface 코드에서는 단순하게 `.readlines()` 메서드로 구현 - with open(vocab_file, "r", encoding=encoding) as reader: - while True: - token = reader.readline() - # token = convert_to_unicode(token) - if not token: - break - # ETRI Vocab을 위한 코드 - if token.find('n_iters=') == 0 or token.find('max_length=') == 0: - continue - # index 1은 빈도수, 빈도수가 제일 높은 token부터 numbering - token = token.split('\t')[0].strip() - vocab[token] = index - index += 1 - return vocab - - -# text 단위 공백 처리 -def whitespace_tokenize(text): - """Run basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class BertTokenizer(PretrainedTokenizer): - - vocab_file_names = VOCAB_NAME - pretrained_vocab_files_map = PRETRAINED_VOCAB_ARCHIVE_MAP - # pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - max_model_input_sizes = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP - - def __init__(self, - vocab_file, - do_lower_case=False, - do_basic_tokenize=True, - never_split=None, - unk_token="[UNK]", - sep_token="[SEP]", - pad_token="[PAD]", - 
cls_token="[CLS]", - mask_token="[MASK]", - **kwargs): - super().__init__( - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - **kwargs, - ) - if not os.path.isfile(vocab_file): - raise ValueError( - "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) - ) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict( - [(ids, token) for token, ids in self.vocab.items()] - ) - self.do_basic_tokenize = do_basic_tokenize - if do_basic_tokenize: - self.basic_tokenizer = BasicTokenizer( - do_lower_case=do_lower_case, - never_split=never_split, - ) - self.wordpiece_tokenizer = WordpieceTokenizer( - vocab=self.vocab, - unk_token=self.unk_token - ) - - @property - def vocab_size(self): - return len(self.vocab) - - def get_vocab(self): - # added_tokens_encoder는 추가할 때 필요, default == {} - return dict(self.vocab, **self.added_tokens_encoder) - - def tokenize(self, text): - split_tokens = [] - if self.do_basic_tokenize: - for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - token += '_' # ETRI BERT에서의 차이점. - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - else: - split_tokens = self.wordpiece_tokenizer.tokenize(text) - return split_tokens - - def build_inputs_with_special_tokens(self, - token_ids_0: List[int], token_ids_1: Optional[List[int]]=None - ) -> List[int]: - """ - sequence 분류 task를 위한 model input build! - - single sequence: ``[CLS] A [SEP]`` - - pair of sequence: ``[CLS] A [SEP] B [SEP]`` - """ - cls = [self.cls_token_id] - sep = [self.sep_token_id] - if token_ids_1 is None: - return cls + token_ids_0 + sep - return cls + token_ids_0 + sep + token_ids_1 + sep - - def get_special_tokens_mask(self, - token_ids_0: List[int], token_ids_1: Optional[List[int]]=None, - already_has_special_tokens: bool=False) -> List[int]: - """ - special token이 추가되지 않은 list에서 sequence ids를 검색 - ``prepare_for_model``, ``encode_plus`` 메서드로 special tokens을 - 추가할 때 호출됨 - """ - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - - def create_token_type_ids_from_sequences(self, - token_ids_0: List[int], token_ids_1: Optional[List[int]]=None - ) -> List[int]: - """ - sequence pair 분류 문제를 위해 concat mask를 생성 - - :: - - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - - 만일 token_ids_1이 None이면 0으로 채워진 mask를 반환 - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] - - def save_vocabulary(self, vocab_path): - pass - - -class BasicTokenizer: - - """Run basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, do_lower_case=False, never_split=[], - tokenize_chinese_chars=True): - self.do_lower_case = do_lower_case - self.never_split = never_split - self.tokenize_chinese_chars = tokenize_chinese_chars - - def tokenize(self, text, never_split=[]): - never_split = self.never_split + never_split - text = self._clean_text(text) - # Chinese Char은 무시한다. - orig_token = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case and token not in self.never_split: - # 형태소 분석기를 사용할 경우 do_lower_case를 False로 설정할 것. - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - output_tokens = whitespace_tokenize(" ".join(split_token)) - return output_tokens - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] # char을 저장한 list 생성 - for char in text: - # 텍스트에서 char 단위로 출력 - cp = ord(char) - if cp == 0 or cp == 0xfffd or self._is_control(char): - # \x00이거나 �이거나 unicode cat.이 C로 시작할 경우 - # (개행문자 제외) output에 추가하지 않는다. 
- continue - if self._is_whitespace(char): - # 공백일 경우 " "으로 output에 추가 - output.append(" ") - else: - # 이 외의 경우 전부 output에 추가 - output.append(char) - # cleaning 작업을 거친 text를 후처리하여 반환 - return "".join(output) - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - # https://gist.github.com/Pusnow/aa865fa21f9557fa58d691a8b79f8a6d - # 모든 음절을 정준 분해(Canonical Decomposition)시킴 - # `각`을 `ㄱ+ㅏ+ㄱ`으로 저장(출력되는 값은 동일) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - # unicode category가 "Mark, Nonspacing"일 경우 pass - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - # 근데 사실상 whitespacing을 하고 ETIR가 _is_punctuation 함수를 - # 띄어쓰기만 검색하도록 만들어놔서 사실 의미없음 ㅇㅅㅇ - if never_split is not None and text in never_split: - return [text] - chars = list(text) - i, start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if self._is_punctuation(char): - # 구두점일 경우 [char}을 추가하고 새로운 단어로 시작 - output.append([char]) - start_new_word = True - else: - # 구두점이 아닐 경우 - if start_new_word: - # 새로운 단어로 시작할 경우에 빈 리스트 추가 - output.append([]) - # 해당 문자부터 시작하도록 start_new_word는 False로 setting - start_new_word = False - # 위에 추가한 빈 리스트에 각각 character를 채워넣음 - output[-1].append(char) - i += 1 - return ["".join(x) for x in output] - - # char 단위 함수들 ------------------------------------------------------ - @staticmethod - def _is_whitespace(char): - """Checks whether `chars` is a whitespace character""" - # \t, \n, \r은 technically control characters지만 - # whiteapce로 여기고 이를 처리 - if char == " " or char == '\t' or char == '\n' or char == '\r': - return True - cat = unicodedata.category(char) - if cat == 'Zs': - # unicode category가 Space Seperator면 True 반환 - return True - # 이 외의 경우 전부 False 반환 - return False - - @staticmethod - def _is_control(char): - """Checks whether `chars` is a control character""" - if char == "\t" or char == "\n" or char == "\r": - # \t, \n, \r을 우리는 whitespace로 처리함 - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - # unicode category가 - # Cc(Control) - # Cf(format) - # Co(Private Use, is 0) - # Cs(Surrrogate, is 0)일 경우, True 반환 - return True - # 이 외의 경우 전부 False 반환 - return False - - @staticmethod - def _is_punctuation(char): - """Checks whether `chars` is a punctuatoin character.""" - # 왜 때문인지 모르겠지만 ETRI에서 아래부분을 주석처리해버림 - # 구두점을 띄어쓰기만 고려? 흠... - return char == ' ' - - cp = ord(char) - # 모든 non-letter/number ASCII를 구두점으로 처리 - # "^", "$", "`"와 같은 char은 unicode에 없음 - # 그러나 이를 일관성있게 punctuation으로 처리하기 위해 아래와 같이 처리 - if ((cp >= 33 and cp <= 47) or - (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or - (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False - - -class WordpieceTokenizer: - - """Runs WordPiece tokenization""" - - def __init__(self, vocab, unk_token, max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """ - greedy longest-match-first algorithm을 사용하여 - 주어진 vocab으로 tokenization을 수행 - - 20.04.20 - - 여기에 기능 추가해야함!! -> 없는 토큰 추가 학습하도록 - - 미리 빼둬야함!! 
- """ - # text = convert_to_unicode(text) - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - # max word로 설정한 글자 수를 넘길 경우 [UNK] 처리 - output.tokens.append(self.unk_token) - continue - is_bad, start = False, 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - # 첫 번째 글자부터 천천히 vocab에 있는 단어인지 체크 - # 맨 처음에는 해당 token자체가 이미 있는지 체크! (때문에 longest) - while start < end: - substr = "".join(chars[start:end]) - # Canonical Decomposition 과정을 거쳤기 때문에 - # 이를 다시 Composition해줘야 vocab의 단어와 비교 가능 - substr = unicodedata.normalize("NFC", substr) - # - # if start > 0: - # substr = "##" + substr - if substr in self.vocab: - # 만일 해당 단어가 vocab에 있으면 해당 단어로 break - cur_substr = substr - break - end -= 1 - # 만일 어떠한 단어랑도 매칭되지 않았다면 (1)로 가서 [UNK] 처리 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - # 어미, 혹은 다른 사전에 있는 단어를 찾기 위해 start에 end값 할당 - start = end - if is_bad: # --- (1) - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -if __name__ == '__main__': - - file_path = "E:/KorBERT/1_bert_download_001_bert_morp_pytorch/001_bert_morp_pytorch" - vocab_file = file_path + '/vocab.korean_morp.list' - B = BertTokenizer(vocab_file=vocab_file, max_len=100000) - print(B.unk_token) - print(B.all_special_tokens) - print(B.max_len) - print(B.vocab[B.unk_token]) - print(B._convert_token_to_id('다/EF_')) - print(B.cls_token_id, B.sep_token_id) - print(B.cls_token) - print(B.vocab['모란/NNG_']) diff --git a/torch_bert/tokenization_morp.py b/torch_bert/tokenization_morp.py deleted file mode 100644 index 26b9f3c..0000000 --- a/torch_bert/tokenization_morp.py +++ /dev/null @@ -1,391 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# -# 형태소분석 기반 BERT를 위한 Tokenization Class -# 수정: joonho.lim -# 일자: 2019-05-23 -# -"""Tokenization classes.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections -import unicodedata -import os -import logging - -from .file_utils import cached_path - -logger = logging.getLogger(__name__) - -PRETRAINED_VOCAB_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", -} -PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { - 'bert-base-uncased': 512, - 'bert-large-uncased': 512, - 'bert-base-cased': 512, - 'bert-large-cased': 512, - 'bert-base-multilingual-uncased': 512, - 'bert-base-multilingual-cased': 512, - 'bert-base-chinese': 512, -} -VOCAB_NAME = 'vocab.txt' - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - index = 0 - with open(vocab_file, "r", encoding="utf-8") as reader: - while True: - token = reader.readline() - if not token: - break - - ### joonho.lim @ 2019-03-15 - if token.find('n_iters=') == 0 or token.find('max_length=') == 0 : - continue - token = token.split('\t')[0] - - token = token.strip() - vocab[token] = index - index += 1 - return vocab - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a peice of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class BertTokenizer(object): - """Runs end-to-end tokenization: punctuation splitting + wordpiece""" - - def __init__(self, vocab_file, do_lower_case=True, max_len=None, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): - if not os.path.isfile(vocab_file): - raise ValueError( - "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict( - [(ids, tok) for tok, ids in self.vocab.items()]) - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, - never_split=never_split) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - self.max_len = max_len if max_len is not None else int(1e12) - - def tokenize(self, text): - split_tokens = [] - for token in self.basic_tokenizer.tokenize(text): - ### joonho.lim @ 2019-03-15 - token += '_' - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - return split_tokens - - def convert_tokens_to_ids(self, tokens): - """Converts a sequence of tokens into ids using the vocab.""" - ids = [] - for token in tokens: - ids.append(self.vocab[token]) - if len(ids) > self.max_len: - raise ValueError( - "Token indices sequence length is longer than the specified maximum " - " sequence length for this BERT model ({} > {}). Running this" - " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) - ) - return ids - - def convert_ids_to_tokens(self, ids): - """Converts a sequence of ids in wordpiece tokens using the vocab.""" - tokens = [] - for i in ids: - tokens.append(self.ids_to_tokens[i]) - return tokens - - @classmethod - def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs): - """ - Instantiate a PreTrainedBertModel from a pre-trained model file. - Download and cache the pre-trained model file if needed. - """ - if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP: - vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name] - else: - vocab_file = pretrained_model_name - if os.path.isdir(vocab_file): - vocab_file = os.path.join(vocab_file, VOCAB_NAME) - # redirect to the cache, if necessary - try: - resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) - except FileNotFoundError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name, - ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - vocab_file)) - return None - if resolved_vocab_file == vocab_file: - logger.info("loading vocabulary file {}".format(vocab_file)) - else: - logger.info("loading vocabulary file {} from cache at {}".format( - vocab_file, resolved_vocab_file)) - if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: - # if we're using a pretrained model, ensure the tokenizer wont index sequences longer - # than the number of positional embeddings - max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name] - kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) - # Instantiate tokenizer. - tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) - return tokenizer - - -class BasicTokenizer(object): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, - do_lower_case=True, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): - """Constructs a BasicTokenizer. - - Args: - do_lower_case: Whether to lower case the input. 
- """ - self.do_lower_case = do_lower_case - self.never_split = never_split - - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = self._clean_text(text) - ### joonho.lim @ 2019-03-15 - # # # This was added on November 1st, 2018 for the multilingual and Chinese - # # # models. This is also applied to the English models now, but it doesn't - # # # matter since the English models were not trained on any Chinese data - # # # and generally don't have any Chinese data in them (there are Chinese - # # # characters in the vocabulary because Wikipedia does have some Chinese - # # # words in the English Wikipedia.). - # # text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case and token not in self.never_split: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - if text in self.never_split: - return [text] - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. 
- if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenization.""" - - def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """Tokenizes a piece of text into its word pieces. - - This uses a greedy longest-match-first algorithm to perform tokenization - using the given vocabulary. - - For example: - input = "unaffable" - output = ["un", "##aff", "##able"] - - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer`. - - Returns: - A list of wordpiece tokens. - """ - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - ### joonho.lim @ 2019-03-15 - # if start > 0: - # substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically contorl characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False - - -def _is_punctuation(char): - ### joonho.lim @ 2019-03-15 - return char == ' ' - - # """Checks whether `chars` is a punctuation character.""" - # cp = ord(char) - # # We treat all non-letter/number ASCII as punctuation. - # # Characters such as "^", "$", and "`" are not in the Unicode - # # Punctuation class but we treat them as punctuation anyways, for - # # consistency. 
- # if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - # (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - # return True - # cat = unicodedata.category(char) - # if cat.startswith("P"): - # return True - # return False diff --git a/torch_bert/tokenization_utils.py b/torch_bert/tokenization_utils.py deleted file mode 100644 index 7922076..0000000 --- a/torch_bert/tokenization_utils.py +++ /dev/null @@ -1,363 +0,0 @@ -# ref: https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_utils.py -from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union -import logging -from file_utils import cached_path - -logger = logging.getLogger(__name__) - -class SpecialTokenMixin: - - """Token에 관련된 행동들을 Handling""" - - SPECIAL_TOKENS_ATTRIBUTES = [ - "bos_token", - "eos_token", - "unk_token", - "sep_token", - "pad_token", - "cls_token", - "mask_token", - "additional_special_tokens", - ] - - def __init__(self, **kwargs): - self._bos_token = None - self._eos_token = None - self._unk_token = None - self._sep_token = None - self._pad_token = None - self._cls_token = None - self._mask_token = None - self._pad_token_type_id = 0 - self._additional_special_tokens = [] - - for key, value in kwargs.items(): - if key in self.SPECIAL_TOKENS_ATTRIBUTES: - if key == "additional_special_tokens": - assert isinstance(value, (list, tuple)) and \ - all(isinstance(t, str) for t in value) - # elif isinstance(value, AddedTokenFast): - # setattr(self, key, str(value)) - elif isinstance(value, str): - setattr(self, key, value) - else: - raise TypeError( - "special token {} has to be either str or AddedTokenFast but got: {}".format(key, type(value)) - ) - - @property - def bos_token(self): - """ Beginning of sentence token (string). Log an error if used while not having been set. """ - if self._bos_token is None: - logger.error("Using bos_token, but it is not set yet.") - return self._bos_token - - @property - def eos_token(self): - """ End of sentence token (string). Log an error if used while not having been set. """ - if self._eos_token is None: - logger.error("Using eos_token, but it is not set yet.") - return self._eos_token - - @property - def unk_token(self): - """ Unknown token (string). Log an error if used while not having been set. """ - if self._unk_token is None: - logger.error("Using unk_token, but it is not set yet.") - return self._unk_token - - @property - def sep_token(self): - """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ - if self._sep_token is None: - logger.error("Using sep_token, but it is not set yet.") - return self._sep_token - - @property - def pad_token(self): - """ Padding token (string). Log an error if used while not having been set. """ - if self._pad_token is None: - logger.error("Using pad_token, but it is not set yet.") - return self._pad_token - - @property - def cls_token(self): - """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ - if self._cls_token is None: - logger.error("Using cls_token, but it is not set yet.") - return self._cls_token - - @property - def mask_token(self): - """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. 
""" - if self._mask_token is None: - logger.error("Using mask_token, but it is not set yet.") - return self._mask_token - - @property - def additional_special_tokens(self): - """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """ - if self._additional_special_tokens is None: - logger.error("Using additional_special_tokens, but it is not set yet.") - return self._additional_special_tokens - - def _maybe_update_backend(self, value): - """ To be overriden by derived class if a backend tokenizer has to be updated. """ - pass - - @bos_token.setter - def bos_token(self, value): - self._bos_token = value - self._maybe_update_backend([value]) - - @eos_token.setter - def eos_token(self, value): - self._eos_token = value - self._maybe_update_backend([value]) - - @unk_token.setter - def unk_token(self, value): - self._unk_token = value - self._maybe_update_backend([value]) - - @sep_token.setter - def sep_token(self, value): - self._sep_token = value - self._maybe_update_backend([value]) - - @pad_token.setter - def pad_token(self, value): - self._pad_token = value - self._maybe_update_backend([value]) - - @cls_token.setter - def cls_token(self, value): - self._cls_token = value - self._maybe_update_backend([value]) - - @mask_token.setter - def mask_token(self, value): - self._mask_token = value - self._maybe_update_backend([value]) - - @additional_special_tokens.setter - def additional_special_tokens(self, value): - self._additional_special_tokens = value - self._maybe_update_backend(value) - - @property - def bos_token_id(self): - """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ - return self.convert_tokens_to_ids(self.bos_token) - - @property - def eos_token_id(self): - """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ - return self.convert_tokens_to_ids(self.eos_token) - - @property - def unk_token_id(self): - """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ - return self.convert_tokens_to_ids(self.unk_token) - - @property - def sep_token_id(self): - """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ - return self.convert_tokens_to_ids(self.sep_token) - - @property - def pad_token_id(self): - """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ - return self.convert_tokens_to_ids(self.pad_token) - - @property - def pad_token_type_id(self): - """ Id of the padding token type in the vocabulary.""" - return self._pad_token_type_id - - @property - def cls_token_id(self): - """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ - return self.convert_tokens_to_ids(self.cls_token) - - @property - def mask_token_id(self): - """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ - return self.convert_tokens_to_ids(self.mask_token) - - @property - def additional_special_tokens_ids(self): - """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. 
""" - return self.convert_tokens_to_ids(self.additional_special_tokens) - - @property - def special_tokens_map(self): - """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their - values ('', ''...) - """ - set_attr = {} - for attr in self.SPECIAL_TOKENS_ATTRIBUTES: - attr_value = getattr(self, "_" + attr) - if attr_value: - set_attr[attr] = attr_value - return set_attr - - @property - def all_special_tokens(self): - """ List all the special tokens ('', ''...) mapped to class attributes - (cls_token, unk_token...). - """ - all_toks = [] - set_attr = self.special_tokens_map - print(set_attr) - for attr_value in set_attr.values(): - all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) - all_toks = list(set(all_toks)) - return all_toks - - @property - def all_special_ids(self): - """ List the vocabulary indices of the special tokens ('', ''...) mapped to - class attributes (cls_token, unk_token...). - """ - all_toks = self.all_special_tokens - all_ids = self.convert_tokens_to_ids(all_toks) - return all_ids - -class PretrainedTokenizer(SpecialTokenMixin): - - vocab_files_names: Dict[str, str] = {} - pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {} - pretrained_init_configuration: Dict[str, Dict[str, Any]] = {} - max_model_input_sizes: Dict[str, int] = {} - model_input_names: List[str] = ["token_type_ids", "attention_mask"] - - padding_side: str = "right" - - NO_PAD_TOKEN_FOR_BATCH_MSG = ( - "No padding token is set for this model, therefore no batch can be made with uneven " - "sequences. Set a padding token or adjust the lengths of the sequences building the " - "batch so that every sequence is of the same length." - ) - - UNEVEN_SEQUENCES_FOR_BATCH_MSG = ( - "The sequences building the batch are not of the same size, no tensor " - "can be built. Set `pad_to_max_length=True` to pad the smaller sequences" - "up to the larger sequence's length." - ) - - def __init__(self, model_max_length=None, **kwargs): - super(PretrainedTokenizer, self).__init__(**kwargs) - - # For backward compatibility we fallback to set model_max_length from max_len if provided - model_max_length = model_max_length if model_max_length is not None else kwargs.pop("max_len", None) - self.model_max_length = model_max_length if model_max_length is not None else int(1e30) - - # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed. 
- self.padding_side = kwargs.pop("padding_side", self.padding_side) - assert self.padding_side in [ - "right", - "left", - ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}" - self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) - - # Added tokens - self.added_tokens_encoder = {} - self.unique_added_tokens_encoder = set() - self.added_tokens_decoder = {} - - # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) - self.init_inputs = () - self.init_kwargs = {} - - def __len__(self): - return self.vocab_size + len(self.added_tokens_encoder) - - @property - def vocab_size(self): - raise NotImplementedError - - @property - def max_len(self): - return self.model_max_length - - @property - def max_len_single_sentence(self): - return self.model_max_length - self.num_special_tokens_to_add(pair=False) - - @property - def max_len_sentences_pair(self): - return self.model_max_length - self.num_special_tokens_to_add(pair=True) - - @classmethod - def from_pretrained(cls, *input, **kwargs): - return cls._from_pretrained(*input, **kwargs) - - def _convert_token_to_id(self, token): - return self.vocab.get(token, self.vocab.get(self.unk_token)) - - def _convert_id_to_token(self, index): - return self.ids_to_tokens.get(index, self.unk_token) - - def convert_tokens_to_string(self, tokens): - out_string = " ".join(tokens).replace(" ##", "").strip() - return out_string - - def convert_tokens_to_ids(self, tokens): - if isinstance(tokens, str): - return self._convert_token_to_id(tokens) - ids = [self._convert_token_to_id(token) for token in tokens] - if len(ids) > self.max_len: - raise ValueError( - "Token indices sequence length is longer than the specified maximum " - " sequence length for this BERT model ({} > {}). Running this" - " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) - ) - return ids - - def convert_ids_to_tokens(self, ids): - tokens = [self._convert_id_to_token(i) for i in ids] - return tokens - - # ETRI 코드 - @classmethod - def _from_pretrained(cls, pretrained_model_name, cache_dir=None, *init_inputs, **kwargs): - """ - Instantiate a PreTrainedBertModel from a pre-trained model file. - Download and cache the pre-trained model file if needed. - """ - if pretrained_model_name in cls.pretrained_vocab_files_map: - vocab_file = cls.pretrained_vocab_files_map[pretrained_model_name] - else: - vocab_file = pretrained_model_name - if os.path.isdir(vocab_file): - vocab_file = os.path.join(vocab_file, self.vocab_file_names) - # redirect to the cache, if necessary - try: - resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) - except FileNotFoundError: - logger.error( - "Model name '{}' was not found in model name list ({}). 
" - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name, - ', '.join(cls.pretrained_vocab_files_map.keys()), - vocab_file)) - return None - if resolved_vocab_file == vocab_file: - logger.info("loading vocabulary file {}".format(vocab_file)) - else: - logger.info("loading vocabulary file {} from cache at {}".format( - vocab_file, resolved_vocab_file)) - if pretrained_model_name in self.max_model_input_sizes: - # if we're using a pretrained model, ensure the tokenizer wont index sequences longer - # than the number of positional embeddings - max_len = self.max_model_input_sizes[pretrained_model_name] - kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) - # Instantiate tokenizer. - tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) - return tokenizer - -if __name__ == '__main__': - t = SpecialTokenMixin(bos_token='[UNK]') - print(t.bos_token) diff --git a/tutorials/README.md b/tutorials/README.md new file mode 100644 index 0000000..e69de29 diff --git a/understand_wordpiece_tokenizing.ipynb b/understand_wordpiece_tokenizing.ipynb deleted file mode 100644 index 54f71ab..0000000 --- a/understand_wordpiece_tokenizing.ipynb +++ /dev/null @@ -1,579 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "BERT_MODEL_HUB\t https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1\n", - "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n" - ] - } - ], - "source": [ - "import tensorflow as tf\n", - "import tensorflow_hub as hub\n", - "\n", - "BERT_MODEL = 'uncased_L-12_H-768_A-12' #@param {type:\"string\"}\n", - "BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_' + BERT_MODEL + '/1'\n", - "\n", - "print('BERT_MODEL_HUB\\t', BERT_MODEL_HUB)\n", - "\n", - "# Vocab_file을 저장하고 directory 주소를 binary 형태로 얻는다.\n", - "with tf.Graph().as_default():\n", - " bert_module = hub.Module(BERT_MODEL_HUB)\n", - " tokenization_info = bert_module(signature='tokenization_info',\n", - " as_dict=True)\n", - " with tf.Session() as sess:\n", - " vocab_file, do_lower_case = sess.run(\n", - " [tokenization_info['vocab_file'],\n", - " tokenization_info['do_lower_case']])" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "b'C:\\\\Users\\\\jinma\\\\AppData\\\\Local\\\\Temp\\\\tfhub_modules\\\\5a395eafef2a37bd9fc55d7f6ae676d2a134a838\\\\assets\\\\vocab.txt'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vocab_file" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import collections\n", - "\n", - "# 단어 사전을 저장할 Ordereddict 객체 생성\n", - "vocab = collections.OrderedDict()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Binary text를 unicode(utf-8)로 decode하는 함수 작성\n", - "def convert_to_unicode(text):\n", - " if isinstance(text, str):\n", - " return text\n", - " elif isinstance(text, bytes):\n", - " return text.decode('utf-8', 'ignore')\n", - " else:\n", - " raise ValueError('Unsupported string type: %s' % type(text))" - ] - }, - { - "cell_type": "code", - 
"execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# vocab_file의 각 text를 unicode로 변환, vocab에 기록\n", - "index = 0\n", - "with tf.gfile.GFile(vocab_file, 'r') as reader:\n", - " while True:\n", - " token = convert_to_unicode(reader.readline())\n", - " if not token:\n", - " break\n", - " token = token.strip()\n", - " vocab[token] = index\n", - " index += 1" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2023, 19204)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vocab.get('this'), vocab.get('token')" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "30522" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(vocab)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['[PAD]',\n", - " '\"',\n", - " 'to',\n", - " 'paris',\n", - " 'tears',\n", - " 'knight',\n", - " 'peninsula',\n", - " 'licensed',\n", - " 'mouse',\n", - " 'screenplay',\n", - " 'raven',\n", - " 'tonnes',\n", - " 'princes',\n", - " 'osaka',\n", - " 'liability',\n", - " '##lip',\n", - " 'kappa',\n", - " 'hasan',\n", - " 'belts',\n", - " '##leader',\n", - " 'chunk',\n", - " 'colton',\n", - " 'artworks',\n", - " 'radiated',\n", - " 'plank',\n", - " 'fielder',\n", - " 'fide',\n", - " 'selector',\n", - " 'statehood',\n", - " 'gunners',\n", - " '##ᄌ']" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(vocab.keys())[::1000]" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# vocab의 key와 value를 바꾼 dict 객체 생성\n", - "inv_vocab = {v:k for k, v in vocab.items()}" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "do_lower_case = True" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Char 단위 함수 작성\n", - "import unicodedata\n", - "\n", - "def _is_whitespace(char):\n", - " if char == \" \" or char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n", - " # 공백 혹은 개행문자일 경우 True 반환\n", - " return True\n", - " cat = unicodedata.category(char)\n", - " if cat == 'Zs':\n", - " # unicode category가 \"Space Separator\"일 경우 True 반환\n", - " return True\n", - " return False\n", - " \n", - "def _is_control(char):\n", - " if char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n", - " # 개행문자일 경우 False 반환\n", - " return False\n", - " cat = unicodedata.category(char)\n", - " if cat in ('Cc', 'Cf'):\n", - " # unicode category가 \"Control\", 혹은 \"Format\"일 경우 True 반환\n", - " return True\n", - " return False\n", - "\n", - "def _is_punctuation(char):\n", - " cp = ord(char)\n", - " if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or\n", - " (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):\n", - " return True\n", - " cat = unicodedata.category(char)\n", - " if cat.startswith(\"P\"):\n", - " # unicode category가 P로 시작할 경우 True 반환\n", - " # Pc (Connector Punctuatoin)\n", - " # Pd (Dash Punctuation)\n", - " # Pe (Close Punctuation)\n", - " # Pf (Final Punctuatoin)\n", - " # Pi (Initial Punctuation)\n", - " # Po (Other Punctuation)\n", - " # Ps (Open Punctuation)\n", - " return True\n", - " return False" - ] - }, - { - 
"cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('\\x00', '�')" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chr(0), chr(0xfffd)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"\\n This \\t here's \\t an example of using the BERT tokenizer\"" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 예제 text를 할당한다.\n", - "text = \"\\n This \\t here's \\t an example of using the BERT tokenizer\"\n", - "text" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1. BasicTokenizer로 tokenize\n", - "\n", - "Origin Text : This here's an example of using the BERT tokenizer\n", - "Cleaned Text : This here's an example of using the BERT tokenizer\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : this\n", - "\tnormalize Token : this\n", - "\t output Token : this\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['t', 'h', 'i', 's']\n", - "\tEnd output : [['t', 'h', 'i', 's']]\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : here's\n", - "\tnormalize Token : here's\n", - "\t output Token : here's\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['h', 'e', 'r', 'e', \"'\", 's']\n", - "\tEnd output : [['h', 'e', 'r', 'e'], [\"'\"], ['s']]\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : an\n", - "\tnormalize Token : an\n", - "\t output Token : an\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['a', 'n']\n", - "\tEnd output : [['a', 'n']]\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : example\n", - "\tnormalize Token : example\n", - "\t output Token : example\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['e', 'x', 'a', 'm', 'p', 'l', 'e']\n", - "\tEnd output : [['e', 'x', 'a', 'm', 'p', 'l', 'e']]\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : of\n", - "\tnormalize Token : of\n", - "\t output Token : of\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['o', 'f']\n", - "\tEnd output : [['o', 'f']]\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : using\n", - "\tnormalize Token : using\n", - "\t output Token : using\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['u', 's', 'i', 'n', 'g']\n", - "\tEnd output : [['u', 's', 'i', 'n', 'g']]\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : the\n", - "\tnormalize Token : the\n", - "\t output Token : the\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['t', 'h', 'e']\n", - "\tEnd output : [['t', 'h', 'e']]\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : bert\n", - "\tnormalize Token : bert\n", - "\t output Token : bert\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['b', 'e', 'r', 't']\n", - "\tEnd output : [['b', 'e', 'r', 't']]\n", - " Do Lower Case... 
run strip accents.\n", - "\t origin Token : tokenizer\n", - "\tnormalize Token : tokenizer\n", - "\t output Token : tokenizer\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['t', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r']\n", - "\tEnd output : [['t', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r']]\n", - "split_tokens : ['this', 'here', \"'\", 's', 'an', 'example', 'of', 'using', 'the', 'bert', 'tokenizer']\n", - "----------------------------------------------------------------------------------------------------\n", - "Final Result : ['this', 'here', \"'\", 's', 'an', 'example', 'of', 'using', 'the', 'bert', 'tokenizer']\n" - ] - } - ], - "source": [ - "print('1. BasicTokenizer로 tokenize\\n')\n", - "text = convert_to_unicode(text)\n", - "## _clean_text(self, text):\n", - "output = []\n", - "for char in text:\n", - " cp = ord(char)\n", - " if cp == 0 or cp == 0xfffd or _is_control(char):\n", - " continue\n", - " if _is_whitespace(char): # 공백 혹은 개행문자면 \n", - " output.append(\" \")\n", - " else:\n", - " output.append(char)\n", - "print('Origin Text :', text)\n", - "text = \"\".join(output)\n", - "print('Cleaned Text :', text)\n", - "\n", - "## whitespace_tokenize(text)\n", - "text = text.strip()\n", - "orig_tokens = text.split()\n", - "split_tokens = []\n", - "for token in orig_tokens:\n", - " if do_lower_case:\n", - " print(' Do Lower Case... run strip accents.')\n", - " token = token.lower()\n", - " ## _run_strip_accents(self, text)\n", - " print('\\t origin Token :', token)\n", - " token = unicodedata.normalize(\"NFD\", token)\n", - " print('\\tnormalize Token :', token)\n", - " output = []\n", - " for char in token:\n", - " cat = unicodedata.category(char)\n", - " if cat == 'Mn':\n", - " # unicode category가 \"Nonspacing Mark\"일 경우 pass\n", - " continue\n", - " output.append(char)\n", - " token = \"\".join(output)\n", - " print('\\t output Token :', token)\n", - " ## _run_split_on_punc(self, text)\n", - " print(' \\t\\trun split on Punctuation.')\n", - " chars = list(token)\n", - " i, start_new_word, output = 0, True, []\n", - " print('\\tStart on list(token) :', chars)\n", - " while i < len(chars):\n", - " char = chars[i]\n", - " if _is_punctuation(char):\n", - " output.append([char])\n", - " start_new_word = True\n", - " else:\n", - " if start_new_word:\n", - " output.append([])\n", - " start_new_word = False\n", - " output[-1].append(char)\n", - " i += 1\n", - " print('\\tEnd output :', output)\n", - " split_tokens.extend([\"\".join(x) for x in output])\n", - "print('split_tokens :', split_tokens)\n", - "t = \" \".join(split_tokens)\n", - "t = t.strip()\n", - "output_tokens = t.split()\n", - "print('-' * 100 + '\\n' + 'Final Result :', output_tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2. 
WordpieceTokenizer로 tokenize\n", - "\n", - "token : this\n", - "0 4\t this\n", - "token : here\n", - "0 4\t here\n", - "token : '\n", - "0 1\t '\n", - "token : s\n", - "0 1\t s\n", - "token : an\n", - "0 2\t an\n", - "token : example\n", - "0 7\t example\n", - "token : of\n", - "0 2\t of\n", - "token : using\n", - "0 5\t using\n", - "token : the\n", - "0 3\t the\n", - "token : bert\n", - "0 4\t bert\n", - "token : tokenizer\n", - "0 9\t tokenizer\n", - "\t tokenize\n", - "\t tokeniz\n", - "\t tokeni\n", - "\t token\n", - "5 9\t izer\n", - "----------------------------------------------------------------------------------------------------\n", - "Final Result : ['this', 'here', \"'\", 's', 'an', 'example', 'of', 'using', 'the', 'bert', 'token', '##izer']\n" - ] - } - ], - "source": [ - "print('2. WordpieceTokenizer로 tokenize\\n')\n", - "\n", - "split_tokens = []\n", - "for token in output_tokens:\n", - " print('token :', token)\n", - " ## wordpiece tokenizing (greedy longest-match-first algorithm)\n", - " unk_token = \"[UNK]\"\n", - " max_input_chars_per_word = 200\n", - " # Start\n", - " token = convert_to_unicode(token)\n", - " output_tokens_ = []\n", - " ## whitspacing\n", - " if not token.strip():\n", - " tokens = []\n", - " else:\n", - " tokens = token.strip().split()\n", - " for token in tokens:\n", - " chars = list(token)\n", - " if len(chars) > max_input_chars_per_word:\n", - " # 200글자를 넘을 경우 UNK 처리\n", - " output_tokens_.append(unk_token)\n", - " continue\n", - " \n", - " is_bad = False\n", - " start = 0\n", - " sub_tokens = []\n", - " while start < len(chars):\n", - " end = len(chars)\n", - " print(start, end, end='')\n", - " cur_substr = None\n", - " # 첫번째 글짜부터 천천히 vocab에 있는 단어인지 체크\n", - " while start < end:\n", - " substr = \"\".join(chars[start:end])\n", - " print('\\t', substr)\n", - " if start > 0:\n", - " ## start에 end가 할당됐을 경우,\n", - " ## 이는 어미이므로 ##을 붙여서 vocab에 있는지 체크\n", - " substr = \"##\" + substr\n", - " if substr in vocab:\n", - " cur_substr = substr\n", - " break\n", - " end -= 1\n", - " # 만일 못찾았을 경우, [UNK]으로 처리\n", - " if cur_substr is None:\n", - " is_bad = True\n", - " break\n", - " sub_tokens.append(cur_substr)\n", - " # 어미를 추가하기 위해 start에 end값을 할당\n", - " start = end\n", - " if is_bad:\n", - " output_tokens_.append(unk_token)\n", - " else:\n", - " output_tokens_.extend(sub_tokens)\n", - " for sub_token in sub_tokens:\n", - " split_tokens.append(sub_token)\n", - " \n", - "print('-' * 100 + '\\n' + 'Final Result :', split_tokens)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "basic", - "language": "python", - "name": "basic" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git "a/\355\230\225\355\203\234\354\206\214 \353\266\204\354\204\235+\355\206\240\355\201\260\355\231\224+PositionalEmbedding.ipynb" "b/\355\230\225\355\203\234\354\206\214 \353\266\204\354\204\235+\355\206\240\355\201\260\355\231\224+PositionalEmbedding.ipynb" deleted file mode 100644 index cc24b41..0000000 --- "a/\355\230\225\355\203\234\354\206\214 \353\266\204\354\204\235+\355\206\240\355\201\260\355\231\224+PositionalEmbedding.ipynb" +++ /dev/null @@ -1,1597 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# BERT를 위한 형태소 분석\n", - "- google research의 tensorflow 버전(원본) 
bert\n", - "- ETRI에서 제공한 pre-trained 모델과 한국어 단어 사전 사용\n", - "- ETRI에서 제공하는 버전은 총 4개\n", - " - Pytorch + Morphology\n", - " - Tensorflow + Morphology\n", - " - Pytorch + Eojeol\n", - " - Tensorflow + Eojeol\n", - "- 이 중 `Pytorch`제외, 형태소 분석이 된 text를 input으로 받는 2번 선택\n", - "- `Morphology`는 input text에 **TTA 표준 형태소 태그셋(TTAK.KO-11.0010/R1)**에 맞는 **형태소 분석기**를 사용해야 함.\n", - "- TTA 표준 형태소 태그셋에 맞게 분석하는 형태소 분석기는 다음과 같음\n", - " - `Mecab`\n", - " - `심사숙고/NNG + 했/XSV+EP + 겠/EP + 지만/EC`와 같이 분석하는 경우가 있음\n", - " - Input이 `XSV+EP`으로 나오면 안됨. + 제거 후 `[하/동사파생접미사(XSV), 였/선어말어미(EP)]`로 분석해야함\n", - " - `ETRI 형태소 분석기`\n", - " - Web API에 접속하여 사용\n", - " - 일일 한도 제한있음\n", - " - `Khaiii`\n", - " - Kakao Hangul Analyzer III\n", - " - 속도도 빠르고 만족스러운 성능을 보임\n", - " - 윈도우 지원 안함\n", - " - `심사숙고/NNG + 하/XSV + 였/EP + 겠/EP + 지만/EC`와 같이 아주 잘 분석함\n", - "- 아래 표는 `TTAK.KO-11.0010/R1`\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
| 대분류 | 중분류 | 세분류 |
| --- | --- | --- |
| (1) 체언 | 명사 | 일반명사(NNG) |
| | | 고유명사(NNP) |
| | | 의존명사(NNB) |
| | 대명사(NP) | 대명사(NP) |
| | 수사(NR) | 수사(NR) |
| (2) 용언 | 동사(VV) | 동사(VV) |
| | 형용사(VA) | 형용사(VA) |
| | 보조용언(VX) | 보조용언(VX) |
| | 지정사(VC) | 긍정지정사(VCP) |
| | | 부정지정사(VCN) |
| (3) 수식언 | 관형사(MM) | 성상 관형사(MMA) |
| | | 지시 관형사(MMD) |
| | | 수 관형사(MMN) |
| | 부사(MA) | 일반부사(MAG) |
| | | 접속부사(MAJ) |
| (4) 독립언 | 감탄사(IC) | 감탄사(IC) |
| (5) 관계언 | 격조사(JK) | 주격조사(JKS) |
| | | 보격조사(JKC) |
| | | 관형격조사(JKG) |
| | | 목적격조사(JKO) |
| | | 부사격조사(JKB) |
| | | 호격조사(JKV) |
| | | 인용격조사(JKQ) |
| | 보조사(JX) | 보조사(JX) |
| | 접속조사(JC) | 접속조사(JC) |
| (6) 의존형태 | 어미(EM) | 선어말어미(EP) |
| | | 종결어미(EF) |
| | | 연결어미(EC) |
| | | 명사형전성어미(ETN) |
| | | 관형형전성어미(ETM) |
| | 접두사(XP) | 체언접두사(XPN) |
| | 접미사(XS) | 명사파생접미사(XSN) |
| | | 동사파생접미사(XSV) |
| | | 형용사파생접미사(XSA) |
| | 어근(XR) | 어근(XR) |
| (7) 기호 | 일반기호(ST) | 마침표, 물음표, 느낌표(SF) |
| | | 쉼표, 가운뎃점, 콜론, 빗금(SP) |
| | | 따옴표, 괄호표, 줄표(SS) |
| | | 줄임표(SE) |
| | | 붙임표(물결)(SO) |
| | | 기타 기호(SW) |
| | 외국어(SL) | 외국어(SL) |
| | 한자(SH) | 한자(SH) |
| | 숫자(SN) | 숫자(SN) |
| | 분석불능범주(NA) | 분석불능범주(NA) |
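This tag set is what the morphology-based KorBERT vocabulary is keyed on: entries in `vocab.korean_morp.list` are `형태소/태그` pieces such as `하/XSV_`, where the trailing `_` is appended by the tokenizer to each analyzed token before the vocabulary lookup. As a minimal, illustrative sketch (the helper name and the three-entry toy vocabulary are assumptions of mine, not ETRI code; the ids happen to match the vocab dump shown further down), the snippet below shows the two preprocessing steps this implies: joining the analyzer output into the space-separated `형태소/태그` string the tokenizer expects, then appending `_` before looking each token up.

```python
# Minimal sketch, not ETRI code: to_korbert_input and toy_vocab are illustrative;
# real entries come from vocab.korean_morp.list.

def to_korbert_input(analyzed: str) -> str:
    """'심사숙고/NNG + 하/XSV + 였/EP' -> '심사숙고/NNG 하/XSV 였/EP'"""
    return " ".join(part.strip() for part in analyzed.split(" + "))

toy_vocab = {"하/XSV_": 9, "었/EP_": 12, "지만/EC_": 74}  # ids as in the dump below

text = to_korbert_input("하/XSV + 었/EP + 지만/EC")
for token in text.split():
    token += "_"  # the ETRI tokenizer appends '_' to every token before lookup
    print(token, toy_vocab.get(token, "[UNK]"))
# 하/XSV_ 9
# 었/EP_ 12
# 지만/EC_ 74
```

Tokens (or word pieces) that are not found this way fall through to the greedy longest-match WordPiece loop walked through later in this notebook.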
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BERTTokenizer\n", - "- End 2 End Tokenizer\n", - "- `BasicTokenizer`와 `WordpieceTokenizer`를 연결하여 한번에 Tokenizing시킨다\n", - "- input은 위에서 언급했듯이 TTAK 기준으로 형태소 분석된 텍스트를 넣어줘야하며\n", - "- 이를 Token화 시키는 것이 해당 Tokenizer의 역할\n", - "- 사실상 하는 역할이 없다. 아래 어떻게 작동되는지 보면 안다." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## convert_single_example\n", - "- BERT 논문에 나오는 positional embedding을 실시\n", - "- 내부적으로 `BERTTokenizer`의 input으로 들어갈 수 있게 형태소 분석을 해주고\n", - "- Token화된 input을 positional embedding시켜 feature화시키고 이를 반환한다" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## file_based_convert_examples_to_features\n", - "- 들어오는 input을 `convert_single_example`함수에 넣어 positional embedding된 feature로 받고\n", - "- 이를 `tf_record`파일로 기록한다." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Gocha!\n", - "- 어떻게 Tokenizing하는지 들여다보자." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 사전 정의\n", - "- `BERTTokenizer`의 생성자를 보면 아래 항목을 argument로 받고\n", - " - vocab_file: 단어 사전이 저장된 file path\n", - " - do_lower_case: 원래는 소문자 변환을 할 것인지, 한국어에서는 정준분해를 할 것인지\n", - " - max_len: 최대 길이\n", - "- 아래 속성을 정의한다.\n", - " - 사전\n", - " - 역방향 사전\n", - " - `BasicTokenizer`\n", - " - `WordpieceTokenizer`\n", - " - 최대 길이\n", - "- 하나씩 보자." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# LIBRARIES\n", - "\n", - "import collections # OrderedDict를 위해 호출\n", - "import re # 정규표현식\n", - "import unicodedata # 한국어 정준분해 및 문자열 확인\n", - "import six # Python version 체크\n", - "import tensorflow as tf # Tensorflow 파일 불러오기 및 logging" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# ARGUMENTS\n", - "\n", - "# ETRI에서 받은 file path를 저장\n", - "path2 = '../KorBERT/2_bert_download_002_bert_morp_tensorflow/002_bert_morp_tensorflow/'\n", - "path4 = '../KorBERT/4_bert_download_004_bert_eojeol_tensorflow/004_bert_eojeol_tensorflow/'\n", - "# 한국어 vocab 사전을 등록\n", - "morph_vocab_file = path2 + 'vocab.korean_morp.list'\n", - "rawtext_vocab_file = path4 + 'vocab.korean.rawtext.list'\n", - "\n", - "do_lower_case = True # default=False, 정준분해 예시를 위해 True로 설정\n", - "max_len = None # 없으면 1e12" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# 단어 사전 호출\n", - "\n", - "def convert_to_unicode(text):\n", - " # Python version이 3.x일 때,\n", - " # type(text)이 `bytes`일 경우, utf-8로 변환\n", - " if six.PY3:\n", - " if isinstance(text, str):\n", - " return text\n", - " elif isinstance(text, bytes):\n", - " return text.decode(\"utf-8\", \"ignore\")\n", - " else:\n", - " raise ValueError(\"Unsupported string type: %s\" % (type(text)))\n", - " # Python version이 2.x일 때,\n", - " # type(text)이 `str`일 경우, utf-8로 변환\n", - " elif six.PY2:\n", - " if isinstance(text, str):\n", - " return text.decode(\"utf-8\", \"ignore\")\n", - " elif isinstance(text, unicode):\n", - " return text\n", - " else:\n", - " raise ValueError(\"Unsupported string type: %s\" % (type(text)))\n", - " # Python 3.x, 2.x만 허용!\n", - " else:\n", - " raise ValueError(\"Not running on Python2 or Python 3?\")\n", - " \n", - " \n", - "def _load_vocab(vocab_file):\n", - " # 단어 사전을 저장할 OrderedDict 객체 생성\n", - " vocab = collections.OrderedDict()\n", - " index = 0\n", - " with 
tf.io.gfile.GFile(vocab_file, 'r') as reader:\n", - " while True:\n", - " # Binary Text를 unicode(utf-8)로 decode.\n", - " token = convert_to_unicode(reader.readline())\n", - " if not token: break\n", - " if ((token.find('n_iters=') == 0) or\n", - " (token.find('max_length=') == 0)):\n", - " continue\n", - " token = token.split('\\t')[0]\n", - " token = token.strip()\n", - " # 토큰과 해당 index를 기록\n", - " vocab[token] = index\n", - " index += 1\n", - " return vocab\n", - "\n", - "# 단어 사전 호출\n", - "morph_vocab = _load_vocab(morph_vocab_file)\n", - "rawtext_vocab = _load_vocab(rawtext_vocab_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'[PAD]': 0,\n", - " '[UNK]': 1,\n", - " '[CLS]': 2,\n", - " '[SEP]': 3,\n", - " '[MASK]': 4,\n", - " '': 5,\n", - " '': 6,\n", - " './SF_': 7,\n", - " '다/EF_': 8,\n", - " '하/XSV_': 9,\n", - " 'ㄴ/ETM_': 10,\n", - " '을/JKO_': 11,\n", - " '었/EP_': 12,\n", - " '의/JKG_': 13,\n", - " '에/JKB_': 14,\n", - " '이/VCP_': 15,\n", - " '이/JKS_': 16,\n", - " ',/SP_': 17,\n", - " '는/JX_': 18,\n", - " '를/JKO_': 19,\n", - " '어/EC_': 20,\n", - " '은/JX_': 21,\n", - " '는/ETM_': 22,\n", - " '고/EC_': 23,\n", - " '가/JKS_': 24,\n", - " '\"/SS_': 25,\n", - " \"'/SS_\": 26,\n", - " '에서/JKB_': 27,\n", - " '으로/JKB_': 28,\n", - " '(/SS_': 29,\n", - " ')/SS_': 30,\n", - " '로/JKB_': 31,\n", - " '되/XSV_': 32,\n", - " '것/NNB_': 33,\n", - " '도/JX_': 34,\n", - " 'ㄹ/ETM_': 35,\n", - " '들/XSN_': 36,\n", - " '있/VX_': 37,\n", - " '있/VA_': 38,\n", - " '년/NNB_': 39,\n", - " '하/VV_': 40,\n", - " 'ㄴ다/EF_': 41,\n", - " '하/XSA_': 42,\n", - " '았/EP_': 43,\n", - " '일/NNB_': 44,\n", - " '은/ETM_': 45,\n", - " '과/JC_': 46,\n", - " '게/EC_': 47,\n", - " '지/EC_': 48,\n", - " '기/ETN_': 49,\n", - " '1/SN_': 50,\n", - " '등/NNB_': 51,\n", - " '자/XSN_': 52,\n", - " '며/EC_': 53,\n", - " '2/SN_': 54,\n", - " '수/NNB_': 55,\n", - " '와/JC_': 56,\n", - " '되/VV_': 57,\n", - " '적/XSN_': 58,\n", - " '않/VX_': 59,\n", - " '월/NNB_': 60,\n", - " '하/VX_': 61,\n", - " '아/EC_': 62,\n", - " '3/SN_': 63,\n", - " '고/JKQ_': 64,\n", - " '‘/SS_': 65,\n", - " '’/SS_': 66,\n", - " '“/SS_': 67,\n", - " '던/ETM_': 68,\n", - " '”/SS_': 69,\n", - " '없/VA_': 70,\n", - " '면/EC_': 71,\n", - " '말/NNG_': 72,\n", - " '대하/VV_': 73,\n", - " '지만/EC_': 74,\n", - " '·/SP_': 75,\n", - " '에게/JKB_': 76,\n", - " '이/NP_': 77,\n", - " '받/VV_': 78,\n", - " '까지/JX_': 79,\n", - " '이/MM_': 80,\n", - " '%/SW_': 81,\n", - " '4/SN_': 82,\n", - " '/NNG_': 83,\n", - " '과/JKB_': 84,\n", - " '만/NR_': 85,\n", - " '원/NNB_': 86,\n", - " '명/NNB_': 87,\n", - " '면서/EC_': 88,\n", - " '다는/ETM_': 89,\n", - " '그/NP_': 90,\n", - " '5/SN_': 91,\n", - " '한/MM_': 92,\n", - " '을/ETM_': 93,\n", - " '어서/EC_': 94,\n", - " '-/SS_': 95,\n", - " '다고/EC_': 96,\n", - " '위하/VV_': 97,\n", - " '만/JX_': 98,\n", - " '중/NNB_': 99}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 100개만 출력해보기\n", - "# 뒤의 품사에 `_`가 붙은 것을 잘 기억해두기\n", - "{key:value for i, (key, value) in enumerate(morph_vocab.items()) if i < 100}" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'[PAD]': 0,\n", - " '[UNK]': 1,\n", - " '[CLS]': 2,\n", - " '[SEP]': 3,\n", - " '[MASK]': 4,\n", - " '': 5,\n", - " '': 6,\n", - " '._': 7,\n", - " ',_': 8,\n", - " '_': 9,\n", - " '이_': 10,\n", - " '의_': 11,\n", - " '을_': 12,\n", - " '에_': 13,\n", - " '\"': 14,\n", - " '(': 15,\n", - " '한_': 16,\n", - " \"'\": 17,\n", - " 
'은_': 18,\n", - " ')': 19,\n", - " '이': 20,\n", - " '는_': 21,\n", - " '에서_': 22,\n", - " '고_': 23,\n", - " '했다': 24,\n", - " '가_': 25,\n", - " '로_': 26,\n", - " '지': 27,\n", - " '있다': 28,\n", - " '도_': 29,\n", - " '과_': 30,\n", - " '으로_': 31,\n", - " '를_': 32,\n", - " '다': 33,\n", - " '하는_': 34,\n", - " '사': 35,\n", - " '시': 36,\n", - " '기': 37,\n", - " '대': 38,\n", - " '고': 39,\n", - " '수': 40,\n", - " '가': 41,\n", - " '.': 42,\n", - " '수_': 43,\n", - " '전': 44,\n", - " '주': 45,\n", - " '일': 46,\n", - " '리': 47,\n", - " '자': 48,\n", - " '정': 49,\n", - " '할_': 50,\n", - " '인': 51,\n", - " '1': 52,\n", - " '아': 53,\n", - " '와_': 54,\n", - " '부': 55,\n", - " '스': 56,\n", - " '인_': 57,\n", - " '하고_': 58,\n", - " '해': 59,\n", - " '보': 60,\n", - " '유': 61,\n", - " '어': 62,\n", - " '이다': 63,\n", - " '상': 64,\n", - " '2': 65,\n", - " ')_': 66,\n", - " '신': 67,\n", - " '원': 68,\n", - " '무': 69,\n", - " '장': 70,\n", - " '3': 71,\n", - " '마': 72,\n", - " '비': 73,\n", - " '조': 74,\n", - " '동': 75,\n", - " '제': 76,\n", - " '로': 77,\n", - " '해_': 78,\n", - " '소': 79,\n", - " '성': 80,\n", - " '도': 81,\n", - " '지_': 82,\n", - " '세': 83,\n", - " '‘': 84,\n", - " '나': 85,\n", - " '오': 86,\n", - " '미': 87,\n", - " '“': 88,\n", - " '공': 89,\n", - " '하': 90,\n", - " '연': 91,\n", - " '있는_': 92,\n", - " '구': 93,\n", - " '라': 94,\n", - " '재': 95,\n", - " '한': 96,\n", - " '여': 97,\n", - " '5': 98,\n", - " '4': 99}" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 100개만 출력해보기\n", - "# 뒤의 품사에 `_`가 붙은 것을 잘 기억해두기\n", - "{key:value for i, (key, value) in enumerate(rawtext_vocab.items()) if i < 100}" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(30797, 30349)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(rawtext_vocab), len(morph_vocab)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'vocab' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# 역방향 사전 정의\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;31m# 근데 결국 사용안하드라\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0minv_vocab\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[0mv\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mk\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mvocab\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;31mNameError\u001b[0m: name 'vocab' is not defined" - ] - } - ], - "source": [ - "# 역방향 사전 정의\n", - "# 근데 결국 사용안하드라\n", - "inv_vocab = {v: k for k, v in vocab.items()}" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1000000000000" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } 
- ], - "source": [ - "# max_len을 사용자가 넣지 않았을 경우\n", - "# 1000000000000을 상한으로 함\n", - "# 사실상 무한대지 뭐.\n", - "max_len = max_len if max_len is not None else int(1e12)\n", - "max_len" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'심사숙고/NNG 하/XSV 였/EP 겠/EP 지만/EC 참으로/MAG 유감/NNG 이/JX 야/EC'" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Input Text\n", - "text = '심사숙고했겠지만 참으로 유감이야' # 예시 text 생성\n", - "# 형태소 분석을 아래와 같이 실시했다고 가정하자.\n", - "# 형태소 분석 API부분은 `convert_single_example`함수에서 다시 다룰게요.\n", - "text = '심사숙고/NNG + 하/XSV + 였/EP + 겠/EP + 지만/EC + 참으로/MAG + 유감/NNG + 이/JX + 야/EC'\n", - "# ETRI에서 정의한 대로 input을 만들어줍시다.\n", - "text = text.replace(' + ', ' ')\n", - "text" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `BERTTokenizer.tokenize()`\n", - "- 아래와 같이 동작한다.\n", - " ```python\n", - " def tokenize(self, text):\n", - " split_tokens = []\n", - " # End to End Tokenizing.\n", - " for token in self.basic_tokenizer.tokenize(text):\n", - " # ETRI Vocab 양식에 맞게 token 끝에 '_'를 붙여준다.\n", - " token += '_'\n", - " for sub_token in self.wordpiece_tokenizer.tokenize(token):\n", - " split_tokens.append(sub_token)\n", - " return split_tokens\n", - " ```\n", - "- 여기서 `BasicTokenizer`와 `WordpieceTokenizer`를 정의하지 않고 어떻게 동작하는지 그 흐름대로 살펴보겠다." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "SPLIT_TOKENS = [] # 최종적으로 return할 list 생성" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `BasicTokenizer.tokenize()`\n", - "- 아래와 같이 동작한다.\n", - " ```python\n", - " def tokenize(self, text):\n", - " text = convert_to_unicode(text) #1\n", - " text = self._clean_text(text) #2\n", - "\n", - " orig_tokens = whitespace_tokenize(text) #3\n", - " split_tokens = []\n", - " for token in orig_tokens:\n", - " if self.do_lower_case:\n", - " # 현재 input으로 '고객/NNG'와 같이 Part-of-speech가 이미\n", - " # tagging되어있고 vocab은 '고객/NNG_'로 단어를 기록하고 있음.\n", - " # 여기서 `lower` 메서드를 사용하면 뒤의 tagging이 소문자로\n", - " # 변환되어 값의 비교를 못하게 되므로 이를 주석처리.\n", - "\n", - " # token.lower()\n", - "\n", - " # 모든 음절을 정준 분해시키는 함수\n", - " token = self._run_strip_accents(token) #4\n", - " split_tokens.extend(self._run_split_on_punc(token)) #5\n", - " output_tokens = whitespace_tokenize(\" \".join(split_tokens)) #6\n", - " return output_tokens\n", - " ```\n", - "- 순서대로 보겠다." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'심사숙고/NNG 하/XSV 였/EP 겠/EP 지만/EC 참으로/MAG 유감/NNG 이/JX 야/EC'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#1. 
unicode 변환\n", - "text = convert_to_unicode(text)\n", - "text" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "## Tokenize하면서 계속 사용된 character 단위 함수 정의\n", - "def _is_control(char):\n", - " if char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n", - " # 개행문자이면 False 반환\n", - " return False\n", - " cat = unicodedata.category(char)\n", - " if cat.startswith(\"C\"):\n", - " # unicode category가\n", - " # Cc(Control)\n", - " # Cf(format)\n", - " # Co(Private Use, is 0)\n", - " # Cs(Surrrogate, is 0)일 경우, True 반환\n", - " # https://en.wikipedia.org/wiki/Control_character\n", - " return True\n", - " # 이 외의 경우 전부 False 반환\n", - " return False\n", - " \n", - "def _is_whitespace(char):\n", - " if char == \" \" or char == '\\t' or char == '\\n' or char == '\\r':\n", - " # 개행문자이거나 띄어쓰기면 True 반환\n", - " return True\n", - " cat = unicodedata.category(char)\n", - " if cat == 'Zs':\n", - " # unicode category가 Space Seperator면 True 반환\n", - " # https://www.compart.com/en/unicode/category/Zs\n", - " return True\n", - " # 이 외의 경우 전부 False 반환\n", - " return False\n", - "\n", - "def _is_punctuation(char):\n", - " # 한국어 형태소 분석기이기 때문에 공백과 같은지 여부만 반환\n", - " return char == ' '" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'심사숙고/NNG 하/XSV 였/EP 겠/EP 지만/EC 참으로/MAG 유감/NNG 이/JX 야/EC'" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#2. text cleaning\n", - "def _clean_text(text):\n", - " output = [] # char을 저장할 list 생성\n", - " for char in text:\n", - " # 텍스트에서 Char 단위로 출력\n", - " cp = ord(char)\n", - " if cp == 0 or cp == 0xfffd or _is_control(char):\n", - " # \\x00이거나 �이거나 unicode cat.이 C로 시작할 경우\n", - " # (개행문자 제외) output에 추가하지 않는다.\n", - " continue\n", - " if _is_whitespace(char):\n", - " # 공백일 경우 \" \"으로 output에 추가\n", - " output.append(\" \")\n", - " else:\n", - " # 이 외의 경우 전부 output에 추가\n", - " output.append(char)\n", - " # cleaning 작업을 거친 Text를 후처리하여 반환\n", - " return \"\".join(output)\n", - "\n", - "_clean_text(text) # 뭐가 변했을까? 안변한거 같지>?" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'심사숙고했겠지만 참으로 유감이야'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 더러운 text가 있다고 생각해봐.\n", - "dirty_text = '심사\\x00숙고했겠�지만 참으로 유감이야'\n", - "_clean_text(dirty_text) # 이렇게 바꿔준다.\n", - " # 영어 BERT에는 중국어 변환, 기타 unicode도 신경쓰는데\n", - " # ETRI에서 이렇게 수정해서 코드를 배포했으니 잘 사용하도록 하자!\n", - " # 아니 근데 생각해보니까 이거 형태소 분석 전에 실시해야하는거 아니야?\n", - " # 코드 다시 짜는거 생각해보자" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['심사숙고/NNG',\n", - " '하/XSV',\n", - " '였/EP',\n", - " '겠/EP',\n", - " '지만/EC',\n", - " '참으로/MAG',\n", - " '유감/NNG',\n", - " '이/JX',\n", - " '야/EC']" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#3. 
whitespacing(띄어쓰기로 token화)\n", - "# text 단위 공백 처리\n", - "def whitespace_tokenize(text):\n", - " \"\"\"Runs basic whitespace cleaning and splitting on a piece of text.\"\"\"\n", - " text = text.strip() # 양 사이드의 공백을 제거\n", - " if not text: # 어떠한 값도 없을 시, 빈 list를 반환\n", - " return []\n", - " tokens = text.split() # 공백 단위로 쪼갠 list를 반환\n", - " return tokens\n", - "\n", - "orig_tokens = whitespace_tokenize(text)\n", - "orig_tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'심사숙고/NNG'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "split_tokens = []\n", - "# >>> 첫 번째 for loop\n", - "token = orig_tokens[0]\n", - "token # for loop 돌리기 전에 어떻게 돌아가는지 체크\n", - "# token = '심사 숙고'" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "심사숙고/NNG >> ['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N', 'G']\n", - "print(token) == 심사숙고/NNG (사실 출력시에는 변화 X)\n" - ] - }, - { - "data": { - "text/plain": [ - "'심사숙고/NNG'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#4. 음절을 정준분해\n", - "print(token, end=' >> ')\n", - "token = unicodedata.normalize(\"NFD\", token)\n", - "print(list(token))\n", - "print('print(token) ==', token, '(사실 출력시에는 변화 X)')\n", - "# https://gist.github.com/Pusnow/aa865fa21f9557fa58d691a8b79f8a6d\n", - "# 모든 음절을 정준 분해(Canonical Decomposition)시킴\n", - "# '각'을 'ㄱ+ㅏ+ㄱ'으로 저장(출력되는 값은 동일)\n", - "output = []\n", - "for char in token:\n", - " cat = unicodedata.category(char)\n", - " if cat == \"Mn\":\n", - " # unicode category가 \"Mark, Nonspacing\"일 경우 pass\n", - " continue\n", - " output.append(char)\n", - "token = ''.join(output)\n", - "token # if문에 해당하는 char가 없었기에 원본 text를 출력\n", - " # 정준분해된 상태임을 기억해라" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N', 'G']\n" - ] - } - ], - "source": [ - "#5. 
punctuation 구분(사실상 의미가 없다)\n", - "chars = list(token)\n", - "i, start_new_word = 0, True\n", - "output = []\n", - "print(chars)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄉ\tFalse\t\t\tFalse\t\t[['ᄉ']]\n", - "ᅵ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ']]\n", - "ᆷ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ']]\n", - "ᄉ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ']]\n", - "ᅡ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ']]\n", - "ᄉ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ']]\n", - "ᅮ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ']]\n", - "ᆨ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ']]\n", - "ᄀ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ']]\n", - "ᅩ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/']]\n", - "N\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N']]\n", - "N\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N']]\n", - "G\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N', 'G']]\n" - ] - }, - { - "data": { - "text/plain": [ - "['심사숙고/NNG']" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print('char\\t_is_puntuation(char)\\tstart_new_word\\tOutput')\n", - "while i < len(chars):\n", - " char = chars[i]\n", - " print(char, end='\\t')\n", - " print(_is_punctuation(char), end='\\t\\t\\t')\n", - " if _is_punctuation(char):\n", - " print('In Here!! ')\n", - " output.append([char])\n", - " start_new_word = True\n", - " else:\n", - " if start_new_word:\n", - " output.append([])\n", - " start_new_word = False\n", - " output[-1].append(char)\n", - " print(start_new_word, end='\\t\\t')\n", - " print(output)\n", - " i += 1\n", - "[\"\".join(x) for x in output]" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "심사숙고/NNG >> ['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N', 'G']\n", - "print(token) == 심사숙고/NNG (사실 출력시에는 변화 X)\n", - "['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N', 'G']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄉ\tFalse\t\t\tFalse\t\t[['ᄉ']]\n", - "ᅵ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ']]\n", - "ᆷ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ']]\n", - "ᄉ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ']]\n", - "ᅡ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ']]\n", - "ᄉ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ']]\n", - "ᅮ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ']]\n", - "ᆨ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ']]\n", - "ᄀ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ']]\n", - "ᅩ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/']]\n", - "N\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N']]\n", - "N\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N']]\n", - "G\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N', 'G']]\n", - "하/XSV >> ['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']\n", - 
"print(token) == 하/XSV (사실 출력시에는 변화 X)\n", - "['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄒ\tFalse\t\t\tFalse\t\t[['ᄒ']]\n", - "ᅡ\tFalse\t\t\tFalse\t\t[['ᄒ', 'ᅡ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄒ', 'ᅡ', '/']]\n", - "X\tFalse\t\t\tFalse\t\t[['ᄒ', 'ᅡ', '/', 'X']]\n", - "S\tFalse\t\t\tFalse\t\t[['ᄒ', 'ᅡ', '/', 'X', 'S']]\n", - "V\tFalse\t\t\tFalse\t\t[['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']]\n", - "였/EP >> ['ᄋ', 'ᅧ', 'ᆻ', '/', 'E', 'P']\n", - "print(token) == 였/EP (사실 출력시에는 변화 X)\n", - "['ᄋ', 'ᅧ', 'ᆻ', '/', 'E', 'P']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄋ\tFalse\t\t\tFalse\t\t[['ᄋ']]\n", - "ᅧ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅧ']]\n", - "ᆻ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅧ', 'ᆻ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅧ', 'ᆻ', '/']]\n", - "E\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅧ', 'ᆻ', '/', 'E']]\n", - "P\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅧ', 'ᆻ', '/', 'E', 'P']]\n", - "겠/EP >> ['ᄀ', 'ᅦ', 'ᆻ', '/', 'E', 'P']\n", - "print(token) == 겠/EP (사실 출력시에는 변화 X)\n", - "['ᄀ', 'ᅦ', 'ᆻ', '/', 'E', 'P']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄀ\tFalse\t\t\tFalse\t\t[['ᄀ']]\n", - "ᅦ\tFalse\t\t\tFalse\t\t[['ᄀ', 'ᅦ']]\n", - "ᆻ\tFalse\t\t\tFalse\t\t[['ᄀ', 'ᅦ', 'ᆻ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄀ', 'ᅦ', 'ᆻ', '/']]\n", - "E\tFalse\t\t\tFalse\t\t[['ᄀ', 'ᅦ', 'ᆻ', '/', 'E']]\n", - "P\tFalse\t\t\tFalse\t\t[['ᄀ', 'ᅦ', 'ᆻ', '/', 'E', 'P']]\n", - "지만/EC >> ['ᄌ', 'ᅵ', 'ᄆ', 'ᅡ', 'ᆫ', '/', 'E', 'C']\n", - "print(token) == 지만/EC (사실 출력시에는 변화 X)\n", - "['ᄌ', 'ᅵ', 'ᄆ', 'ᅡ', 'ᆫ', '/', 'E', 'C']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄌ\tFalse\t\t\tFalse\t\t[['ᄌ']]\n", - "ᅵ\tFalse\t\t\tFalse\t\t[['ᄌ', 'ᅵ']]\n", - "ᄆ\tFalse\t\t\tFalse\t\t[['ᄌ', 'ᅵ', 'ᄆ']]\n", - "ᅡ\tFalse\t\t\tFalse\t\t[['ᄌ', 'ᅵ', 'ᄆ', 'ᅡ']]\n", - "ᆫ\tFalse\t\t\tFalse\t\t[['ᄌ', 'ᅵ', 'ᄆ', 'ᅡ', 'ᆫ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄌ', 'ᅵ', 'ᄆ', 'ᅡ', 'ᆫ', '/']]\n", - "E\tFalse\t\t\tFalse\t\t[['ᄌ', 'ᅵ', 'ᄆ', 'ᅡ', 'ᆫ', '/', 'E']]\n", - "C\tFalse\t\t\tFalse\t\t[['ᄌ', 'ᅵ', 'ᄆ', 'ᅡ', 'ᆫ', '/', 'E', 'C']]\n", - "참으로/MAG >> ['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ', 'ᅩ', '/', 'M', 'A', 'G']\n", - "print(token) == 참으로/MAG (사실 출력시에는 변화 X)\n", - "['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ', 'ᅩ', '/', 'M', 'A', 'G']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄎ\tFalse\t\t\tFalse\t\t[['ᄎ']]\n", - "ᅡ\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ']]\n", - "ᆷ\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ']]\n", - "ᄋ\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ']]\n", - "ᅳ\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ']]\n", - "ᄅ\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ']]\n", - "ᅩ\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ', 'ᅩ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ', 'ᅩ', '/']]\n", - "M\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ', 'ᅩ', '/', 'M']]\n", - "A\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ', 'ᅩ', '/', 'M', 'A']]\n", - "G\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ', 'ᅩ', '/', 'M', 'A', 'G']]\n", - "유감/NNG >> ['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "print(token) == 유감/NNG (사실 출력시에는 변화 X)\n", - "['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄋ\tFalse\t\t\tFalse\t\t[['ᄋ']]\n", - "ᅲ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ']]\n", - "ᄀ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ', 'ᄀ']]\n", - "ᅡ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ']]\n", - "ᆷ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ', 'ᆷ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ', 'ᆷ', '/']]\n", - 
"N\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ', 'ᆷ', '/', 'N']]\n", - "N\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ', 'ᆷ', '/', 'N', 'N']]\n", - "G\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ', 'ᆷ', '/', 'N', 'N', 'G']]\n", - "이/JX >> ['ᄋ', 'ᅵ', '/', 'J', 'X']\n", - "print(token) == 이/JX (사실 출력시에는 변화 X)\n", - "['ᄋ', 'ᅵ', '/', 'J', 'X']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄋ\tFalse\t\t\tFalse\t\t[['ᄋ']]\n", - "ᅵ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅵ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅵ', '/']]\n", - "J\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅵ', '/', 'J']]\n", - "X\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅵ', '/', 'J', 'X']]\n", - "야/EC >> ['ᄋ', 'ᅣ', '/', 'E', 'C']\n", - "print(token) == 야/EC (사실 출력시에는 변화 X)\n", - "['ᄋ', 'ᅣ', '/', 'E', 'C']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄋ\tFalse\t\t\tFalse\t\t[['ᄋ']]\n", - "ᅣ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅣ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅣ', '/']]\n", - "E\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅣ', '/', 'E']]\n", - "C\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅣ', '/', 'E', 'C']]\n", - "split_tokens: ['심사숙고/NNG', '하/XSV', '였/EP', '겠/EP', '지만/EC', '참으로/MAG', '유감/NNG', '이/JX', '야/EC']\n", - "output_tokens: ['심사숙고/NNG', '하/XSV', '였/EP', '겠/EP', '지만/EC', '참으로/MAG', '유감/NNG', '이/JX', '야/EC']\n" - ] - } - ], - "source": [ - "# for loop을 적용하면 아래와 같이 된다.\n", - "split_tokens = []\n", - "for token in orig_tokens:\n", - " #4. 음절을 정준분해\n", - " print(token, end=' >> ')\n", - " token = unicodedata.normalize(\"NFD\", token)\n", - " print(list(token))\n", - " print('print(token) ==', token, '(사실 출력시에는 변화 X)')\n", - " # https://gist.github.com/Pusnow/aa865fa21f9557fa58d691a8b79f8a6d\n", - " # 모든 음절을 정준 분해(Canonical Decomposition)시킴\n", - " # '각'을 'ㄱ+ㅏ+ㄱ'으로 저장(출력되는 값은 동일)\n", - " output = []\n", - " for char in token:\n", - " cat = unicodedata.category(char)\n", - " if cat == \"Mn\":\n", - " # unicode category가 \"Mark, Nonspacing\"일 경우 pass\n", - " continue\n", - " output.append(char)\n", - " token = ''.join(output) # if문에 해당하는 char가 없었기에 원본 text를 출력\n", - " # 정준분해된 상태임을 기억해라\n", - " #5. punctuation 구분(사실상 의미가 없다)\n", - " chars = list(token)\n", - " i, start_new_word = 0, True\n", - " output = []\n", - " print(chars)\n", - " print('char\\t_is_puntuation(char)\\tstart_new_word\\tOutput')\n", - " while i < len(chars):\n", - " char = chars[i]\n", - " print(char, end='\\t')\n", - " print(_is_punctuation(char), end='\\t\\t\\t')\n", - " if _is_punctuation(char):\n", - " print('In Here!! 
')\n", - " output.append([char])\n", - " start_new_word = True\n", - " else:\n", - " if start_new_word:\n", - " output.append([])\n", - " start_new_word = False\n", - " output[-1].append(char)\n", - " print(start_new_word, end='\\t\\t')\n", - " print(output)\n", - " i += 1\n", - " split_tokens.extend([\"\".join(x) for x in output])\n", - "print('split_tokens:', split_tokens)\n", - "output_tokens = whitespace_tokenize(\" \".join(split_tokens))\n", - "print('output_tokens:', output_tokens)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">### `BasicTokenizer` 파트 종료" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['심사숙고/NNG_',\n", - " '하/XSV_',\n", - " '였/EP_',\n", - " '겠/EP_',\n", - " '지만/EC_',\n", - " '참으로/MAG_',\n", - " '유감/NNG_',\n", - " '이/JX_',\n", - " '야/EC_']" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_tokens = [token + '_' for token in output_tokens]\n", - "output_tokens # ETRI 단어 사전에 맞게 form을 변경" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "unk_token = '[UNK]'\n", - "max_input_chars_per_word = 200" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "metadata": {}, - "outputs": [], - "source": [ - "vocab = morph_vocab" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "심사숙고/NNG_\n", - "심사숙고/NNG\n", - "심사숙고/NN\n", - "심사숙고/N\n", - "심사숙고/\n", - "심사숙고\n", - "심사숙ᄀ\n", - "심사숙\n", - "심사수\n", - "심사ᄉ\n", - "심사\n", - "심ᄉ\n", - "사숙고/NNG_\n", - "사숙고/NNG\n", - "사숙고/NN\n", - "사숙고/N\n", - "사숙고/\n", - "사숙고\n", - "사숙ᄀ\n", - "사숙\n", - "사수\n", - "사ᄉ\n", - "숙고/NNG_\n", - "숙고/NNG\n", - "숙고/NN\n", - "숙고/N\n", - "숙고/\n", - "숙고\n", - "숙ᄀ\n", - "참으로/MAG_\n", - "참으로/MAG\n", - "참으로/MA\n", - "참으로/M\n", - "참으로/\n", - "참으로\n", - "참으ᄅ\n", - "참으\n", - "참ᄋ\n", - "으로/MAG_\n", - "으로/MAG\n", - "으로/MA\n", - "으로/M\n", - "으로/\n", - "으로\n", - "으ᄅ\n", - "이/JX_\n", - "이/JX\n", - "이/J\n", - "이/\n" - ] - } - ], - "source": [ - "SPLIT_TOKENS = []\n", - "for text in output_tokens:\n", - " text = convert_to_unicode(text)\n", - " _output_tokens = []\n", - " # whitespacing 생략\n", - " chars = list(text)\n", - " if len(chars) > max_input_chars_per_word:\n", - " _output_tokens.append(unk_token)\n", - " is_bad = False\n", - " start = 0\n", - " sub_tokens = []\n", - " while start < len(chars):\n", - " end = len(chars)\n", - " cur_substr = None\n", - " while start < end:\n", - " substr = \"\".join(chars[start:end])\n", - " substr = unicodedata.normalize(\"NFC\", substr)\n", - " if substr in vocab:\n", - " cur_substr = substr\n", - " break\n", - " end -= 1\n", - " print(substr)\n", - " if cur_substr is None:\n", - " is_bad = True\n", - " break\n", - " sub_tokens.append(cur_substr)\n", - " start = end\n", - " if is_bad:\n", - " _output_tokens.append(unk_token)\n", - " else:\n", - " _output_tokens.extend(sub_tokens)\n", - " \n", - " for sub_token in _output_tokens:\n", - " SPLIT_TOKENS.append(sub_token)" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[['[CLS]' '심' '사' '숙' '고/NNG_' '하/XSV_' '였/EP_' '겠/EP_' '지만/EC_' '참' '으'\n", - " '로/MAG_' '유감/NNG_' '이' '/JX_' '야/EC_' '[SEP]']\n", - " ['2' '855' '174' '2341' '576' '9' '840' '124' '74' 
'1855' '2392' '2337'\n", - " '6770' '134' '3087' '4741' '3']]\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "print(np.vstack((tokens, [morph_vocab[token] for token in tokens])))" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [], - "source": [ - "max_seq_length = 200" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 114, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(SPLIT_TOKENS) > max_seq_length - 2" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "metadata": {}, - "outputs": [], - "source": [ - "# The convention in BERT is:\n", - "# (a) For sequence pairs:\n", - "# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n", - "# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1\n", - "# (b) For single sequences:\n", - "# tokens: [CLS] the dog is hairy . [SEP]\n", - "# type_ids: 0 0 0 0 0 0 0\n", - "#\n", - "# Where \"type_ids\" are used to indicate whether this is the first\n", - "# sequence or the second sequence. The embedding vectors for 'type=0' and\n", - "# 'type=1' were learned during pre-training and are added to the wordpiece\n", - "# embedding vector (and position vector). This is not \"strictly\" necessary\n", - "# since the [SEP] token unambigiously separates the sequences, but it makes\n", - "# if easier for the model to learn the concept of sequences.\n", - "#\n", - "# For classification tasks, the first vector (corresponding to [CLS]) is\n", - "# used as the \"sentence vector\". Note that this only makes sense because\n", - "# the entire model is fine-tuned.\n", - "\n", - "tokens = [\"[CLS]\"] + SPLIT_TOKENS + [\"[SEP]\"]\n", - "segment_ids = [0] * len(tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 118, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['[CLS]', '심', '사', '숙', '고/NNG_', '하/XSV_', '였/EP_', '겠/EP_', '지만/EC_', '참', '으', '로/MAG_', '유감/NNG_', '이', '/JX_', '야/EC_', '[SEP]']\n" - ] - } - ], - "source": [ - "print(tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]" - ] - }, - "execution_count": 117, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "segment_ids" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2, 855, 174, 2341, 576, 9, 840, 124, 74, 1855, 2392, 2337, 6770, 134, 3087, 4741, 3]\n" - ] - } - ], - "source": [ - "input_ids = [morph_vocab[token] for token in tokens]\n", - "print(input_ids)" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]" - ] - }, - "execution_count": 122, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# The mask has 1 for real tokens and 0 for padding tokens. 
Only real\n", - "# tokens are attended to.\n", - "input_mask = [1] * len(input_ids)\n", - "input_mask" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "metadata": {}, - "outputs": [], - "source": [ - "# Zero-pad up to the sequence length.\n", - "padding = [0] * (max_seq_length - len(input_ids))\n", - "input_ids += padding\n", - "input_mask += padding\n", - "segment_ids += padding" - ] - }, - { - "cell_type": "code", - "execution_count": 124, - "metadata": {}, - "outputs": [], - "source": [ - "assert len(input_ids) == max_seq_length\n", - "assert len(input_mask) == max_seq_length\n", - "assert len(segment_ids) == max_seq_length" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "basic", - "language": "python", - "name": "basic" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}
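The deleted notebook above traces KorBERT/ETRI-style morpheme tokenization one cell at a time: whitespace splitting, NFD canonical decomposition (dropping "Mn" combining marks), punctuation splitting, appending the ETRI '_' suffix, greedy longest-match WordPiece lookup (each candidate substring NFC-recomposed before the vocab check), and finally BERT-style [CLS]/[SEP] wrapping with input_ids, input_mask, segment_ids and zero padding. Below is a minimal consolidated sketch of that pipeline, not the notebook's actual code: the tiny morph_vocab is a hypothetical stand-in for the real ETRI vocabulary file, max_seq_length is shortened from 200 to 16 for readability, and _is_punctuation mirrors the behaviour observed in the notebook's printed output, where '/' is not treated as punctuation so POS-tagged morphemes such as '심사숙고/NNG' stay intact.

import unicodedata

# Hypothetical toy vocabulary standing in for the ETRI morpheme vocab
# (the real vocab file is not part of this diff); the ids were chosen to
# match the handful printed in the notebook.
morph_vocab = {
    "[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3,
    "하/XSV_": 9, "심": 855, "사": 174, "숙": 2341, "고/NNG_": 576,
}

def whitespace_tokenize(text):
    """Strip the text and split it on whitespace."""
    text = text.strip()
    return text.split() if text else []

def _is_punctuation(char):
    """Rough stand-in for the tokenizer's _is_punctuation; the notebook's
    version returns False for '/', so POS tags like '/NNG' stay attached."""
    if char == "/":
        return False
    cp = ord(char)
    if 33 <= cp <= 47 or 58 <= cp <= 64 or 91 <= cp <= 96 or 123 <= cp <= 126:
        return True
    return unicodedata.category(char).startswith("P")

def basic_tokenize(text):
    """Whitespace split -> NFD decomposition (dropping 'Mn' marks) -> punctuation split."""
    split_tokens = []
    for token in whitespace_tokenize(text):
        token = "".join(
            ch for ch in unicodedata.normalize("NFD", token)
            if unicodedata.category(ch) != "Mn"
        )
        word = []
        for ch in token:
            if _is_punctuation(ch):
                if word:
                    split_tokens.append("".join(word))
                    word = []
                split_tokens.append(ch)
            else:
                word.append(ch)
        if word:
            split_tokens.append("".join(word))
    return split_tokens

def wordpiece_tokenize(tokens, vocab, unk_token="[UNK]", max_chars_per_word=200):
    """Greedy longest-match-first lookup; substrings are NFC-recomposed before lookup."""
    out = []
    for token in tokens:
        chars = list(token)
        if len(chars) > max_chars_per_word:
            out.append(unk_token)
            continue
        start, sub_tokens, is_bad = 0, [], False
        while start < len(chars):
            end, cur_substr = len(chars), None
            while start < end:
                substr = unicodedata.normalize("NFC", "".join(chars[start:end]))
                if substr in vocab:
                    cur_substr = substr
                    break
                end -= 1
            if cur_substr is None:
                is_bad = True
                break
            sub_tokens.append(cur_substr)
            start = end
        out.extend([unk_token] if is_bad else sub_tokens)
    return out

def build_inputs(text, vocab, max_seq_length=16):
    """[CLS]/[SEP] wrapping, id lookup, attention mask, segment ids, zero padding."""
    # ETRI convention: every morpheme token carries a trailing '_'.
    pieces = wordpiece_tokenize([t + "_" for t in basic_tokenize(text)], vocab)
    tokens = ["[CLS]"] + pieces + ["[SEP]"]
    input_ids = [vocab.get(t, vocab["[UNK]"]) for t in tokens]
    input_mask = [1] * len(input_ids)      # 1 = real token, 0 = padding
    segment_ids = [0] * len(input_ids)     # single sentence -> all zeros
    padding = [0] * (max_seq_length - len(input_ids))
    return input_ids + padding, input_mask + padding, segment_ids + padding

input_ids, input_mask, segment_ids = build_inputs("심사숙고/NNG 하/XSV", morph_vocab)
print(input_ids)   # [2, 855, 174, 2341, 576, 9, 3, 0, ...]
print(input_mask)
print(segment_ids)

With this toy vocabulary the example reproduces the '심' / '사' / '숙' / '고/NNG_' split that the notebook prints for '심사숙고/NNG_': decomposition to jamo (NFD) is what makes partial syllable matches possible during the greedy search, while the NFC recomposition before each lookup keeps the vocabulary keys in their composed form.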