diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb deleted file mode 100644 index 7fec515..0000000 --- a/.ipynb_checkpoints/Untitled-checkpoint.ipynb +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cells": [], - "metadata": {}, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/BART/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/BART/.ipynb_checkpoints/Untitled-checkpoint.ipynb deleted file mode 100644 index 7fec515..0000000 --- a/BART/.ipynb_checkpoints/Untitled-checkpoint.ipynb +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cells": [], - "metadata": {}, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/BART/.ipynb_checkpoints/Untitled1-checkpoint.ipynb b/BART/.ipynb_checkpoints/Untitled1-checkpoint.ipynb deleted file mode 100644 index 7fec515..0000000 --- a/BART/.ipynb_checkpoints/Untitled1-checkpoint.ipynb +++ /dev/null @@ -1,6 +0,0 @@ -{ - "cells": [], - "metadata": {}, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/BART/Untitled.ipynb b/BART/Untitled.ipynb deleted file mode 100644 index f0dab22..0000000 --- a/BART/Untitled.ipynb +++ /dev/null @@ -1,203 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Optional\n", - "\n", - "import torch\n", - "import torch.nn as nn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class MultiHeadAttention(nn.Module):\n", - "    \n", - "    def __init__(\n", - "        self,\n", - "        d_model: int,\n", - "        num_heads: int,\n", - "        dropout: float = 0.0,\n", - "        is_decoder: bool = False,\n", - "        bias: bool = True\n", - "    ):\n", - "        super().__init__()\n", - "        self.d_model = d_model\n", - "        self.num_heads = num_heads\n", - "        self.head_dim = d_model // num_heads\n", - "        self.scaling = self.head_dim ** -0.5\n", - "        self.dropout = dropout\n", - "        self.is_decoder = is_decoder\n", - "        self.W_q = nn.Linear(d_model, d_model, bias)\n", - "        self.W_k = nn.Linear(d_model, d_model, bias)\n", - "        self.W_v = nn.Linear(d_model, d_model, bias)\n", - "        self.W_o = nn.Linear(d_model, d_model, bias)\n", - "    \n", - "    def forward(\n", - "        self, \n", - "        hidden_states: torch.Tensor, \n", - "        encoder_hidden_states: torch.Tensor = None,\n", - "        attention_mask: torch.Tensor = None,\n", - "    ):\n", - "        # cross-attention uses the encoder states as the key/value source, self-attention uses hidden_states\n", - "        kv_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states\n", - "        bsz, tgt_len, _ = hidden_states.size()\n", - "        src_len = kv_states.size(1)\n", - "        # project and split into heads: [bsz, num_heads, seq_len, head_dim]\n", - "        q = self.W_q(hidden_states).view(bsz, tgt_len, self.num_heads, self.head_dim).transpose(1, 2)\n", - "        k = self.W_k(kv_states).view(bsz, src_len, self.num_heads, self.head_dim).transpose(1, 2)\n", - "        v = self.W_v(kv_states).view(bsz, src_len, self.num_heads, self.head_dim).transpose(1, 2)\n", - "        attn_weights = torch.matmul(q, k.transpose(-1, -2)) * self.scaling\n", - "        if attention_mask is not None:\n", - "            attn_weights = attn_weights + attention_mask\n", - "        attn_probs = nn.functional.dropout(attn_weights.softmax(dim=-1), p=self.dropout, training=self.training)\n", - "        attn_output = torch.matmul(attn_probs, v).transpose(1, 2).reshape(bsz, tgt_len, self.d_model)\n", - "        return self.W_o(attn_output)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):\n", - "    \"\"\"\n", - "    Make causal mask used for bi-directional self-attention.\n", - "    \"\"\"\n", - "    bsz, tgt_len = input_ids_shape\n", - "    mask = torch.full((tgt_len, tgt_len), float(\"-inf\"))\n", - "    mask_cond = torch.arange(mask.size(-1))\n", - "    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)\n", - "    mask = mask.to(dtype)\n", - "\n", - "    if past_key_values_length > 0:\n", - "        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)\n", - "    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):\n", - "    \"\"\"\n", - "    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.\n", - "    \"\"\"\n", - "    bsz, src_len = mask.size()\n", - "    tgt_len = tgt_len if tgt_len is not None else src_len\n", - "\n", - "    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)\n", - "\n", - "    inverted_mask = 1.0 - expanded_mask\n", - "\n", - "    return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)" - ] - }, - { - "cell_type": 
"code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import BartModel, BartConfig" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "config = BartConfig()\n", - "bart = BartModel(config)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config.pad_token_id" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "attention_mask, input_ids = bart.dummy_inputs.values()" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-3.4028234663852886e+38\n" - ] - }, - { - "data": { - "text/plain": [ - "tensor([[[[ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]]],\n", - "\n", - "\n", - " [[[ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.4028e+38],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.4028e+38],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.4028e+38],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.4028e+38],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.4028e+38],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.4028e+38],\n", - " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, -3.4028e+38]]]])" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bsz, src_len = attention_mask.size()\n", - "tgt_len = 7\n", - "dtype = torch.nn.Embedding(10000, 10)(input_ids).dtype\n", - "expand_mask = attention_mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len)\n", - "inverted_mask = 1.0 - expand_mask.to(dtype)\n", - "print(torch.finfo(dtype).min)\n", - "inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "basic", - "language": "python", - "name": "basic" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/BART/Untitled1.ipynb b/BART/Untitled1.ipynb deleted file mode 100644 index 04ddbbc..0000000 --- a/BART/Untitled1.ipynb +++ /dev/null @@ -1,2310 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# BARTModel 분석" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import math\n", - "import random\n", - "import warnings\n", - "from typing import Any, Dict, Tuple, Union, Optional, List\n", - "\n", - "import numpy as np\n", - "from overrides import overrides\n", - "\n", - "import torch\n", - "import torch.nn as nn\n", - "import 
torch.utils.checkpoint\n", - "import torch.nn.functional as F\n", - "\n", - "import transformers\n", - "from transformers import BartConfig, BartTokenizer, BartModel\n", - "from transformers.models.bart.modeling_bart import BartEncoder, BartDecoder\n", - "from transformers.utils import logging\n", - "from transformers.modeling_utils import PreTrainedModel" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.__version__ == 1.7.1+cu110\n", - "4.2.1\n", - "True\n" - ] - } - ], - "source": [ - "print(f\"torch.__version__ == {torch.__version__}\")\n", - "print(transformers.__version__)\n", - "print(torch.cuda.is_available())" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "config = BartConfig()\n", - "bart = BartModel(config)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BartModel.\\_\\_init\\_\\_\n", - "- BartPretrainedModel을 상속받음" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(1, 50265)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.config.pad_token_id, bart.config.vocab_size" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Embedding(50265, 1024, padding_idx=1)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.shared # torch.nn.Embedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- `BartPretrainedModel`의 init_weight 메서드 실시, 뜯어보자\n", - "- `nn.Linear`, `nn.Embedding`의 경우 config의 std로 초기값 조정" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "class BartPretrainedModel(PreTrainedModel):\n", - " config_class = BartConfig\n", - " base_model_prefix = \"model\"\n", - "\n", - " def _init_weights(self, module):\n", - " std = self.config.init_std\n", - " if isinstance(module, nn.Linear):\n", - " module.weight.data.normal_(mean=0.0, std=std)\n", - " if module.bias is not None:\n", - " module.bias.data.zero_()\n", - " elif isinstance(module, BartSinusoidalPositionalEmbedding):\n", - " pass\n", - " elif isinstance(module, nn.Embedding):\n", - " module.weight.data.normal_(mean=0.0, std=std)\n", - " if module.padding_idx is not None:\n", - " module.weight.data[module.padding_idx].zero_()\n", - "\n", - " @property\n", - " def dummy_inputs(self):\n", - " pad_token = self.config.pad_token_id\n", - " input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)\n", - " dummy_inputs = {\n", - " \"attention_mask\": input_ids.ne(pad_token),\n", - " \"input_ids\": input_ids,\n", - " }\n", - " return dummy_inputs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">### BartEncoder\n", - ">- `BartPretrainedModel` 객체를 동일하게 상속받음\n", - ">### BartEncoder.\\_\\_init\\_\\_" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "dropout: 0.1,\n", - "layerdrop: 0.0\n", - "embed_dim: 1024,\n", - "embed_scale: 1.0,\n", - "padding_idx: 1,\n", - "max_source_positions: 1024\n" - ] - } - ], - "source": [ - "print(f\"\"\"\n", - "dropout: 
{bart.encoder.dropout},\n", - "layerdrop: {bart.config.encoder_layerdrop}\n", - "embed_dim: {bart.config.d_model},\n", - "embed_scale: {math.sqrt(bart.config.d_model) if config.scale_embedding else 1.0},\n", - "padding_idx: {bart.config.pad_token_id},\n", - "max_source_positions: {bart.config.max_position_embeddings}\n", - "\"\"\".strip())" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Embedding(50265, 1024, padding_idx=1)" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.encoder.embed_tokens # __init__에서 받아올 수 있음" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "False\n" - ] - }, - { - "data": { - "text/plain": [ - "BartLearnedPositionalEmbedding(1026, 1024, padding_idx=1)" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(bart.config.static_position_embeddings) # 21.01.06 commit으로 삭제!\n", - "bart.encoder.embed_positions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- config.static_position_embeddings에 따라 어떤 객체를 사용할지 갈림\n", - " - if True, `BartSinusoidalPositionalEmbedding`\n", - " - else: `BartLearnedPositionalEmbedding`\n", - "- config.encoder_layers의 수만큼 EncoderLayer를 쌓음" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "12" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config.encoder_layers" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "BartEncoderLayer(\n", - " (self_attn): BartAttention(\n", - " (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " )\n", - " (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", - " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", - " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", - " (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", - ")" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.encoder.layers[0] # 이 layer를 12개 쌓음" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- layernorm을 어떻게 적용할지 보자" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n" - ] - } - ], - "source": [ - "if config.normalize_embedding:\n", - " print(BartLayerNorm(embed_dim))\n", - "else:\n", - " print(nn.Identity())" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "None\n" - ] - } - ], - "source": [ - "if config.add_final_layer_norm:\n", - " print(BartLayerNorm(config.d_model))\n", - "else:\n", - " print(None)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - 
"metadata": {}, - "outputs": [], - "source": [ - "def BartLayerNorm(\n", - " normalized_shape: torch.Size, eps: float = 1e-5, elementwise_affine: bool = True\n", - "):\n", - " if torch.cuda.is_available():\n", - " try:\n", - " from apex.normalization import FusedLayerNorm\n", - "\n", - " return FusedLayerNorm(normalized_shape, eps, elementwise_affine)\n", - " except ImportError:\n", - " pass\n", - " return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)" - ] - }, - { - "cell_type": "code", - "execution_count": 171, - "metadata": {}, - "outputs": [], - "source": [ - "hidden_states= torch.randn(2, 5, 1024)" - ] - }, - { - "cell_type": "code", - "execution_count": 172, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[-0.7331, -0.4191, 0.7951, ..., 0.4873, -0.5494, -0.2944],\n", - " [-0.7643, 1.8104, -0.0323, ..., -0.4546, 0.5776, -0.7373],\n", - " [-1.1619, 1.9948, 0.4805, ..., -1.0691, -0.7803, 0.6411],\n", - " [ 0.0236, 0.1118, -0.2880, ..., -1.5818, 0.1992, -0.9446],\n", - " [ 0.3735, -1.4478, 0.8767, ..., 1.2091, -0.4567, 0.4698]],\n", - "\n", - " [[ 0.4276, -1.4758, 0.0165, ..., 1.9631, -0.1555, -1.0019],\n", - " [ 0.6768, -0.3537, 0.9676, ..., -1.3469, -0.1781, 1.4861],\n", - " [ 0.5480, -1.0024, 0.4656, ..., 0.5370, -0.4840, 0.0959],\n", - " [-1.5319, 0.8093, -0.3881, ..., -0.5653, -0.3972, 0.5072],\n", - " [-1.0625, 0.7415, -0.8247, ..., 0.1221, -0.1593, -0.0284]]],\n", - " grad_fn=)" - ] - }, - "execution_count": 172, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "nn.LayerNorm(1024)(hidden_states)" - ] - }, - { - "cell_type": "code", - "execution_count": 173, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[-0.7331, -0.4191, 0.7951, ..., 0.4873, -0.5494, -0.2944],\n", - " [-0.7643, 1.8104, -0.0323, ..., -0.4546, 0.5776, -0.7373],\n", - " [-1.1619, 1.9948, 0.4805, ..., -1.0691, -0.7803, 0.6411],\n", - " [ 0.0236, 0.1118, -0.2880, ..., -1.5819, 0.1992, -0.9446],\n", - " [ 0.3735, -1.4478, 0.8767, ..., 1.2091, -0.4567, 0.4698]],\n", - "\n", - " [[ 0.4276, -1.4758, 0.0165, ..., 1.9631, -0.1555, -1.0019],\n", - " [ 0.6768, -0.3537, 0.9676, ..., -1.3469, -0.1781, 1.4862],\n", - " [ 0.5480, -1.0024, 0.4656, ..., 0.5370, -0.4840, 0.0959],\n", - " [-1.5319, 0.8093, -0.3881, ..., -0.5653, -0.3972, 0.5072],\n", - " [-1.0625, 0.7415, -0.8247, ..., 0.1221, -0.1593, -0.0284]]])" - ] - }, - "execution_count": 173, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mean = hidden_states.mean(dim=-1)\n", - "mean = mean[:, :, None].expand((*mean.size(), 1024))\n", - "std = hidden_states.std(dim=-1, unbiased=False)\n", - "std = std[:, :, None].expand((*std.size(), 1024))\n", - "\n", - "(hidden_states - mean) / std" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">### BartEncoder.forward\n", - ">#### Ch0. forward의 input\n", - ">```python\n", - "input_ids = None,\n", - "attention_mask = None,\n", - "inputs_embeds = None,\n", - "output_attentions = None,\n", - "output_hidden_states = None,\n", - "return_dict = None,\n", - ">```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">#### Ch1. 
config으로 input setting" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "False\n", - "(False,)\n", - "True\n" - ] - } - ], - "source": [ - "print(config.output_attentions)\n", - "print((config.output_hidden_states,))\n", - "print(config.use_return_dict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">- input을 아래의 코드로 처리\n", - ">```python\n", - "\\# retrieve input_ids and inputs_embeds\n", - "if input_ids is not None and inputs_embeds is not None:\n", - " raise ValueError(\"You cannot specify both input_ids and inputs_embeds at the same time\")\n", - "elif input_ids is not None:\n", - " input_shape = input_ids.size()\n", - " input_ids = input_ids.view(-1, input_shape[-1])\n", - "elif inputs_embeds is not None:\n", - " input_shape = inputs_embeds.size()[:-1]\n", - "else:\n", - " raise ValueError(\"You have to specify either input_ids or inputs_embeds\")\n", - "if inputs_embeds is None:\n", - " inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale\n", - ">```" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "BartLearnedPositionalEmbedding(1026, 1024, padding_idx=1)" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.encoder.embed_positions" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([5, 1024])" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.encoder.embed_positions(bart.dummy_inputs['input_ids'].size()).size()" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 5])" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">#### 아래의 과정으로 처리\n", - ">- input을 embedding (주어진 경우는 그냥 넘어감)\n", - ">- input_shape으로 position vector 얻음\n", - ">- hidden_states를 input_embeds + embeds_pos로 계산\n", - ">- hidden_states를 LayerNorm해주고 Dropout 실시\n", - ">- attention_mask가 None이 아니면 아래 코드로 expand (**디코더랑 처리가 조금 다름!**)\n", - ">- output_hidden_states, output_attentions이 None이 아니면,\n", - " - () tuple을 주고 None이면 그냥 None" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]=None):\n", - " bsz, src_len = mask.size()\n", - " tgt_len = tgt_len if tgt_len is not None else src_len\n", - " expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)\n", - " inverted_mask = 1.0 - expanded_mask\n", - " return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">- 그리고 나선, BartEncoderLayer별로 아래의 연산을 수행\n", - ">```python\n", - "for encoder_layer in self.layers:\n", - " if output_hidden_states:\n", - " encoder_states = encoder_states + (hidden_states,)\n", - " # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)\n", - " dropout_probability = random.uniform(0, 1)\n", - " if self.training and (dropout_probability < self.layerdrop): # skip the layer\n", - " attn = None\n", - " else:\n", - " 
hidden_states, attn = encoder_layer(\n", - " hidden_states, \n", - " attention_mask, \n", - " output_attentions=output_attentions\n", - " )\n", - " if output_attentions:\n", - " all_attentions = all_attentions + (attn,)\n", - ">```\n", - ">- 그 다음, layer_norm이 None이 아니면 hidden_states를 layer normalization\n", - ">- output_hidden_states가 None이 아니면, encoder_states에 (hidden_states,)를 더해줌\n", - ">- return_dict가 True인지 False인지에 따라 출력 결과물이 달라짐\n", - " - `False`: \n", - " ```python \n", - " tuple(\n", - " v for v in [\n", - " hidden_states, encoder_states, all_attentions\n", - " ] if v is not None\n", - " )\n", - " ```\n", - " - `True`:\n", - " ```python\n", - " BaseModelOutput(\n", - " last_hidden_state=hidden_states, \n", - " hidden_states=encoder_states, \n", - " attentions=all_attentions\n", - " )\n", - " ```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">### BartDecoder\n", - ">- `BartPretrainedModel` 객체를 동일하게 상속받음\n", - ">### BartDecoder.\\_\\_init\\_\\_" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "dropout: 0.1\n", - "layerdrop: 0.0\n", - "embed_dim: 1024,\n", - "embed_scale: 1.0,\n", - "padding_idx: 1,\n", - "max_source_positions: 1024\n" - ] - } - ], - "source": [ - "dropout = config.dropout\n", - "layerdrop = config.decoder_layerdrop\n", - "\n", - "embed_dim = config.d_model\n", - "embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0\n", - "padding_idx = config.pad_token_id\n", - "max_source_positions = config.max_position_embeddings\n", - "\n", - "print(f\"\"\"\n", - "dropout: {dropout}\n", - "layerdrop: {layerdrop}\n", - "embed_dim: {embed_dim},\n", - "embed_scale: {embed_scale},\n", - "padding_idx: {padding_idx},\n", - "max_source_positions: {max_source_positions}\n", - "\"\"\".strip())" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 63, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Decoder 차별점\n", - "do_blenderbot_90_layernorm = config.do_blenderbot_90_layernorm # layernorm variant\n", - "do_blenderbot_90_layernorm" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [], - "source": [ - "embed_tokens: Optional[nn.Embedding] = None\n", - "\n", - "# None이면\n", - "embed_tokens = nn.Embedding(config.vocab_size, embed_dim, padding_idx)\n", - "\n", - "# None이 아니면\n", - "embed_tokens = embed_tokens" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- config.static_position_embeddings에 따라 어떤 객체를 사용할지 갈림\n", - " - if True, `BartSinusoidalPositionalEmbedding`\n", - " - else: `BartLearnedPositionalEmbedding`\n", - "- config.encoder_layers의 수만큼 EncoderLayer를 쌓음" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "12" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config.decoder_layers" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "BartDecoderLayer(\n", - " (self_attn): BartAttention(\n", - " (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (v_proj): Linear(in_features=1024, 
out_features=1024, bias=True)\n", - " (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " )\n", - " (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", - " (encoder_attn): BartAttention(\n", - " (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", - " )\n", - " (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", - " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n", - " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n", - " (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", - ")" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.decoder.layers[0] # 이 layer를 12개 쌓음" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n" - ] - } - ], - "source": [ - "if config.normalize_embedding:\n", - " print(BartLayerNorm(embed_dim)) # config.d_model과 동일\n", - "else:\n", - " print(nn.Identity())" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "None\n" - ] - } - ], - "source": [ - "if config.add_final_layer_norm:\n", - " print(BartLayerNorm(config.d_model))\n", - "else:\n", - " print(None)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">### BartDecoder.forward\n", - ">#### Ch0. 
forward의 input\n", - ">```python\n", - "input_ids=None,\n", - "attention_mask=None,\n", - "encoder_hidden_states=None,\n", - "encoder_attention_mask=None,\n", - "past_key_values=None,\n", - "inputs_embeds=None,\n", - "use_cache=None,\n", - "output_attentions=None,\n", - "output_hidden_states=None,\n", - "return_dict=None,\n", - ">```" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "False\n", - "(False,)\n", - "True\n", - "True\n" - ] - } - ], - "source": [ - "print(config.output_attentions)\n", - "print((config.output_hidden_states,))\n", - "print(config.use_cache) # 차이점\n", - "print(config.use_return_dict)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">- input을 아래의 코드로 처리\n", - ">```python\n", - "\\# retrieve input_ids and inputs_embeds\n", - "if input_ids is not None and inputs_embeds is not None:\n", - " raise ValueError(\"You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time\")\n", - "elif input_ids is not None:\n", - " input_shape = input_ids.size()\n", - " input_ids = input_ids.view(-1, input_shape[-1])\n", - "elif inputs_embeds is not None:\n", - " input_shape = inputs_embeds.size()[:-1]\n", - "else:\n", - " raise ValueError(\"You have to specify either decoder_input_ids or decoder_inputs_embeds\")\n", - "if inputs_embeds is None:\n", - " inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale\n", - ">```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Decoder 차이점" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# past_key_values_length\n", - "past_key_values = None\n", - "past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0\n", - "past_key_values_length" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Decoder 차이점" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": {}, - "outputs": [], - "source": [ - "attention_mask = bart.dummy_inputs['attention_mask']\n", - "input_shape = bart.dummy_inputs['input_ids'].size()\n", - "input_ids = bart.dummy_inputs['input_ids'].view(-1, input_shape[-1])\n", - "inputs_embeds = embed_tokens(input_ids) * embed_scale " - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [], - "source": [ - "def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):\n", - " \"\"\"\n", - " Make causal mask used for bi-directional self-attention.\n", - " \"\"\"\n", - " bsz, tgt_len = input_ids_shape\n", - " mask = torch.full((tgt_len, tgt_len), float(\"-inf\"))\n", - " mask_cond = torch.arange(mask.size(-1))\n", - " mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)\n", - " mask = mask.to(dtype)\n", - "\n", - " if past_key_values_length > 0:\n", - " mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)\n", - " return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [], - "source": [ - "# Attentoin Mask 처리\n", - "\n", - "# Create causal mask\n", - "# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, srxc_seq_len]\n", - 
"combined_attention_mask = None\n", - "if input_shape[-1] > 1: # 걍 무조건 하는거나 다름없음\n", - " combined_attention_mask = _make_causal_mask(\n", - " input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 1, 5, 5])" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "combined_attention_mask.size()" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [], - "source": [ - "# create decoder_padding_mask if not provided and needed\n", - "# 4.12.20 (PVP): Not a fan of this \"magical\" function that\n", - "# automatically creates attention_mask for padded tokens\n", - "# => this is inconsistent with other models\n", - "# => Pegasus uses the pad_token as decoder_start_token_id, so that this could\n", - "# pose some problems.\n", - "if (\n", - " attention_mask is None\n", - " and input_ids is not None\n", - " and input_shape[-1] > 1\n", - " and config.pad_token_id in input_ids\n", - "):\n", - " # should be kept for backwards compatibility\n", - " attention_mask = input_ids.ne(config.pad_token_id).to(torch.long)\n", - " # never mask leading token, even if it is pad\n", - " attention_mask[:, 0] = attention_mask[:, 1]\n", - " if past_key_values_length > 0:\n", - " attention_mask = torch.cat(\n", - " [\n", - " torch.ones(\n", - " (input_shape[0], past_key_values_length), dtype=torch.long, device=input_ids.device\n", - " ),\n", - " attention_mask,\n", - " ],\n", - " dim=-1,\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 92, - "metadata": {}, - "outputs": [], - "source": [ - "res = bart.encoder(**bart.dummy_inputs, return_dict=True, output_attentions=True, output_hidden_states=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 16, 5, 5])" - ] - }, - "execution_count": 97, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "res.attentions[-1].size()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# expand encoder attention mask\n", - "encoder_hidden_states = None\n", - "encoder_attention_mask = None\n", - "\n", - "# BartModel에서 encoder의 결과값을 Decoder에 넣어줌!\n", - "if encoder_hidden_states is not None and encoder_attention_mask is not None:\n", - " # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]\n", - " encoder_attention_mask = _expand_mask(\n", - " encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])\n", - "else:\n", - " print('지금은 None!')" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [], - "source": [ - "if attention_mask is not None and combined_attention_mask is not None:\n", - " # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]\n", - " combined_attention_mask = combined_attention_mask + _expand_mask(\n", - " attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">#### 아래의 과정으로 처리! 
(Encoder와 유사)\n", - ">- input_shape으로 position vector 얻음, past_key_values_length도 넣어줌\n", - ">\n", - ">#### Decoder에서 다른 점!\n", - ">- do_blenderbot_90_layernorm이 True가 아니라면 Encoder와 동일하게 계산\n", - " - hidden_states를 input_embeds + embeds_pos로 계산\n", - " - hidden_states를 LayerNorm해줌\n", - ">- do_blenderbot_90_layernorm가 True면\n", - " - inputs_embeds를 LayerNorm해주고 (이 결과값이 hidden_states)\n", - " - hidden_states에 embeds_pos를 더해줌\n", - ">- 이 후, Dropout\n", - ">- 그리고 나서 아래 값들에 대해 Tuple을 할당\n", - ">```python\n", - "all_hidden_states = () if output_hidden_states else None\n", - "all_self_attns = () if output_attentions else None\n", - "all_cross_attentions = () if output_attentions else None\n", - "next_decoder_cache = () if use_cache else None\n", - ">```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">- 그리고 나선, BartDecoderLayer별로 아래의 연산을 수행\n", - ">```python\n", - "for idx, decoder_layer in enumerate(self.layers): # Add idx\n", - " # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)\n", - " if output_hidden_states:\n", - " # encoder_states = encoder_states + (hidden_states,) # Encoder\n", - " all_hidden_states += (hidden_states,)\n", - " dropout_probability = random.uniform(0, 1)\n", - " if self.training and (dropout_probability < self.layerdrop): # skip the layer\n", - " # attn = None\n", - " # Encoder에선 if output_attentions: 구문을 도는데\n", - " # Decoder에선 걍 continue\n", - " continue\n", - " hidden_states, layer_self_attn, present_key_value, layer_cross_attn = decoder_layer(\n", - " hidden_states, \n", - " attention_mask=combined_attention_mask, \n", - " encoder_hidden_states=encoder_hidden_states,\n", - " encoder_attention_mask=encoder_attention_mask,\n", - " past_key_value=past_key_value,\n", - " output_attentions=output_attentions,\n", - " )\n", - " # Decoder에서 추가된 부분\n", - " if use_cache:\n", - " next_decoder_cache += (present_key_value,)\n", - " if output_attentions:\n", - " # all_attentions = all_attentions + (attn,)\n", - " all_self_attns += (layer_self_attn,)\n", - " all_cross_attentions += (layer_cross_attn,)\n", - "if output_hidden_states: # add hidden states from the last decoder layer\n", - " all_hidden_states += (hidden_states,)\n", - ">```\n", - ">- output_hidden_states가 None이 아니면, all_hidden_states에 (hidden_states,)를 더해줌\n", - " - encoder_states였었음\n", - ">- 그 다음, layer_norm이 None이 아니면 hidden_states를 layer normalization\n", - ">- use_cache가 True면 next_decoder_cache를, 아니면 None을 next_cache에 할당\n", - ">- return_dict가 True인지 False인지에 따라 출력 결과물이 달라짐\n", - " - `False`: \n", - " ```python \n", - " tuple(\n", - " v for v in [\n", - " hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions\n", - " ] if v is not None\n", - " )\n", - " ```\n", - " - `True`:\n", - " ```python\n", - " BaseModelOutputWithPastAndCrossAttentions(\n", - " last_hidden_state=hidden_states,\n", - " past_key_values=next_cache,\n", - " hidden_states=all_hidden_states,\n", - " attentions=all_self_attns,\n", - " cross_attentions=all_cross_attentions,\n", - " )\n", - " ```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BartModel.forward" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ch0. 
forward의 input" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```python\n", - "input_ids = None\n", - "attention_mask = None\n", - "decoder_input_ids = None\n", - "decoder_attention_mask = None\n", - "encoder_outputs = None\n", - "past_key_values = None\n", - "inputs_embeds = None\n", - "decoder_inputs_embeds = None\n", - "use_cache = None\n", - "output_attentions = None\n", - "output_hidden_states = None\n", - "return_dict = None\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 1. config으로 input setting" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1\n", - "False\n", - "False\n", - "True\n", - "True\n" - ] - } - ], - "source": [ - "p(config.pad_token_id)\n", - "p(config.output_attentions)\n", - "p(config.output_hidden_states)\n", - "p(config.use_cache)\n", - "p(config.use_return_dict)" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [], - "source": [ - "input_ids = bart.dummy_inputs['input_ids']" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 5, 1024])" - ] - }, - "execution_count": 113, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.encoder(input_ids).last_hidden_state.size()" - ] - }, - { - "cell_type": "code", - "execution_count": 125, - "metadata": {}, - "outputs": [], - "source": [ - "decoder_output = bart.decoder(\n", - " input_ids, encoder_hidden_states=bart.encoder(input_ids).last_hidden_state)" - ] - }, - { - "cell_type": "code", - "execution_count": 126, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "odict_keys(['last_hidden_state', 'past_key_values'])" - ] - }, - "execution_count": 126, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "decoder_output.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 127, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 5, 1024])" - ] - }, - "execution_count": 127, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "decoder_output.last_hidden_state.size()" - ] - }, - { - "cell_type": "code", - "execution_count": 143, - "metadata": {}, - "outputs": [], - "source": [ - "reconstruction = torch.randn(32, 10, 1024)\n", - "reconstruction = torch.softmax(reconstruction, dim=-1)\n", - "\n", - "clean = torch.LongTensor(32, 10).random_(10000)" - ] - }, - { - "cell_type": "code", - "execution_count": 145, - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Expected target size (32, 1024), got torch.Size([32, 10])", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mnn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mCrossEntropyLoss\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreconstruction\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mclean\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\basic\\lib\\site-packages\\torch\\nn\\modules\\module.py\u001b[0m in 
\u001b[0;36m_call_impl\u001b[1;34m(self, *input, **kwargs)\u001b[0m\n\u001b[0;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0minput\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 726\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 727\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0minput\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[0;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\basic\\lib\\site-packages\\torch\\nn\\modules\\loss.py\u001b[0m in \u001b[0;36mforward\u001b[1;34m(self, input, target)\u001b[0m\n\u001b[0;32m 960\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minput\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mTensor\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mTensor\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m->\u001b[0m \u001b[0mTensor\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 961\u001b[0m return F.cross_entropy(input, target, weight=self.weight,\n\u001b[1;32m--> 962\u001b[1;33m ignore_index=self.ignore_index, reduction=self.reduction)\n\u001b[0m\u001b[0;32m 963\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 964\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\basic\\lib\\site-packages\\torch\\nn\\functional.py\u001b[0m in \u001b[0;36mcross_entropy\u001b[1;34m(input, target, weight, size_average, ignore_index, reduce, reduction)\u001b[0m\n\u001b[0;32m 2466\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0msize_average\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mreduce\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2467\u001b[0m \u001b[0mreduction\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_Reduction\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlegacy_get_string\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msize_average\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreduce\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2468\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mnll_loss\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mlog_softmax\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minput\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mweight\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mignore_index\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreduction\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2469\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2470\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\AppData\\Local\\Continuum\\anaconda3\\envs\\basic\\lib\\site-packages\\torch\\nn\\functional.py\u001b[0m in \u001b[0;36mnll_loss\u001b[1;34m(input, target, weight, size_average, ignore_index, reduce, reduction)\u001b[0m\n\u001b[0;32m 2272\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0minput\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2273\u001b[0m raise ValueError('Expected target size {}, got {}'.format(\n\u001b[1;32m-> 2274\u001b[1;33m out_size, target.size()))\n\u001b[0m\u001b[0;32m 2275\u001b[0m \u001b[0minput\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minput\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcontiguous\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2276\u001b[0m \u001b[0mtarget\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtarget\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcontiguous\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mValueError\u001b[0m: Expected target size (32, 1024), got torch.Size([32, 10])" - ] - } - ], - "source": [ - "nn.CrossEntropyLoss()(reconstruction, clean)" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{torch.Size([2, 16, 5, 64])}" - ] - }, - "execution_count": 130, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "set([v.size() for pkv in decoder_output.past_key_values for v in pkv])" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [], - "source": [ - "def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int):\n", - " \"\"\"\n", - " Shift input ids one token to the right, and wrap the last non pad token (usually ).\n", - " \"\"\"\n", - " prev_output_tokens = input_ids.clone()\n", - "\n", - " assert pad_token_id is not None, \"self.model.config.pad_token_id has to be defined.\"\n", - " # replace possible -100 values in labels by `pad_token_id`\n", - " prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id)\n", - "\n", - " index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)\n", - " decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze()\n", - " prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone()\n", - " prev_output_tokens[:, 0] = decoder_start_tokens\n", - "\n", - " return prev_output_tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[ 0, 6, 10, 4, 2],\n", - " [ 0, 8, 12, 2, 1]])" - ] - }, - "execution_count": 104, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "input_ids" 
- ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import BartTokenizer\n", - "\n", - "tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "2" - ] - }, - "execution_count": 109, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.eos_token_id" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[ 2, 0, 6, 10, 4],\n", - " [ 2, 0, 8, 12, 2]])" - ] - }, - "execution_count": 105, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "shift_tokens_right(input_ids, 1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ch2. Encoder\n", - "#### Ch2.번외 Enc output -> BaseModelOutput setting\n", - "\n", - "- return_dict가 True이고\n", - "- encoder_outputs이 BaseModelOutput 객체가 아니면\n", - "- 아래 코드로 형변환시켜줌\n", - "\n", - "```python\n", - "encoder_outputs = BaseModelOutput(\n", - " last_hidden_state=encoder_outputs[0],\n", - " hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,\n", - " attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,\n", - ")\n", - "\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ch3. Decoder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Ch4. 최종 output\n", - "- return_dict가 False일 경우엔\n", - " - decoder_outputs + encoder_outputs를 출력\n", - "- 그 외의 경우엔\n", - " - Seq2SeqModelOutput에 결과값을 입력 후 출력" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [], - "source": [ - "output = bart(**bart.dummy_inputs)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "odict_keys(['last_hidden_state', 'past_key_values', 'encoder_last_hidden_state'])" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 5, 1024])" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output.last_hidden_state.size()" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{torch.Size([2, 16, 5, 64])}" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "set([v.size() for pkv in output.past_key_values for v in pkv])" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(1024, 1024, 16, 16)" - ] - }, - "execution_count": 79, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "16 * 64, config.d_model, config.encoder_attention_heads, config.decoder_attention_heads" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "torch.Size([2, 5, 1024])" - ] - }, - "execution_count": 74, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output.encoder_last_hidden_state.size()" - ] - }, 
- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Other Methods" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class BartModel(BartPretrainedModel):\n", - " \n", - " def __init__(self, config: BartConfig):\n", - " pass\n", - " \n", - " @overrides\n", - " def get_input_embeddings(self):\n", - " return self.shared\n", - " \n", - " @overrides\n", - " def set_input_embeddings(self, value):\n", - " self.shared = value\n", - " self.encoder.embed_tokens = self.shared\n", - " self.decoder.embed_tokens = self.shared\n", - " \n", - " def get_encoder(self):\n", - " return self.encoder\n", - " \n", - " def get_decoder(self):\n", - " return self.decoder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 번외" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### PreTrainedModel 분석" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers.configuration_utils import PretrainedConfig\n", - "\n", - "# file_utils.py\n", - "DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]\n", - "DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]\n", - "\n", - "pt_config = PretrainedConfig()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers.modeling_utils import ModuleUtilsMixin\n", - "from transformers.generation_utils import GenerationMixin # Beam Search 파보쟈" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(False, True)" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pt_config.is_encoder_decoder, pt_config.tie_word_embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(True, True)" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config.is_encoder_decoder, config.tie_word_embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "({}, {})" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "config.pruned_heads, pt_config.pruned_heads" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": {}, - "outputs": [], - "source": [ - "class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):\n", - " config_class = None\n", - " base_model_prefix = \"\"\n", - " _keys_to_ignore_on_load_missing = None\n", - " _keys_to_ignore_on_load_unexpected = None\n", - " _keys_to_ignore_on_save = None\n", - " \n", - " @property\n", - " def dummy_inputs(self) -> Dict[str, torch.Tensor]:\n", - " return {'input_ids': torch.tensor(DUMMY_INPUTS)}\n", - " \n", - " def __init__(self, config: PretrainedConfig, *inputs, **kwargs):\n", - " super().__init__()\n", - " if not isinstance(config, PretrainedConfig):\n", - " raise ValueError(\n", - " \"Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. 
\"\n", - " \"To create a model from a pretrained model use \"\n", - " \"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`\".format(\n", - " self.__class__.__name__, self.__class__.__name__\n", - " )\n", - " )\n", - " # Save config and origin of the pretrained weights if given in model\n", - " self.config = config\n", - " self.name_or_path = config.name_or_path\n", - " \n", - " @property\n", - " def base_model(self) -> nn.Module:\n", - " return getattr(self, self.base_model_prefix, self)\n", - " \n", - " def get_input_embeddings(self) -> nn.Module:\n", - " base_model = getattr(self, self.base_model_prefix, self)\n", - " if base_model is not self:\n", - " return base_model.get_input_embeddings()\n", - " else:\n", - " raise NotImplementedError\n", - " \n", - " def set_input_embeddings(self, value: nn.Module):\n", - " base_model = getattr(self, self.base_model_prefix, self)\n", - " if base_model is not self:\n", - " base_model.set_input_embeddings(value)\n", - " else:\n", - " raise NotImplementedError\n", - " \n", - " def get_output_embeddings(self) -> nn.Module:\n", - " return None # Overwrite for models with output embeddings\n", - " \n", - " def tie_weights(self):\n", - " output_embeddings = self.get_output_embeddings()\n", - " if output_embeddings is not None and self.config.tie_word_embeddings:\n", - " self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())\n", - "\n", - " if self.config.is_encoder_decoder and self.config.tie_encoder_decoder:\n", - " if hasattr(self, self.base_model_prefix):\n", - " self = getattr(self, self.base_model_prefix)\n", - " self._tie_encoder_decoder_weights(\n", - " self.encoder, self.decoder, self.base_model_prefix)\n", - " \n", - " @staticmethod\n", - " def _tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, base_model_prefix: str):\n", - " uninitialized_encoder_weights: List[str] = []\n", - " \"\"\"\n", - " 1. encoder, decoder class가 같은지 체크!\n", - " >> In this case make sure that all encoder weights are correctly initialized.\n", - " 2. 
weights를 recursively하게 tie\n", - " >> tie_encoder_to_decoder_recursively 함수는 내부에서 구현되어있음\n", - " \"\"\"\n", - " tie_encoder_to_decoder_recursively(\n", - " decoder, encoder, base_model_prefix, uninitialized_encoder_weights)\n", - " \n", - " def _tie_or_clone_weights(self, output_embeddings, input_embeddings):\n", - " \"\"\"\n", - " Tie or clone module weights depending of whether we are using\n", - " TorchScript or not\n", - " \"\"\"\n", - " if self.config.torchscript:\n", - " output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone())\n", - " else:\n", - " output_embeddings.weight = input_embeddings.weight\n", - "\n", - " if getattr(output_embeddings, \"bias\", None) is not None:\n", - " output_embeddings.bias.data = torch.nn.functional.pad(\n", - " output_embeddings.bias.data,\n", - " (\n", - " 0,\n", - " output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],\n", - " ),\n", - " \"constant\",\n", - " 0,\n", - " )\n", - " if hasattr(output_embeddings, \"out_features\") and hasattr(input_embeddings, \"num_embeddings\"):\n", - " output_embeddings.out_features = input_embeddings.num_embeddings\n", - " \n", - " def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> torch.nn.Embedding:\n", - " pass\n", - " \n", - " def _resize_token_embeddings(self, new_num_tokens):\n", - " pass\n", - " \n", - " def _get_resized_embeddings(\n", - " self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None\n", - " ) -> torch.nn.Embedding:\n", - " pass\n", - " \n", - " def _get_resized_lm_head(\n", - " self, old_lm_head: torch.nn.Linear, new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False\n", - " ) -> torch.nn.Linear:\n", - " pass\n", - " \n", - " def init_weights(self):\n", - " \"\"\"\n", - " Initializes and prunes weights if needed.\n", - " \"\"\"\n", - " # Initialize weights\n", - " self.apply(self._init_weights)\n", - "\n", - " # Prune heads if needed\n", - " if self.config.pruned_heads:\n", - " self.prune_heads(self.config.pruned_heads)\n", - "\n", - " # Tie weights if needed\n", - " self.tie_weights()\n", - " \n", - " def prune_heads(self, heads_to_prune: Dict[int, List[int]]):\n", - " \"\"\"\n", - " Prunes heads of the base model.\n", - " Arguments:\n", - " heads_to_prune (:obj:`Dict[int, List[int]]`):\n", - " Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list of\n", - " heads to prune in said layer (list of :obj:`int`). 
For instance {1: [0, 2], 2: [2, 3]} will prune heads\n", - " 0 and 2 on layer 1 and heads 2 and 3 on layer 2.\n", - " \"\"\"\n", - " # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads\n", - " for layer, heads in heads_to_prune.items():\n", - " union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads)\n", - " self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON\n", - "\n", - " self.base_model._prune_heads(heads_to_prune)\n", - " \n", - " def save_pretrained(self, save_directory: Union[str, os.PathLike]):\n", - " pass\n", - " \n", - " @classmethod\n", - " def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):\n", - " pass" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [], - "source": [ - "from transformers.utils import logging" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": {}, - "outputs": [], - "source": [ - "logger = logging.get_logger(__name__)\n", - "\n", - "logger.info(\n", - " f\"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized.\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "bart.encoder.__class__ != bart.decoder.__class__" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### nn.Module의 apply 메서드" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import TypeVar, Callable\n", - "\n", - "T = TypeVar('T', bound='Module')\n", - "\n", - "def apply(self: T, fn: Callable[['Module'], None]) -> T:\n", - " for module in self.children():\n", - " module.apply(fn)\n", - " fn(self)\n", - " return self" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Encoder, Decoder의 embed_positions\n", - "- forward의 인자가 tensor가 아니라 torch.Size!" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "class BartSinusoidalPositionalEmbedding(nn.Embedding):\n", - " \"\"\"This module produces sinusoidal positional embeddings of any length.\"\"\"\n", - "\n", - " def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):\n", - " super().__init__(num_positions, embedding_dim)\n", - " self.weight = self._init_weight(self.weight)\n", - "\n", - " @staticmethod\n", - " def _init_weight(out: nn.Parameter):\n", - " \"\"\"\n", - " Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. \n", - " The cos features are in the 2nd half of the vector. 
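
To see the traversal order that `init_weights` relies on, here is a tiny check of `nn.Module.apply` (toy modules, nothing BART-specific): children are visited first, then `fn` is called on the module itself, so every submodule is guaranteed to be initialized.

```python
import torch.nn as nn

visited = []

def record(module: nn.Module) -> None:
    visited.append(module.__class__.__name__)

model = nn.Sequential(nn.Linear(4, 4), nn.ReLU())
model.apply(record)
print(visited)  # ['Linear', 'ReLU', 'Sequential'] -- children first, then the container
```
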
[dim // 2:]\n", - " \"\"\"\n", - " n_pos, dim = out.shape\n", - " position_enc = np.array(\n", - " [\n", - " [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]\n", - " for pos in range(n_pos)\n", - " ]\n", - " )\n", - " out.requires_grad = False # set early to avoid an error in pytorch-1.8+\n", - " sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1\n", - " out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))\n", - " out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))\n", - " out.detach_()\n", - " return out\n", - "\n", - " @torch.no_grad()\n", - " def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):\n", - " \"\"\"`input_ids_shape` is expected to be [bsz x seqlen].\"\"\"\n", - " bsz, seq_len = input_ids_shape[:2]\n", - " positions = torch.arange(\n", - " past_key_values_length,\n", - " past_key_values_length + seq_len, \n", - " dtype=torch.long, \n", - " device=self.weight.device\n", - " )\n", - " return super().forward(positions)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "$$\\cfrac{pos}{10000^{\\cfrac{2}{d_{model}}}}$$" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [], - "source": [ - "class BartLearnedPositionalEmbedding(nn.Embedding):\n", - " \"\"\"\n", - " This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting\n", - " based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to\n", - " the forward function.\n", - " \"\"\"\n", - "\n", - " def __init__(\n", - " self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset: int\n", - " ):\n", - " # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2\n", - " # and adjust num_embeddings appropriately. 
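
For reference, the angle computed in `_init_weight` above, `pos / np.power(10000, 2 * (j // 2) / dim)`, is the standard sinusoidal positional encoding; the display formula in the notebook drops the feature index, so the full form is:

$$
PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{model}}}\right),
\qquad
PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{model}}}\right)
$$

The only difference in the BART variant above is that the sine features fill the first half of the vector and the cosine features the second half (the `sentinel` split), instead of being interleaved.
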
Other models dont have this hack\n", - " self.offset = offset\n", - " assert padding_idx is not None, \"`padding_idx` should not be None, but of type int\"\n", - " num_embeddings += offset\n", - " super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx)\n", - "\n", - " def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):\n", - " \"\"\"`input_ids_shape` is expected to be [bsz x seqlen].\"\"\"\n", - " bsz, seq_len = input_ids_shape[:2]\n", - " positions = torch.arange(\n", - " past_key_values_length, \n", - " past_key_values_length + seq_len, \n", - " dtype=torch.long, \n", - " device=self.weight.device\n", - " )\n", - " return super().forward(positions + self.offset)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### BartEncoderLayer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class BartEncoderLayer(nn.Module):\n", - " def __init__(self, config: BartConfig):\n", - " super().__init__()\n", - " self.embed_dim = config.d_model\n", - " self.self_attn = BartAttention(\n", - " embed_dim=self.embed_dim,\n", - " num_heads=config.encoder_attention_heads,\n", - " dropout=config.attention_dropout,\n", - " )\n", - " self.normalize_before = config.normalize_before\n", - " self.self_attn_layer_norm = BartLayerNorm(self.embed_dim)\n", - " self.dropout = config.dropout\n", - " self.activation_fn = ACT2FN[config.activation_function]\n", - " self.activation_dropout = config.activation_dropout\n", - " self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)\n", - " self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)\n", - " self.final_layer_norm = BartLayerNorm(self.embed_dim)\n", - "\n", - " def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False):\n", - " \"\"\"\n", - " Args:\n", - " hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`\n", - " attention_mask (:obj:`torch.FloatTensor`): attention mask of size\n", - " `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.\n", - " output_attentions (:obj:`bool`): Whether the base model outputs attentions. 
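
A small sketch of the `offset` hack (made-up sizes, not checkpoint values): the learned position table is allocated with `offset` extra rows, and every position id is shifted by `offset=2` before the lookup, so the first two rows stay reserved for the padding convention.

```python
import torch
import torch.nn as nn

max_positions, d_model, offset = 1024, 16, 2

# The table gets `offset` extra rows (num_embeddings += offset above).
pos_table = nn.Embedding(max_positions + offset, d_model)

seq_len, past_key_values_length = 5, 0
positions = torch.arange(past_key_values_length, past_key_values_length + seq_len, dtype=torch.long)

# Position ids are shifted before the lookup, so rows 0 and 1 are never used for real positions.
pos_embeds = pos_table(positions + offset)
print(positions + offset)   # tensor([2, 3, 4, 5, 6])
print(pos_embeds.shape)     # torch.Size([5, 16])
```
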
This requires the attentions tensor to be reshaped in this function.\n", - " \"\"\"\n", - " residual = hidden_states\n", - " if self.normalize_before:\n", - " hidden_states = self.self_attn_layer_norm(hidden_states)\n", - " hidden_states, attn_weights, _ = self.self_attn(\n", - " hidden_states=hidden_states, attention_mask=attention_mask, output_attentions=output_attentions\n", - " )\n", - " hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)\n", - " hidden_states = residual + hidden_states\n", - " if not self.normalize_before:\n", - " hidden_states = self.self_attn_layer_norm(hidden_states)\n", - "\n", - " residual = hidden_states\n", - " if self.normalize_before:\n", - " hidden_states = self.final_layer_norm(hidden_states)\n", - " hidden_states = self.activation_fn(self.fc1(hidden_states))\n", - " hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)\n", - " hidden_states = self.fc2(hidden_states)\n", - " hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)\n", - " hidden_states = residual + hidden_states\n", - " if not self.normalize_before:\n", - " hidden_states = self.final_layer_norm(hidden_states)\n", - " if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():\n", - " clamp_value = torch.finfo(hidden_states.dtype).max - 1000\n", - " hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)\n", - " return hidden_states, attn_weights" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### BartDecoderLayer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class BartDecoderLayer(nn.Module):\n", - " def __init__(self, config: BartConfig):\n", - " super().__init__()\n", - " self.embed_dim = config.d_model\n", - "\n", - " self.self_attn = BartAttention(\n", - " embed_dim=self.embed_dim,\n", - " num_heads=config.decoder_attention_heads,\n", - " dropout=config.attention_dropout,\n", - " is_decoder=True,\n", - " )\n", - " self.dropout = config.dropout\n", - " self.activation_fn = ACT2FN[config.activation_function]\n", - " self.activation_dropout = config.activation_dropout\n", - " self.normalize_before = config.normalize_before\n", - "\n", - " self.self_attn_layer_norm = BartLayerNorm(self.embed_dim)\n", - " self.encoder_attn = BartAttention(\n", - " self.embed_dim,\n", - " config.decoder_attention_heads,\n", - " dropout=config.attention_dropout,\n", - " is_decoder=True,\n", - " )\n", - " self.encoder_attn_layer_norm = BartLayerNorm(self.embed_dim)\n", - " self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)\n", - " self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)\n", - " self.final_layer_norm = BartLayerNorm(self.embed_dim)\n", - "\n", - " def forward(\n", - " self,\n", - " hidden_states: torch.Tensor,\n", - " attention_mask: Optional[torch.Tensor] = None,\n", - " encoder_hidden_states: Optional[torch.Tensor] = None,\n", - " encoder_attention_mask: Optional[torch.Tensor] = None,\n", - " past_key_value: Optional[Tuple[torch.Tensor]] = None,\n", - " output_attentions: Optional[torch.Tensor] = False,\n", - " ):\n", - " \"\"\"\n", - " Args:\n", - " hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`\n", - " attention_mask (:obj:`torch.FloatTensor`): attention mask of size\n", - " `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.\n", - " encoder_hidden_states 
(:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)`\n", - " encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size\n", - " `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.\n", - " past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states\n", - " output_attentions (:obj:`bool`): Whether the base model outputs attentions. This requires the attentions tensor to be reshaped in this function.\n", - " \"\"\"\n", - " residual = hidden_states\n", - " if self.normalize_before:\n", - " hidden_states = self.self_attn_layer_norm(hidden_states)\n", - "\n", - " # Self Attention\n", - " # decoder uni-directional self-attention cached key/values tuple is at positions 1,2\n", - " self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None\n", - " # add present self-attn cache to positions 1,2 of present_key_value tuple\n", - " hidden_states, self_attn_weights, present_key_value = self.self_attn(\n", - " hidden_states=hidden_states,\n", - " past_key_value=self_attn_past_key_value,\n", - " attention_mask=attention_mask,\n", - " output_attentions=output_attentions,\n", - " )\n", - " hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)\n", - " hidden_states = residual + hidden_states\n", - " if not self.normalize_before:\n", - " hidden_states = self.self_attn_layer_norm(hidden_states)\n", - "\n", - " # Cross-Attention Block\n", - " cross_attn_present_key_value = None\n", - " cross_attn_weights = None\n", - " if encoder_hidden_states is not None:\n", - " residual = hidden_states\n", - " if self.normalize_before:\n", - " hidden_states = self.encoder_attn_layer_norm(hidden_states)\n", - "\n", - " # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple\n", - " cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None\n", - " hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(\n", - " hidden_states=hidden_states,\n", - " key_value_states=encoder_hidden_states,\n", - " attention_mask=encoder_attention_mask,\n", - " past_key_value=cross_attn_past_key_value,\n", - " output_attentions=output_attentions,\n", - " )\n", - " hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)\n", - " hidden_states = residual + hidden_states\n", - " if not self.normalize_before:\n", - " hidden_states = self.encoder_attn_layer_norm(hidden_states)\n", - "\n", - " # add cross-attn to positions 3,4 of present_key_value tuple\n", - " present_key_value = present_key_value + cross_attn_present_key_value\n", - "\n", - " # Fully Connected\n", - " residual = hidden_states\n", - " if self.normalize_before:\n", - " hidden_states = self.final_layer_norm(hidden_states)\n", - " hidden_states = self.activation_fn(self.fc1(hidden_states))\n", - " hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)\n", - " hidden_states = self.fc2(hidden_states)\n", - " hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)\n", - " hidden_states = residual + hidden_states\n", - " if not self.normalize_before:\n", - " hidden_states = self.final_layer_norm(hidden_states)\n", - "\n", - " return (\n", - " hidden_states,\n", - " self_attn_weights,\n", - " present_key_value,\n", - " cross_attn_weights,\n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "basic", 
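
A shape-only sketch (dummy tensors, not real model states) of the 4-tuple cache used above: `past_key_value[:2]` holds the self-attention key/value and grows by one position per decoding step, while `past_key_value[-2:]` holds the cross-attention key/value and stays fixed at the encoder length.

```python
import torch

bsz, num_heads, head_dim = 2, 4, 8
src_len = 10  # encoder (cross-attention) length

# Cache after 3 decoded tokens: (self_k, self_v, cross_k, cross_v)
past_key_value = (
    torch.zeros(bsz, num_heads, 3, head_dim),
    torch.zeros(bsz, num_heads, 3, head_dim),
    torch.zeros(bsz, num_heads, src_len, head_dim),
    torch.zeros(bsz, num_heads, src_len, head_dim),
)

# Decoding one more token: the self-attention key/value for a single new position
new_k = torch.zeros(bsz, num_heads, 1, head_dim)
new_v = torch.zeros(bsz, num_heads, 1, head_dim)
self_k = torch.cat([past_key_value[0], new_k], dim=2)  # now 4 positions
self_v = torch.cat([past_key_value[1], new_v], dim=2)

# The cross-attention cache is reused as-is; the encoder output never changes.
present_key_value = (self_k, self_v) + past_key_value[-2:]
print([t.shape[2] for t in present_key_value])  # [4, 4, 10, 10]
```
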
- "language": "python", - "name": "basic" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/BERT/BERT_ISSUES.md b/BERT/BERT_ISSUES.md deleted file mode 100644 index c0e6a55..0000000 --- a/BERT/BERT_ISSUES.md +++ /dev/null @@ -1,162 +0,0 @@ -# [google-research/bert](https://github.com/google-research/bert/issues)의 Issues 공부 - -## About BERT! -### Paper Detail -- [How is this counted? --> "3.3 billion word corpus"](https://github.com/google-research/bert/issues/1060) -- [How is the number of BERT model parameters calculated?](https://github.com/google-research/bert/issues/656) - -### Implement Detail -- [Question: What does "pooler layer" mean? Why it called pooler?](https://github.com/google-research/bert/issues/1102) -- [Why we get last 4 layers while residual connection transfer useful knowledge to the subsequent layers?](https://github.com/google-research/bert/issues/1064) -- [Infill](https://github.com/google-research/bert/pull/913) (very very 중요!! 코드 구현 어캐했는지 보고 나도 구현하기) - -### Training Detail -- [Explain the variables in the checkpoint](https://github.com/google-research/bert/issues/1019) -- [NotfoundError: Key bert/embeddings/LayerNorm/beta not found in checkpoint](https://github.com/google-research/bert/issues/997) -- [NotFoundError: [_Derived_]No gradient defined for op: Einsum on Tensorflow 1.15](https://github.com/google-research/bert/issues/1012) -- [How to see masked_lm_loss & next_sentence_loss per iteration step during train?](https://github.com/google-research/bert/issues/952) -- [how use the pretrain checkpoint to continue train on my own corpus?](https://github.com/google-research/bert/issues/888) -- [Can BERT really handle misspelled words?](https://github.com/google-research/bert/issues/812) -- [Experiment using RAdam optimizer](https://github.com/google-research/bert/issues/810) -- [Performance metrics of the classifier](https://github.com/google-research/bert/issues/800) -- [Tutorial: A Pipeline Of Pretraining Bert On Google TPU](https://github.com/google-research/bert/issues/681) -- [Determining training steps](https://github.com/google-research/bert/issues/662) -- [Learning Rate and Warmup Steps](https://github.com/google-research/bert/issues/649) -- [How to freeze layers of bert?](https://github.com/google-research/bert/issues/637) -- [How often is the validation/evaluation performed? 
(fine-tuning using run_classifier.py)](https://github.com/google-research/bert/issues/636) -- [How to get masked word prediction probabilities](https://github.com/google-research/bert/issues/608) - -## Tokenization -- [How to handle labels when using the BERT wordpiece tokenizer](https://github.com/google-research/bert/issues/646) -- [Tokenization behavior with messed-up unicode characters](https://github.com/google-research/bert/issues/1093) -- [x] [fix korean tokenization bug](https://github.com/google-research/bert/pull/1070) -- [x] [Update tokenization](https://github.com/google-research/bert/pull/1042) -- [WordPiece Tokenizer Clarification](https://github.com/google-research/bert/issues/763) - -#### Vocabulary (답변 없는 경우 많음) -- [use custom vocab.txt](https://github.com/google-research/bert/issues/1092) -- [Adding custom domain words and abbreviations to vocab.txt](https://github.com/google-research/bert/issues/1083) -- [update load_vocab() function based on ALBERT](https://github.com/google-research/bert/pull/961) -- [Language dependent vocabulary?](https://github.com/google-research/bert/issues/641) - -#### Embedding -- [What does bert embedding of a single term signify?](https://github.com/google-research/bert/issues/990) -- [Bert sent embeddings](https://github.com/google-research/bert/pull/691) - -## Bert Pre-Training -- [Pretraining BERT without next sentence prediction](https://github.com/google-research/bert/issues/178) -- [Using my pre-trained model](https://github.com/google-research/bert/issues/1040) -- [BERT pretraining num_train_steps questions](https://github.com/google-research/bert/issues/1025) -- [BERT pre-training using only domain specific text](https://github.com/google-research/bert/issues/615) - -## Bert Fine-Tuning -- [Does bert have this function ?](https://github.com/google-research/bert/issues/1024) (Bert for LM) -- [Is it possible feed BERT to seq2seq encoder for NMT (for low resource language)?](https://github.com/google-research/bert/issues/1007) (답없음) -- [extract_features sentence embedding BERT](https://github.com/google-research/bert/issues/1085) -- [How does Google calculate a document embeddings using BERT in its new search?](https://github.com/google-research/bert/issues/957) (Fine-tune이라기 보단 feature-based일거 같지만... Google Search에서 어떻게 활용할지?) 
-- [Exporting bert model to a saved model format](https://github.com/google-research/bert/issues/843) (tf serving) -- [How to use BERT for ranking with Pairwise loss function during Finetuining](https://github.com/google-research/bert/issues/761) -- [Serving fine-tuned Model - best solution](https://github.com/google-research/bert/issues/755) (bert in flask) -- [Sentiment analysis on emoji data.](https://github.com/google-research/bert/issues/748) -- [how to fine tune bert for ner on custom data](https://github.com/google-research/bert/issues/713) -- [Tuned Bert Model on MRPC gives wrong predictions.](https://github.com/google-research/bert/issues/663) -- [How to use run_squad.py to produce multiple answers for a question?](https://github.com/google-research/bert/issues/657) -- [Using bert for Document Classification](https://github.com/google-research/bert/issues/650) -- [Losing Knowledge for Language Model in Fine-Tuning](https://github.com/google-research/bert/issues/651) -- [how to use BERT for Siamese Model paraphrase identify](https://github.com/google-research/bert/issues/648) -- [Classification fine tuning for Q & A](https://github.com/google-research/bert/issues/639) - -## Distributed Training -- [Exploding gradients in training BERT from scratch](https://github.com/google-research/bert/issues/1016) -- [Can I run multi-gpu pretraining?](https://github.com/google-research/bert/issues/978) -- [x] [Gradient Accumulation](https://github.com/google-research/bert/pull/976) -- [multi-gpu horovod](https://github.com/google-research/bert/issues/743) -- [Recommended GPU size when training BERT-base](https://github.com/google-research/bert/issues/645) - -## Open Issues -- [How to create two BERT model with shared weights?](https://github.com/google-research/bert/issues/605) -- [How to train our own domain-specific data instead of using pre-training models?](https://github.com/google-research/bert/issues/606) -- [create_pretraining_data.py generates tfrecords that are too big](https://github.com/google-research/bert/issues/1161) -- [How can i use BERT to correct the alignment and spellings in a sentence?](https://github.com/google-research/bert/issues/1154) -- [Update the number of parameters](https://github.com/google-research/bert/pull/1150) -- [Incomplete feature vectors generated by Bert model.](https://github.com/google-research/bert/issues/1145) -- [Update tokenization.py](https://github.com/google-research/bert/pull/1117) -- [Dealing with ellipses in BERT tokenization](https://github.com/google-research/bert/issues/1116) -- [A spelling error is fixed](https://github.com/google-research/bert/pull/1168) -- [How to use my own additional vocabulary dictionary?](https://github.com/google-research/bert/issues/396) -- [Is there a plan to release code for fine-tuning on CoQA dataset?](https://github.com/google-research/bert/issues/597) -- [How to use my own vocabulary when do pre-training from scratch?](https://github.com/google-research/bert/issues/589) -- [BERT has a non deterministic behaviour](https://github.com/google-research/bert/issues/583) -- [how to use bert to text summary](https://github.com/google-research/bert/issues/576) -- [BERT multilingual for zero-shot classification](https://github.com/google-research/bert/issues/577) -- [BERT encode emojis as [UNK] token](https://github.com/google-research/bert/issues/587) -- [How to use BERT for sequence labelling](https://github.com/google-research/bert/issues/569) -- [Added support for multi gpu training and distributed training using 
Horovod](https://github.com/google-research/bert/pull/568) -- [How many articles (Wiki+Book corpus) do Bert use in pretraining?](https://github.com/google-research/bert/issues/570) -- [Problem with wordpiece tokenization](https://github.com/google-research/bert/issues/560) -- [problem multiclass text classification](https://github.com/google-research/bert/issues/559) -- [IndexError in run_classifier.py::MrpcProcessor::_create_examples (2)](https://github.com/google-research/bert/issues/551) -- [bad results after pretraining](https://github.com/google-research/bert/issues/529) -- [Is BERT a kind of cheating?](https://github.com/google-research/bert/issues/514) -- [Fixing normalized korean char](https://github.com/google-research/bert/pull/512) -- [Are BERT word-embeddings capable of synonyms?](https://github.com/google-research/bert/issues/507) -- [How to share BERT between tasks in multi-task setting?](https://github.com/google-research/bert/issues/504) -- [add regression fine-tuning](https://github.com/google-research/bert/pull/503) -- [Pre-trained monolingual in French](https://github.com/google-research/bert/issues/502) -- [what is the synthetic self-training](https://github.com/google-research/bert/issues/488) -- [Fine-Tune encodings on unsupervised data?](https://github.com/google-research/bert/issues/448) -- [Using BERT with custom QA dataset](https://github.com/google-research/bert/issues/411) -- [How can I change vocab size for pretrained model?](https://github.com/google-research/bert/issues/406) -- [How to use my own additional vocabulary dictionary?](https://github.com/google-research/bert/issues/396) -- [Can I use a "[CLS]...[SEP]...[SEP]...[SEP]" in tokens?](https://github.com/google-research/bert/issues/395) -- [Weights from next sentence prediction](https://github.com/google-research/bert/issues/370) -- [Optimize the code logic](https://github.com/google-research/bert/pull/366) -- [BERT vs Word2vec](https://github.com/google-research/bert/issues/362) -- [BERT for text summarization](https://github.com/google-research/bert/issues/352) -- [Wiki Data Formation Problem, Need Sentence Split](https://github.com/google-research/bert/issues/341) -- [how use BERT language model to predict next word](https://github.com/google-research/bert/issues/323) -- [how to get fine_tune model output probability](https://github.com/google-research/bert/issues/322) -- [how the model reflect 'bidirectional'?](https://github.com/google-research/bert/issues/319) -- [Extract features return different layer values (vectors) each time, is it working well?](https://github.com/google-research/bert/issues/312) -- [Is BERT powerful enough to learn sentence embedding and word embedding?](https://github.com/google-research/bert/issues/261) -- [Gpu optimizations](https://github.com/google-research/bert/pull/255) -- [Use BERT fine-tuned model for Tensorflow serving](https://github.com/google-research/bert/issues/146) -- [What is BERT?](https://github.com/google-research/bert/issues/566) -- [BERT with FP16 and XLA inference speed](https://github.com/google-research/bert/issues/391) - -## Closed Issues -- [zero-shot for IsNext and NotNext function](https://github.com/google-research/bert/issues/1118) -- [I don't know how to properly use fine tuned Bert Model](https://github.com/google-research/bert/issues/1097) -- [why dropout at predicting time](https://github.com/google-research/bert/issues/1096) -- [LayerNorm normalises the batch dimension as well](https://github.com/google-research/bert/issues/1088) -- [MRPC 
Produces Two Vastly Different Eval Accuracy](https://github.com/google-research/bert/issues/1037) -- [bert run_classifier](https://github.com/google-research/bert/issues/989) -- [how to realize the tokenization of BERT model in c++](https://github.com/google-research/bert/issues/878) -- [how to infer in python](https://github.com/google-research/bert/issues/614) -- [Bert Context Based QA](https://github.com/google-research/bert/issues/620) -- [Best performance on concatenated layers: which dimension?](https://github.com/google-research/bert/issues/511) -- [Issue with multiclass text classification](https://github.com/google-research/bert/issues/449) -- [What is exactly the learning rate warmup described in the paper?](https://github.com/google-research/bert/issues/425) -- [Fine-Tuning specifications for MNLI/XNLI](https://github.com/google-research/bert/issues/328) -- [fine-tuning with additional masked lm loss, and masked lm loss diverged](https://github.com/google-research/bert/issues/306) -- [Handling domain specific vocabulary](https://github.com/google-research/bert/issues/237) -- [Can you release the hyper-parameter of NER task?](https://github.com/google-research/bert/issues/223) -- [Question about mask strategy.](https://github.com/google-research/bert/issues/169) -- [Is CLS token also Masked in pre-training?](https://github.com/google-research/bert/issues/166) -- [BERT Vector Space shows issues with unknown words](https://github.com/google-research/bert/issues/164) -- [Simplifying BERT for Q&A - One paragraph and Query](https://github.com/google-research/bert/issues/159) -- [Reproducing paper results from feature vectors (STS-B dataset)](https://github.com/google-research/bert/issues/161) -- [Fine tuning BERT to extract embeddings (like ELMo)](https://github.com/google-research/bert/issues/145) -- [Classification quality is depends on max_sequence_length](https://github.com/google-research/bert/issues/113) -- [fine-tuned for a document task](https://github.com/google-research/bert/issues/107) -- [When to stop training? What is a good valid loss value to stop ? 
How to improve classification performance?](https://github.com/google-research/bert/issues/95) -- [plan to release SWAG code?](https://github.com/google-research/bert/issues/38) -- [Add flag to extract only features for the [CLS] token](https://github.com/google-research/bert/pull/87) -- [run_pretraining.py - clip gradient error: Found Inf or NaN global norm: Tensor had NaN value](https://github.com/google-research/bert/issues/82) -- [How to train models on GPU instead of CPU when TPU is not available?](https://github.com/google-research/bert/issues/75) -- [how to see loss per steps or epoch during train?](https://github.com/google-research/bert/issues/70) -- [Extracting features on for long sequences / SQuAD](https://github.com/google-research/bert/issues/66) -- [Trouble to understand position embedding.](https://github.com/google-research/bert/issues/58) -- [PyTorch implementation](https://github.com/google-research/bert/issues/54) -- [Plans to release sequence tagging task fine-tuning code?](https://github.com/google-research/bert/issues/33) -- [w to get the word embedding after pre-training?](https://github.com/google-research/bert/issues/60) diff --git a/BERT/README.md b/BERT/README.md deleted file mode 100644 index 064f194..0000000 --- a/BERT/README.md +++ /dev/null @@ -1 +0,0 @@ -# BERT Implementation with PyTorch diff --git a/ETRI_KorBERT.md b/ETRI_KorBERT.md deleted file mode 100644 index dffc3ce..0000000 --- a/ETRI_KorBERT.md +++ /dev/null @@ -1,224 +0,0 @@ -# ETRI KorBERT 한국어 embedding 사용하기 및 적용 예시 :) -2주 동안 source code 하나하나 뜯어가며 삽질한 노고를 기록하고 BERT에서 해당 코드가 어떠한 역할을 하는지 논문과 비교하며 설명! - -ETRI KorBERT로 이미 많은 분들이 활용하고 계시지만 친절하게 코드 하나하나 어떻게 해야한다는 연습 예제는 없더라! - -내가 공부한 내용을 공유하며 차근차근 예시 문제를 풀며 한국어 형태소 분석을 기반으로 하는 BERT를 활용하는 것이 이 repo의 목적이다! - -### Requirements -- Tensorflow 1.15.0 - - `huggingface`의 pytorch transformer 모델과 `google research`의 tensorflow 모델 둘 다 지원하지만 - - 저는 google research의 `tensorflow` 버전을 활용하고 버전은 2.0 이전 버전 중 가장 최신 버전인 1.15.0 사용 -- ETRI에서 제공하는 model ckpt(checkpoint)와 vocab list - - 이는 저작권 상 Git에 올릴 수 없으니 **아래 ETRI 홈페이지에서 직접 openapi를 활용하여 받도록 한다.** - - [ETRI 학습 모델 및 데이터 제공](http://aiopen.etri.re.kr/service_dataset.php) - - ETRI에서 제공하는 버전은 총 4개이다. - ``` - 1. Pytorch + Morphology - 2. Tensorflow + Morphology - 3. Pytorch + Eojeol - 4. Tensorflow + Eojeol - ``` - - 형태소와 어절은 input을 형태소 분석을 하고 넣어줄 것인지, 아니면 pure text 자체를 넣어줄 것인지 여부의 차이만 존재할 뿐, 큰 차이가 없다. - - 중요한 것은 **사용하는 형태소 분석기는 TTA 표준 형태소 태그셋(TTAK.KO-11.0010/R1)에 호환되는 형태소분석기 사용**이 필요하다. - - [한국정보통신기술협회(Telecommuication Technology Association, TTA) 형태소 태그셋](http://aiopen.etri.re.kr/data/001.형태소분석_가이드라인.pdf) - - 이를 만족하는 형태소 분석기는 카카오 팀의 `khaiii`와 `ETRI`에서 제공하는 형태소 분석기가 존재한다. - - `konlpy.tag.Komoran`도 가능한지는 살펴봐야겠다. 형태소 분석 성능 면에서는 `ETRI`에서 제공하는 형태소 분석기가 더 좋았다. - - `khaiii`는 Window 환경에서 사용 불가능하다. Docker로 하는 방법 밖에는 없다. 
- - [khaiii docker 파일](https://github.com/kakao/khaiii/tree/master/docker) - - [Docker를 활용한 khaiii 설치수난기](https://medium.com/@saerombang11/docker를-활용한-khaiii-설치수난기-53d014f9eb58) -- Python 3.7 -- 그 외 python library - - `six, numpy, scikit-learn, urllib3, urllib, pandas, konlpy, chatspace, pytorch` - -### Example DataSets -- [Dacon 금융문자 분석 경진대회](https://dacon.io/cpt14) - - KB 금융그룹에서 제공한 금융문자가 스미싱인지 vs 아닌지 이진분류를 수행하는 task - - 한국어 text 20만 건 이상 존재하고 있음 - -- [Naver Sentiment Movie Corpus](https://github.com/e9t/nsmc) - - 73만 건의 naver movie reviews 데이터를 크롤링한 데이터 - - rating 기반으로 긍/부정의 극성 분류를 시도 - -- [AI Hub DataSets](http://www.aihub.or.kr/) - - 적용 예정 - -### Appendix -- TTAK.KO-11.0010/R1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| 대분류 | 중분류 | 소분류 |
|---|---|---|
| (1) 체언 | 명사 | 일반명사(NNG) |
| | | 고유명사(NNP) |
| | | 의존명사(NNB) |
| | 대명사(NP) | 대명사(NP) |
| | 수사(NR) | 수사(NR) |
| (2) 용언 | 동사(VV) | 동사(VV) |
| | 형용사(VA) | 형용사(VA) |
| | 보조용언(VX) | 보조용언(VX) |
| | 지정사(VC) | 긍정지정사(VCP) |
| | | 부정지정사(VCN) |
| (3) 수식언 | 관형사(MM) | 성상 관형사(MMA) |
| | | 지시 관형사(MMD) |
| | | 수 관형사(MMN) |
| | 부사(MA) | 일반부사(MAG) |
| | | 접속부사(MAJ) |
| (4) 독립언 | 감탄사(IC) | 감탄사(IC) |
| (5) 관계언 | 격조사(JK) | 주격조사(JKS) |
| | | 보격조사(JKC) |
| | | 관형격조사(JKG) |
| | | 목적격조사(JKO) |
| | | 부사격조사(JKB) |
| | | 호격조사(JKV) |
| | | 인용격조사(JKQ) |
| | 보조사(JX) | 보조사(JX) |
| | 접속조사(JC) | 접속조사(JC) |
| (6) 의존형태 | 어미(EM) | 선어말어미(EP) |
| | | 종결어미(EF) |
| | | 연결어미(EC) |
| | | 명사형전성어미(ETN) |
| | | 관형형전성어미(ETM) |
| | 접두사(XP) | 체언접두사(XPN) |
| | 접미사(XS) | 명사파생접미사(XSN) |
| | | 동사파생접미사(XSV) |
| | | 형용사파생접미사(XSA) |
| | 어근(XR) | 어근(XR) |
| (7) 기호 | 일반기호(ST) | 마침표, 물음표, 느낌표(SF) |
| | | 쉼표, 가운뎃점, 콜론, 빗금(SP) |
| | | 따옴표, 괄호표, 줄표(SS) |
| | | 줄임표(SE) |
| | | 붙임표(물결)(SO) |
| | | 기타 기호(SW) |
| | 외국어(SL) | 외국어(SL) |
| | 한자(SH) | 한자(SH) |
| | 숫자(SN) | 숫자(SN) |
| | 분석불능범주(NA) | 분석불능범주(NA) |
- diff --git a/Hangul_Analyzer.py b/Hangul_Analyzer.py deleted file mode 100644 index a0c5de5..0000000 --- a/Hangul_Analyzer.py +++ /dev/null @@ -1,97 +0,0 @@ -import urllib3 -import logging -import json -logger = logging.getLogger(__name__) -from getpass import getpass - -from khaiii import KhaiiiApi -from konlpy.tag import Komoran -# from konlpy.tag import Mecab - -# If you use windows, try this. -# !pip install eunjeon -from eunjeon import Mecab - -# ETRI 형태소 분석기 -class ETRIMorphology: - - def __init__(self): - self.openapiKey = self.get_apikey() - self.url = "http://aiopen.etri.re.kr:8000/WiseNLU" - self.requestJson = {"access_key": self.openapiKey, - "argument": {"text": None, "analysis_code": "morp"}} - self.http = urllib3.PoolManager() - - @staticmethod - def get_apikey(): - openapikey = getpass('Type OpenAPI Key :') - return openapikey - - @staticmethod - def _try_connect(openApiURL, requestJson): - response = self.http.request( - "POST", openApiURL, - headers={"Content-Type": "application/json; charset=UTF-8"}, - body=json.dumps(requestJson)) - return response - - @staticmethod - def _get_json_result(response): - json_data = json.loads(response.data.decode('utf-8')) - return json_data - - @staticmethod - def _check_valid_connect(json_data): - if json_data['result'] == -1: - if 'Invalid Access Key' in json_data['reason']: - logger.info(json_reason) - logger.info('Please check the openapi access key.') - sys.exit() - return "openapi error - " + json_reason - else: - return True - - def do_lang(self, text): - self.requestJson['argument']['text'] = text - response = self._try_connect(self.url, self.requestJson) - json_data = self._get_json_result(response) - res = self._check_valid_connect(json_data) - if not res: - print(res) - return None - else: - json_return_obj = json_data['return_object'] - return_result = "" - json_sentence = json_return_obj['sentence'] - for json_morp in json_sentence: - for morp in json_morp['morp']: - return_result += str(morp['lemma']) + '/' + str(morp['type']) + " " - return return_result[:-1] - -def Analyze(self, text, SEP=' + '): - """ - KhaiiiApi의 분석 결과를 보기좋게 돌려주는 method - - USAGE; - ```python - from khaiii import KhaiiiApi - khai3 = KhaiiiApi() - khai3.analyze('아버지가방에들어가신다 왜 자꾸 거리감들게할까 내 성격 리얼...') - >>> [, - >>> , - >>> , - >>> , - >>> , - >>> , - >>> ] - - setattr(khai3.__class__, 'Analyze', Analyze) - khai3.Analze('아버지가방에들어가신다 왜 자꾸 거리감들게할까 내 성격 리얼...') - >>> '아버지/NNG + 가/JKS + 방/NNG + 에/JKB + 들어가/VV + 시/EP + ㄴ다/EC + - 왜/MAG + 자꾸/MAG + 거리감/NNG + 들/VV + 게/EC + 하/VV + ㄹ까/EC + - 나/NP + 의/JKG + 성격/NNG + 리/NNG + 얼/IC + ../SE + ./SF' - ``` - """ - res = self.analyze(text) - f = lambda x: x.__str__().split('\t')[1] - return SEP.join(list(map(f, res))) diff --git a/README.md b/README.md index 248fc79..272076f 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,8 @@ -# Korean Transformers -- Transformers에 관련된 개념들 구현 레포 +# Advanced Transformers -## 🤗 transformers에서 사용되는 특별한 기술들 -- gradient checkpoint -- reversible residual connection -- dynamic padding -- chunk feed forward network -- 3d, 4d multi-head scaled dot product attention -- past key value -- various positional embedding -- various heads -- porting script -- generation mixin -- parrallelism mixin -- pushtohub mixin -- how to make tokenization script? 
-- trainer -- various utils +- WIP - -## Reference - -### BPE -- [A New Algorithm for Data Compression](https://www.derczynski.com/papers/archive/BPE_Gage.pdf) - -### Wordpiece -- [Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation](https://arxiv.org/abs/1609.08144) - -### Transformers -- [Attention Is All You Need](https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf) +## 프로젝트 목표 +- transformers v4.20.0 기준 131개 모델에 대한 공부 실시 +- 중복된 component들을 일반화하고 다양한 component들에 대한 공부 실시 +- 튜토리얼 자료를 만들어서 오픈 커뮤니티에 공개 (131개 모델들) \ No newline at end of file diff --git a/Untitled.ipynb b/Untitled.ipynb deleted file mode 100644 index cd9e2b5..0000000 --- a/Untitled.ipynb +++ /dev/null @@ -1,32 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "basic", - "language": "python", - "name": "basic" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/preprocessing_bert.ipynb b/preprocessing_bert.ipynb deleted file mode 100644 index 727417c..0000000 --- a/preprocessing_bert.ipynb +++ /dev/null @@ -1,3788 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow as tf" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from utils_20191230 import *" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "tf.logging.set_verbosity(tf.logging.INFO)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### ETRI BERT Config\n", - "```python\n", - "bert_config = {\n", - " \"attention_probs_dropout_prob\": 0.1, \n", - " \"directionality\": \"bidi\", \n", - " \"hidden_act\": \"gelu\", \n", - " \"hidden_dropout_prob\": 0.1, \n", - " \"hidden_size\": 768, \n", - " \"initializer_range\": 0.02, \n", - " \"intermediate_size\": 3072, \n", - " \"max_position_embeddings\": 512, \n", - " \"num_attention_heads\": 12, \n", - " \"num_hidden_layers\": 12, \n", - " \"pooler_fc_size\": 768, \n", - " \"pooler_num_attention_heads\": 12, \n", - " \"pooler_num_fc_layers\": 3, \n", - " \"pooler_size_per_head\": 128, \n", - " \"pooler_type\": \"first_token_transform\", \n", - " \"type_vocab_size\": 2, \n", - " \"vocab_size\": 30349\n", - "}\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# load ETRI Bert Config\n", - "path = '../KorBERT/2_bert_download_002_bert_morp_tensorflow/002_bert_morp_tensorflow/'\n", - "FLAGS.bert_config_file = path + 'bert_config.json'" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "bert_config = BertConfig.from_json_file(FLAGS.bert_config_file)\n", - "# bert_config = BertConfig.from_dict(bert_config) # 위의 dictionary를 메모리에 올려서 \n", - " # 다음 메서드로 호출하는 것도 가능하다." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " (FLAGS) MAX_SEQ_LENGTH : 128\n", - "(BERTConfig) MAX_POSITION_EMBEDDINGS : 512\n" - ] - } - ], - "source": [ - "if FLAGS.max_seq_length > bert_config.max_position_embeddings:\n", - " raise ValueError(\n", - " \"Cannot use sequence length %d because the BERT model \"\n", - " \"was only trained up to sequence length %d\" %\n", - " (FLAGS.max_seq_length, bert_config.max_position_embeddings))\n", - "else:\n", - " print(' (FLAGS) MAX_SEQ_LENGTH :', FLAGS.max_seq_length)\n", - " print('(BERTConfig) MAX_POSITION_EMBEDDINGS :', bert_config.max_position_embeddings)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Do not use TPU\n" - ] - } - ], - "source": [ - "# do not use tpu\n", - "tpu_cluster_resolver = None\n", - "if FLAGS.use_tpu and FLAGS.tpu_name:\n", - " tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(\n", - " FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)\n", - "else:\n", - " print('Do not use TPU')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:\n", - "The TensorFlow contrib module will not be included in TensorFlow 2.0.\n", - "For more information, please see:\n", - " * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n", - " * https://github.com/tensorflow/addons\n", - " * https://github.com/tensorflow/io (for I/O related ops)\n", - "If you depend on functionality not listed there, please file an issue.\n", - "\n" - ] - } - ], - "source": [ - "is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2\n", - "run_config = tf.contrib.tpu.RunConfig(\n", - " cluster=tpu_cluster_resolver,\n", - " master=FLAGS.master,\n", - " model_dir=FLAGS.output_dir,\n", - " save_checkpoints_steps=FLAGS.save_checkpoints_steps, # 1000\n", - " tpu_config=tf.contrib.tpu.TPUConfig(\n", - " iterations_per_loop=FLAGS.iterations_per_loop, # 1000\n", - " num_shards=FLAGS.num_tpu_cores, # 8\n", - " per_host_input_for_training=is_per_host)\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# tensorflow gpu 사용 가능한지 체크\n", - "tf.test.is_gpu_available()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# 한국어 vocab 사전을 등록\n", - "FLAGS.vocab_file = path + 'vocab.korean_morp.list' " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "간략한 파일 준비" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "dacon_path = '../dacon문자스미싱/filedown (2)/'\n", - "df_train = pd.read_csv(dacon_path + 'train.csv')\n", - "df_test = pd.read_csv(dacon_path + 'public_test.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "((236756, 2), (59189, 2), (236756,), (59189,))" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from sklearn.model_selection 
import train_test_split\n", - "\n", - "df_train = df_train.set_index('id')\n", - "df_test = df_test.set_index('id')\n", - "\n", - "X_train, X_valid, y_train, y_valid = train_test_split(\n", - " df_train[[col for col in df_train.columns if col != 'smishing']], \n", - " df_train['smishing'],\n", - " random_state=42, test_size=.2,\n", - " stratify=df_train['smishing'])\n", - "X_train.shape, X_valid.shape, y_train.shape, y_valid.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "df_train = pd.concat((X_train, y_train), axis=1)\n", - "df_valid = pd.concat((X_valid, y_valid), axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "# sample 100개씩 뽑아서 미리 test\n", - "df_train.sample(100).to_csv(dacon_path + 'train_100.tsv', sep='\\t')\n", - "df_valid.sample(100).to_csv(dacon_path + 'dev_100.tsv', sep='\\t')\n", - "df_test.sample(100).to_csv(dacon_path + 'test_100.tsv', sep='\\t')" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# DataProcessor 제작\n", - "class SmishingProcessor(DataProcessor):\n", - "\n", - " def get_train_examples(self, data_dir, filename='train.tsv'):\n", - " return self._create_examples(\n", - " self._read_tsv(os.path.join(data_dir, filename)), \"train\")\n", - "\n", - " def get_dev_examples(self, data_dir, filename='dev.tsv'):\n", - " return self._create_examples(\n", - " self._read_tsv(os.path.join(data_dir, filename)), \"dev\")\n", - "\n", - " def get_test_examples(self, data_dir, filename='test.tsv'):\n", - " return self._create_examples(\n", - " self._read_tsv(os.path.join(data_dir, filename)), \"test\")\n", - "\n", - " def get_labels(self):\n", - " return [\"0\", \"1\"]\n", - "\n", - " def _create_examples(self, lines, set_type):\n", - " examples = []\n", - " for (i, line) in enumerate(lines):\n", - " if i == 0:\n", - " continue\n", - " guid = \"%s-%s\" % (set_type, i)\n", - " text_a = convert_to_unicode(line[2])\n", - " if set_type == \"test\":\n", - " label = \"0\"\n", - " else:\n", - " label = convert_to_unicode(line[-1])\n", - " examples.append(\n", - " InputExample(guid=guid, text_a=text_a, label=label))\n", - " return examples" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "processor = SmishingProcessor()\n", - "label_list = processor.get_labels()\n", - "\n", - "# get train samples\n", - "train_examples = processor.get_train_examples(dacon_path, 'train_100.tsv')\n", - "num_train_steps = int(\n", - " len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)\n", - "num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "# record ETRI model weights\n", - "FLAGS.init_checkpoint = path + 'model.ckpt'" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "model_fn = model_fn_builder(\n", - " bert_config=bert_config,\n", - " num_labels=len(label_list), # 2\n", - " init_checkpoint=FLAGS.init_checkpoint, # None\n", - " learning_rate=FLAGS.learning_rate, # 5e-05\n", - " num_train_steps=num_train_steps, # 22195\n", - " num_warmup_steps=num_warmup_steps, # 2219\n", - " use_tpu=FLAGS.use_tpu, # False\n", - " use_one_hot_embeddings=FLAGS.use_tpu) # False" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": 
{}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:Estimator's model_fn (.model_fn at 0x000002196B19C0D0>) includes params argument, but params are not passed to Estimator.\n", - "WARNING:tensorflow:Using temporary folder as model directory: C:\\Users\\jinma\\AppData\\Local\\Temp\\tmp95ip6j47\n", - "INFO:tensorflow:Using config: {'_model_dir': 'C:\\\\Users\\\\jinma\\\\AppData\\\\Local\\\\Temp\\\\tmp95ip6j47', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true\n", - "graph_options {\n", - " rewrite_options {\n", - " meta_optimizer_iterations: ONE\n", - " }\n", - "}\n", - ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': , '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None, eval_training_input_configuration=2, experimental_host_call_every_n_steps=1), '_cluster': None}\n", - "INFO:tensorflow:_TPUContext: eval_on_tpu True\n", - "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n" - ] - } - ], - "source": [ - "# If TPU is not available, this will fall back to normal Estimator on CPU\n", - "# or GPU\n", - "estimator = tf.contrib.tpu.TPUEstimator(\n", - " use_tpu=FLAGS.use_tpu, # False\n", - " model_fn=model_fn,\n", - " config=run_config,\n", - " train_batch_size=FLAGS.train_batch_size, # 32\n", - " eval_batch_size=FLAGS.eval_batch_size, # 8\n", - " predict_batch_size=FLAGS.predict_batch_size # 8\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "FLAGS.output_dir = './output_dir/smishing/'" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "tf.gfile.MakeDirs(FLAGS.output_dir)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Embedding\n", - "- TTA 표준 형태소 태그셋(TTAK.KO-11.0010/R1)에 맞는 형태소 분석기를 사용해야 함." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
| 대분류 | 중분류 | 소분류 |
|---|---|---|
| (1) 체언 | 명사 | 일반명사(NNG) |
| | | 고유명사(NNP) |
| | | 의존명사(NNB) |
| | 대명사(NP) | 대명사(NP) |
| | 수사(NR) | 수사(NR) |
| (2) 용언 | 동사(VV) | 동사(VV) |
| | 형용사(VA) | 형용사(VA) |
| | 보조용언(VX) | 보조용언(VX) |
| | 지정사(VC) | 긍정지정사(VCP) |
| | | 부정지정사(VCN) |
| (3) 수식언 | 관형사(MM) | 성상 관형사(MMA) |
| | | 지시 관형사(MMD) |
| | | 수 관형사(MMN) |
| | 부사(MA) | 일반부사(MAG) |
| | | 접속부사(MAJ) |
| (4) 독립언 | 감탄사(IC) | 감탄사(IC) |
| (5) 관계언 | 격조사(JK) | 주격조사(JKS) |
| | | 보격조사(JKC) |
| | | 관형격조사(JKG) |
| | | 목적격조사(JKO) |
| | | 부사격조사(JKB) |
| | | 호격조사(JKV) |
| | | 인용격조사(JKQ) |
| | 보조사(JX) | 보조사(JX) |
| | 접속조사(JC) | 접속조사(JC) |
| (6) 의존형태 | 어미(EM) | 선어말어미(EP) |
| | | 종결어미(EF) |
| | | 연결어미(EC) |
| | | 명사형전성어미(ETN) |
| | | 관형형전성어미(ETM) |
| | 접두사(XP) | 체언접두사(XPN) |
| | 접미사(XS) | 명사파생접미사(XSN) |
| | | 동사파생접미사(XSV) |
| | | 형용사파생접미사(XSA) |
| | 어근(XR) | 어근(XR) |
| (7) 기호 | 일반기호(ST) | 마침표, 물음표, 느낌표(SF) |
| | | 쉼표, 가운뎃점, 콜론, 빗금(SP) |
| | | 따옴표, 괄호표, 줄표(SS) |
| | | 줄임표(SE) |
| | | 붙임표(물결)(SO) |
| | | 기타 기호(SW) |
| | 외국어(SL) | 외국어(SL) |
| | 한자(SH) | 한자(SH) |
| | 숫자(SN) | 숫자(SN) |
| | 분석불능범주(NA) | 분석불능범주(NA) |
" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "# TTA 가이드라인\n", - "# http://aiopen.etri.re.kr/data/001.형태소분석_가이드라인.pdf\n", - "tta_guide = {\n", - " '체언': {\n", - " '명사': ('NN',\n", - " {'일반명사': 'NNG',\n", - " '고유명사': 'NNP',\n", - " '의존명사': 'NNB'}),\n", - " '대명사': ('NP',{'대명사': 'NP'}),\n", - " '수사': ('NR', {'수사': 'NR'})\n", - " },\n", - " '용언': {\n", - " '동사': ('VV', {'동사': 'VV'}),\n", - " '형용사': ('VA', {'형용사': 'VA'}),\n", - " '보조용언': ('VX', {'보조용언': 'VX'}),\n", - " '지정사': ('VC', \n", - " {'긍정지정사': 'VCP',\n", - " '부정지정사': 'VCN'})\n", - " },\n", - " '수식언': {\n", - " '관형사': ('MM', \n", - " {'성상 관형사': 'MMA',\n", - " '지시 관형사': 'MMD',\n", - " '수 관형사': 'MMN'}),\n", - " '부사': ('MA', \n", - " {'일반부사': 'MAG',\n", - " '접속부사': 'MAJ'})\n", - " },\n", - " '독립언': {\n", - " '감탄사': ('IC', {'감탄사': 'IC'})\n", - " },\n", - " '관계언': {\n", - " '격조사': ('JK', \n", - " {'주격조사': 'JKS',\n", - " '보격조사': 'JKC',\n", - " '관형격조사': 'JKG',\n", - " '목적격조사': 'JKO',\n", - " '부사격조사': 'JKB',\n", - " '호격조사': 'JKV',\n", - " '인용격조사': 'JKQ'}),\n", - " '보조사': ('JX', {'보조사': 'JK'}),\n", - " '접속조사': ('JC', {'접속조사': 'JC'})\n", - " },\n", - " '의존형태': {\n", - " '어미': ('EM', \n", - " {'선어말어미': 'EP',\n", - " '종결어미': 'EF',\n", - " '연결어미': 'EC',\n", - " '명사형전성어미': 'ETN',\n", - " '관형형전성어미': 'ETM'}),\n", - " '접두사': ('XP', {'체언접두사': 'XPN'}),\n", - " '접미사': ('XS', \n", - " {'명사파생접미사': 'XSN',\n", - " '동사파생접미사': 'XSV',\n", - " '형용사파생접미사': 'XSA'}),\n", - " '어근': ('XR', {'어근': 'XR'})\n", - " },\n", - " '기호': {\n", - " '일반기호': ('ST', \n", - " {'마침표, 물음표, 느낌표': 'SF',\n", - " '쉼표, 가운뎃점, 콜론, 빗금': 'SP',\n", - " '따옴표, 괄호표, 줄표': 'SS',\n", - " '줄임표': 'SE',\n", - " '붙임표(물결)': 'SO',\n", - " '기타 기호': 'SW'}),\n", - " '외국어': ('SL', {'외국어': 'SL'}),\n", - " '한자': ('SH', {'한자': 'SH'}),\n", - " '숫자': ('SN', {'숫자': 'SN'}),\n", - " '분석불능범주': ('NA', {'분석불능범주': 'NA'})\n", - " }\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'체언': {'명사': ('NN', {'일반명사': 'NNG', '고유명사': 'NNP', '의존명사': 'NNB'}),\n", - " '대명사': ('NP', {'대명사': 'NP'}),\n", - " '수사': ('NR', {'수사': 'NR'})},\n", - " '용언': {'동사': ('VV', {'동사': 'VV'}),\n", - " '형용사': ('VA', {'형용사': 'VA'}),\n", - " '보조용언': ('VX', {'보조용언': 'VX'}),\n", - " '지정사': ('VC', {'긍정지정사': 'VCP', '부정지정사': 'VCN'})},\n", - " '수식언': {'관형사': ('MM', {'성상 관형사': 'MMA', '지시 관형사': 'MMD', '수 관형사': 'MMN'}),\n", - " '부사': ('MA', {'일반부사': 'MAG', '접속부사': 'MAJ'})},\n", - " '독립언': {'감탄사': ('IC', {'감탄사': 'IC'})},\n", - " '관계언': {'격조사': ('JK',\n", - " {'주격조사': 'JKS',\n", - " '보격조사': 'JKC',\n", - " '관형격조사': 'JKG',\n", - " '목적격조사': 'JKO',\n", - " '부사격조사': 'JKB',\n", - " '호격조사': 'JKV',\n", - " '인용격조사': 'JKQ'}),\n", - " '보조사': ('JX', {'보조사': 'JK'}),\n", - " '접속조사': ('JC', {'접속조사': 'JC'})},\n", - " '의존형태': {'어미': ('EM',\n", - " {'선어말어미': 'EP',\n", - " '종결어미': 'EF',\n", - " '연결어미': 'EC',\n", - " '명사형전성어미': 'ETN',\n", - " '관형형전성어미': 'ETM'}),\n", - " '접두사': ('XP', {'체언접두사': 'XPN'}),\n", - " '접미사': ('XS', {'명사파생접미사': 'XSN', '동사파생접미사': 'XSV', '형용사파생접미사': 'XSA'}),\n", - " '어근': ('XR', {'어근': 'XR'})},\n", - " '기호': {'일반기호': ('ST',\n", - " {'마침표, 물음표, 느낌표': 'SF',\n", - " '쉼표, 가운뎃점, 콜론, 빗금': 'SP',\n", - " '따옴표, 괄호표, 줄표': 'SS',\n", - " '줄임표': 'SE',\n", - " '붙임표(물결)': 'SO',\n", - " '기타 기호': 'SW'}),\n", - " '외국어': ('SL', {'외국어': 'SL'}),\n", - " '한자': ('SH', {'한자': 'SH'}),\n", - " '숫자': ('SN', {'숫자': 'SN'}),\n", - " '분석불능범주': ('NA', {'분석불능범주': 'NA'})}}" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": 
"execute_result" - } - ], - "source": [ - "tta_guide" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [], - "source": [ - "pos_set = []\n", - "for VALUE in tta_guide.values():\n", - " for VALUE2 in VALUE.values():\n", - " pos_set.append(VALUE2[0])\n", - " for VALUE3 in VALUE2[1].values():\n", - " pos_set.append(VALUE3)\n", - "pos_set = list(set(pos_set))" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [], - "source": [ - "from konlpy.tag import Komoran\n", - "\n", - "komoran = Komoran()" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "[i[1] for i in komoran.pos(train_examples[0].text_a) if i[1] not in pos_set]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Komoran 형태소 분석기로 분석 실시" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [], - "source": [ - "for exam in train_examples:\n", - " exam.text_a = ' '.join(\n", - " [i[0] + '/' + i[1] \n", - " for i in komoran.pos(exam.text_a)])" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'XXX/SL 지점/NNP 을/JKO 거래/NNG 하/XSV 아/EC 주/VX 시/EP 어서/EC 대단히/MAG 감사/NNG 하/XSV ㅂ니다/EF ./SF 내/NNP 점/NNB 하/NNP XXX/NNP 고객/NNG 님/XSN 고객/NNG 만족도/NNG 설문/NNP 조사/NNP 전화/NNG 받/VV 으시/EP 면/EC 매우/MAG 동의/NNG 하/XSV ㄴ다라고/EC 우수/NNP 직원/NNP 추천/NNG 해주시/NNP 이/VCP 고/EC 사은품/NNG 받/VV 아/EC 가/VX 시/EP 어요/EC ../SE 더욱더/MAG 친절히/MAG 모시/VV 겠/EP 습니다/EC XXX/SL 은행/NNP XXX/NNP 올림/NNP'" - ] - }, - "execution_count": 82, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_examples[0].text_a" - ] - }, - { - "cell_type": "code", - "execution_count": 243, - "metadata": {}, - "outputs": [], - "source": [ - "train_file = os.path.join(FLAGS.output_dir, 'train.tf_record')" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [], - "source": [ - "# fn: file_based_convert_examples_to_features\n", - "\n", - "## Arguments\n", - "examples = train_examples\n", - "label_list = label_list\n", - "max_seq_length = FLAGS.max_seq_length\n", - "# tokenizer = FullTokenizer() # 예시로 tokenizing을 어떻게 하는지 전부 기록\n", - "output_file = train_file" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [], - "source": [ - "writer = tf.python_io.TFRecordWriter(output_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [], - "source": [ - "ex_index = 5\n", - "example = examples[ex_index]" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 109, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# fn: convert_single_example\n", - "isinstance(example, PaddingInputExample)" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'0': 0, '1': 1}" - ] - }, - "execution_count": 110, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "label_map = {}\n", - "for (i, label) in enumerate(label_list):\n", - " label_map[label] = i\n", - "label_map" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - 
"metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'XXX 고객님항상 XXX은행 모란역 지점을 이용해 주시는 고객님께 감사의 마음을 전합니다. 혹시 업무와 관련해 궁금한 점이 있으시면 이 번호로 연락주시기바랍니다. 성심껏 도와드리겠습니다. 또 혹시 고객만족도 조사 전화를 받으시면 매우 동의한다 로 칭찬해 주세요 조금은 쌀쌀한 10월의 첫주입니다.환절기 감기조심하시고 따듯한 차와 함께 건강한 한주 보내시기 바랍니다.XXX은행모란역XXX올림'" - ] - }, - "execution_count": 111, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# class: FullTokenizer\n", - "path = '../KorBERT/2_bert_download_002_bert_morp_tensorflow/002_bert_morp_tensorflow/'\n", - "FLAGS.vocab_file = path + 'vocab.korean_morp.list' \n", - "vocab_file = FLAGS.vocab_file\n", - "do_lower_case = FLAGS.do_lower_case\n", - "text = example.text_a\n", - "text" - ] - }, - { - "cell_type": "code", - "execution_count": 112, - "metadata": {}, - "outputs": [], - "source": [ - "vocab = load_vocab(vocab_file)\n", - "inv_vocab = {v:k for k, v in vocab.items()}" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "metadata": {}, - "outputs": [], - "source": [ - "openApiURL = \"http://aiopen.etri.re.kr:8000/WiseNLU\"\n", - "openapi_key = ''\n", - "requestJson = { \"access_key\": openapi_key, \"argument\": { \"text\": text, \"analysis_code\": \"morp\" } }" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'access_key': '0da70af6-e163-44b5-8d5d-f217bebb5765',\n", - " 'argument': {'text': 'XXX 고객님항상 XXX은행 모란역 지점을 이용해 주시는 고객님께 감사의 마음을 전합니다. 혹시 업무와 관련해 궁금한 점이 있으시면 이 번호로 연락주시기바랍니다. 성심껏 도와드리겠습니다. 또 혹시 고객만족도 조사 전화를 받으시면 매우 동의한다 로 칭찬해 주세요 조금은 쌀쌀한 10월의 첫주입니다.환절기 감기조심하시고 따듯한 차와 함께 건강한 한주 보내시기 바랍니다.XXX은행모란역XXX올림',\n", - " 'analysis_code': 'morp'}}" - ] - }, - "execution_count": 114, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "requestJson" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "metadata": {}, - "outputs": [], - "source": [ - "import urllib3" - ] - }, - { - "cell_type": "code", - "execution_count": 116, - "metadata": {}, - "outputs": [], - "source": [ - "http = urllib3.PoolManager()" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "metadata": {}, - "outputs": [], - "source": [ - "response = http.request( \"POST\", openApiURL, headers={\"Content-Type\": \"application/json; charset=UTF-8\"}, body=json.dumps(requestJson))" - ] - }, - { - "cell_type": "code", - "execution_count": 118, - "metadata": {}, - "outputs": [], - "source": [ - "json_data = json.loads(response.data.decode('utf-8'))\n", - "json_result = json_data[\"result\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 119, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 119, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "json_result" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": {}, - "outputs": [], - "source": [ - "json_data = json.loads(response.data.decode('utf-8'))\n", - "json_return_obj = json_data[\"return_object\"]\n", - "return_result = \"\"\n", - "json_sentence = json_return_obj[\"sentence\"]\n", - "for json_morp in json_sentence: \n", - " for morp in json_morp[\"morp\"]:\n", - " return_result = return_result+str(morp[\"lemma\"])+\"/\"+str(morp[\"type\"])+\" \"" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'XXX/SL 고객/NNG 님/XSN 항상/MAG XXX/SL 은행/NNG 모란/NNG 역/NNG 지점/NNG 을/JKO 이용/NNG 하/XSV 어/EC 주/VX 시/EP 는/ETM 고객/NNG 님/XSN 
께/JKB 감사/NNG 의/JKG 마음/NNG 을/JKO 전하/VV ㅂ니다/EF ./SF 혹시/MAG 업무/NNG 와/JKB 관련/NNG 하/XSV 어/EC 궁금하/VA ㄴ/ETM 점/NNG 이/JKS 있/VA 으시/EP 면/EC 이/MM 번호/NNG 로/JKB 연락/NNG 주/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF 성심껏/MAG 돕/VV 아/EC 드리/VX 겠/EP 습니다/EF ./SF 또/MAG 혹시/MAG 고객/NNG 만족/NNG 도/NNG 조사/NNG 전화/NNG 를/JKO 받/VV 으시/EP 면/EC 매우/MAG 동의/NNG 하/XSV ㄴ다/EF 로/JKB 칭찬/NNG 하/XSV 어/EC 주/VX 시/EP 어요/EF 조금/NNG 은/JX 쌀쌀하/VA ㄴ/ETM 10/SN 월/NNB 의/JKG 첫주/NNG 이/VCP ㅂ니다/EF ./SF 환절/NNG 기/XSN 감기/NNG 조심/NNG 하/XSV 시/EP 고/EC 따듯하/VA ㄴ/ETM 차/NNG 와/JC 함께/MAG 건강/NNG 하/XSA ㄴ/ETM 한/MM 주/NNB 보내/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF XXX/SL 은행/NNG 모란/NNG 역/NNG XXX/SL 올림/NNG '" - ] - }, - "execution_count": 121, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "return_result" - ] - }, - { - "cell_type": "code", - "execution_count": 124, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'XXX/SL 고객/NNG 님/NNG 항상/MAG XXX/SL 은행/NNP 모란역/NNP 지점/NNG 을/JKO 이용/NNG 하/XSV 아/EC 주시/NNP 는/JX 고객/NNG 님/XSN 께/JKB 감사/NNG 의/JKG 마음/NNG 을/JKO 전하/VV ㅂ니다/EF ./SF 혹시/MAG 업무/NNG 와/JC 관련/NNG 하/XSV 아/EC 궁금/XR 하/XSA ㄴ/ETM 점/NNB 이/JKS 있/VX 으시/EP 면/EC 이/MM 번호/NNG 로/JKB 연락/NNG 주/NNG 시기/NNG 바라/VV ㅂ니다/EF ./SF 성심껏/MAG 돕/VV 아/EC 드리/VX 겠/EP 습니다/EF ./SF 또/MAJ 혹시/MAG 고객/NNP 만족/NNP 도/JX 조사/NNG 전화/NNG 를/JKO 받/VV 으시/EP 면/EC 매우/MAG 동의/NNG 하/XSV ㄴ다/EC 로/NNG 칭찬/NNG 하/XSV 아/EC 주/VX 시/EP 어요/EC 조금/NNG 은/JX 쌀쌀/XR 하/XSA ㄴ/ETM 10월/NNP 의/JKG 첫/MM 주/NNB 이/VCP ㅂ니다/EF ./SF 환절기/NNG 감기/NNP 조심/NNG 하/XSV 시/EP 고/EC 따듯/XR 하/XSA ㄴ/ETM 차/NNG 와/JC 함께/MAG 건강/NNG 하/XSV ㄴ/ETM 한/MM 주/NNP 보내/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF XXX/SL 은행/NNP 모란역/NNP XXX/NNP 올림/NNP'" - ] - }, - "execution_count": 124, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "' '.join([i[0] + '/' + i[1] for i in komoran.pos(text)])" - ] - }, - { - "cell_type": "code", - "execution_count": 125, - "metadata": {}, - "outputs": [], - "source": [ - "def do_lang ( openapi_key, text ) :\n", - " openApiURL = \"http://aiopen.etri.re.kr:8000/WiseNLU\"\n", - "\t \n", - " requestJson = { \"access_key\": openapi_key, \"argument\": { \"text\": text, \"analysis_code\": \"morp\" } }\n", - "\t \n", - " http = urllib3.PoolManager()\n", - " response = http.request( \"POST\", openApiURL, headers={\"Content-Type\": \"application/json; charset=UTF-8\"}, body=json.dumps(requestJson))\n", - " \n", - " json_data = json.loads(response.data.decode('utf-8'))\n", - " json_result = json_data[\"result\"]\n", - " \n", - " if json_result == -1:\n", - " json_reason = json_data[\"reason\"]\n", - " if \"Invalid Access Key\" in json_reason:\n", - " logger.info(json_reason)\n", - " logger.info(\"Please check the openapi access key.\")\n", - " sys.exit()\n", - " return \"openapi error - \" + json_reason \n", - " else:\n", - " json_data = json.loads(response.data.decode('utf-8'))\n", - " \n", - " json_return_obj = json_data[\"return_object\"]\n", - " \n", - " return_result = \"\"\n", - " json_sentence = json_return_obj[\"sentence\"]\n", - " for json_morp in json_sentence: \n", - " for morp in json_morp[\"morp\"]:\n", - " return_result = return_result+str(morp[\"lemma\"])+\"/\"+str(morp[\"type\"])+\" \"\n", - "\n", - " return return_result" - ] - }, - { - "cell_type": "code", - "execution_count": 127, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'XXX/SL 고객/NNG 님/XSN 항상/MAG XXX/SL 은행/NNG 모란/NNG 역/NNG 지점/NNG 을/JKO 이용/NNG 하/XSV 어/EC 주/VX 시/EP 는/ETM 고객/NNG 님/XSN 께/JKB 감사/NNG 의/JKG 마음/NNG 을/JKO 전하/VV ㅂ니다/EF ./SF 혹시/MAG 업무/NNG 와/JKB 관련/NNG 하/XSV 어/EC 궁금하/VA ㄴ/ETM 점/NNG 이/JKS 있/VA 으시/EP 면/EC 이/MM 번호/NNG 
로/JKB 연락/NNG 주/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF 성심껏/MAG 돕/VV 아/EC 드리/VX 겠/EP 습니다/EF ./SF 또/MAG 혹시/MAG 고객/NNG 만족/NNG 도/NNG 조사/NNG 전화/NNG 를/JKO 받/VV 으시/EP 면/EC 매우/MAG 동의/NNG 하/XSV ㄴ다/EF 로/JKB 칭찬/NNG 하/XSV 어/EC 주/VX 시/EP 어요/EF 조금/NNG 은/JX 쌀쌀하/VA ㄴ/ETM 10/SN 월/NNB 의/JKG 첫주/NNG 이/VCP ㅂ니다/EF ./SF 환절/NNG 기/XSN 감기/NNG 조심/NNG 하/XSV 시/EP 고/EC 따듯하/VA ㄴ/ETM 차/NNG 와/JC 함께/MAG 건강/NNG 하/XSA ㄴ/ETM 한/MM 주/NNB 보내/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF XXX/SL 은행/NNG 모란/NNG 역/NNG XXX/SL 올림/NNG '" - ] - }, - "execution_count": 127, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "return_result" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "metadata": {}, - "outputs": [], - "source": [ - "ids_to_tokens = collections.OrderedDict(\n", - " [(ids, tok) for tok, ids in vocab.items()])" - ] - }, - { - "cell_type": "code", - "execution_count": 132, - "metadata": {}, - "outputs": [], - "source": [ - "never_split=(\"[UNK]\", \"[SEP]\", \"[PAD]\", \"[CLS]\", \"[MASK]\")" - ] - }, - { - "cell_type": "code", - "execution_count": 133, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 133, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "do_lower_case" - ] - }, - { - "cell_type": "code", - "execution_count": 135, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'../KorBERT/2_bert_download_002_bert_morp_tensorflow/002_bert_morp_tensorflow/vocab.korean_morp.list'" - ] - }, - "execution_count": 135, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vocab_file" - ] - }, - { - "cell_type": "code", - "execution_count": 138, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 138, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "os.path.isdir(vocab_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 139, - "metadata": {}, - "outputs": [], - "source": [ - "from pathlib import Path\n", - "\n", - "from torch.hub import _get_torch_home\n", - "\n", - "torch_cache_home = _get_torch_home()\n", - " \n", - "default_cache_path = os.path.join(torch_cache_home, \"transformers\")\n", - "\n", - "PYTORCH_PRETRAINED_BERT_CACHE = Path(\n", - " os.getenv(\"PYTORCH_TRANSFORMERS_CACHE\", os.getenv(\"PYTORCH_PRETRAINED_BERT_CACHE\", default_cache_path))\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 141, - "metadata": {}, - "outputs": [], - "source": [ - "TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE" - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 142, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "isinstance(vocab_file, Path)" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'C:\\\\Users\\\\jinma\\\\.cache\\\\torch\\\\transformers'" - ] - }, - "execution_count": 144, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "if isinstance(TRANSFORMERS_CACHE, Path):\n", - " cache_dir = str(TRANSFORMERS_CACHE)\n", - "cache_dir" - ] - }, - { - "cell_type": "code", - "execution_count": 148, - "metadata": {}, - "outputs": [], - "source": [ - "from urllib.parse import urlparse\n", - "\n", - "def is_remote_url(url_or_filename):\n", - " parsed = urlparse(url_or_filename)\n", - " return parsed.scheme in ('http', 'https', 
's3')" - ] - }, - { - "cell_type": "code", - "execution_count": 272, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['[CLS]']" - ] - }, - "execution_count": 272, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokens = []\n", - "tokens.append('[CLS]')\n", - "tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 149, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 149, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "is_remote_url(vocab_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 150, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 150, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "os.path.exists(vocab_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 151, - "metadata": {}, - "outputs": [], - "source": [ - "def cached_path(\n", - " url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False, user_agent=None\n", - "):\n", - " if cache_dir is None:\n", - " cache_dir = TRANSFORMERS_CACHE\n", - " if isinstance(url_or_filename, Path):\n", - " url_or_filename = str(url_or_filename)\n", - " if isinstance(cache_dir, Path):\n", - " cache_dir = str(cache_dir)\n", - "\n", - " if is_remote_url(url_or_filename):\n", - " # URL, so get it from the cache (downloading if necessary)\n", - " return get_from_cache(\n", - " url_or_filename,\n", - " cache_dir=cache_dir,\n", - " force_download=force_download,\n", - " proxies=proxies,\n", - " resume_download=resume_download,\n", - " user_agent=user_agent,\n", - " )\n", - " elif os.path.exists(url_or_filename):\n", - " # File, and it exists.\n", - " return url_or_filename\n", - " elif urlparse(url_or_filename).scheme == \"\":\n", - " # File, but it doesn't exist.\n", - " raise EnvironmentError(\"file {} not found\".format(url_or_filename))\n", - " else:\n", - " # Something unknown\n", - " raise ValueError(\"unable to parse {} as a URL or as a local path\".format(url_or_filename))" - ] - }, - { - "cell_type": "code", - "execution_count": 152, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'../KorBERT/2_bert_download_002_bert_morp_tensorflow/002_bert_morp_tensorflow/vocab.korean_morp.list'" - ] - }, - "execution_count": 152, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cached_path(vocab_file) # vocab file 반환" - ] - }, - { - "cell_type": "code", - "execution_count": 94, - "metadata": {}, - "outputs": [], - "source": [ - "def _clean_text(text):\n", - " output = [] # char을 저장할 list 생성\n", - " for char in text:\n", - " # 텍스트에서 Char 단위로 출력\n", - " cp = ord(char)\n", - " if cp == 0 or cp == 0xfffd or _is_control(char):\n", - " # \\x00이거나 �이거나 unicode cat.이 C로 시작할 경우\n", - " # (개행문자 제외) output에 추가하지 않는다.\n", - " continue\n", - " if _is_whitespace(char):\n", - " # 공백일 경우 \" \"으로 output에 추가\n", - " output.append(\" \")\n", - " else:\n", - " # 이 외의 경우 전부 output에 추가\n", - " output.append(char)\n", - " # cleaning 작업을 거친 Text를 후처리하여 반환\n", - " return \"\".join(output)\n", - "\n", - "# char 단위 함수들\n", - "def _is_whitespace(char):\n", - " if char == \" \" or char == '\\t' or char == '\\n' or char == '\\r':\n", - " # 개행문자이거나 띄어쓰기면 True 반환\n", - " return True\n", - " cat = unicodedata.category(char)\n", - " if cat == 'Zs':\n", - " # unicode category가 Space Seperator면 True 반환\n", - " return True\n", - " # 이 외의 경우 전부 False 반환\n", 
- " return False\n", - "\n", - "def _is_control(char):\n", - " if char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n", - " # 개행문자이면 False 반환\n", - " return False\n", - " cat = unicodedata.category(char)\n", - " if cat.startswith(\"C\"):\n", - " # unicode category가\n", - " # Cc(Control) \n", - " # Cf(format)\n", - " # Co(Private Use, is 0)\n", - " # Cs(Surrrogate, is 0)일 경우, True 반환\n", - " return True\n", - " # 이 외의 경우 전부 False 반환\n", - " return False\n", - "\n", - "def _is_punctuation(char):\n", - " # 한국어 형태소 분석기이기 때문에 공백과 같은지 여부만 반환\n", - " return char == ' '" - ] - }, - { - "cell_type": "code", - "execution_count": 157, - "metadata": {}, - "outputs": [], - "source": [ - "def whitespace_tokenize(text):\n", - "\t\"\"\"Runs basic whitespace cleaning and splitting on a peice of text.\"\"\"\n", - "\ttext = text.strip()\n", - "\tif not text:\n", - "\t\treturn []\n", - "\ttokens = text.split()\n", - "\treturn tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": {}, - "outputs": [], - "source": [ - "def print_c(text, is_print):\n", - " if is_print:\n", - " print(text)\n", - " else:\n", - " print(end='')" - ] - }, - { - "cell_type": "code", - "execution_count": 248, - "metadata": {}, - "outputs": [], - "source": [ - "# do_lower_case = False\n", - "do_lower_case = True" - ] - }, - { - "cell_type": "code", - "execution_count": 258, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "곧/NNG\n", - "곧/NNG\n", - "False\n", - "True\n", - "True\n" - ] - } - ], - "source": [ - "print('곧/NNG')\n", - "print(unicodedata.normalize(\"NFD\", '곧/NNG'))\n", - "print('곧/NNG' == unicodedata.normalize(\"NFD\", '곧/NNG'))\n", - "print('곧/NNG' == unicodedata.normalize(\"NFC\",\n", - " unicodedata.normalize(\"NFD\", '곧/NNG')))\n", - "print(unicodedata.normalize(\"NFC\", '곧/NNG') == '곧/NNG')" - ] - }, - { - "cell_type": "code", - "execution_count": 264, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "' --|> Exists in vocab_file.'" - ] - }, - "execution_count": 264, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'{:>30}'.format('--|> Exists in vocab_file.')" - ] - }, - { - "cell_type": "code", - "execution_count": 270, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "************** START TOKENING MORPHLOGY **************\n", - "\n", - "Origin Text: XXX/SL 고객/NNG 님/XSN 항상/MAG XXX/SL 은행/NNG 모란/NNG 역/NNG 지점/NNG 을/JKO 이용/NNG 하/XSV 어/EC 주/VX 시/EP 는/ETM 고객/NNG 님/XSN 께/JKB 감사/NNG 의/JKG 마음/NNG 을/JKO 전하/VV ㅂ니다/EF ./SF 혹시/MAG 업무/NNG 와/JKB 관련/NNG 하/XSV 어/EC 궁금하/VA ㄴ/ETM 점/NNG 이/JKS 있/VA 으시/EP 면/EC 이/MM 번호/NNG 로/JKB 연락/NNG 주/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF 성심껏/MAG 돕/VV 아/EC 드리/VX 겠/EP 습니다/EF ./SF 또/MAG 혹시/MAG 고객/NNG 만족/NNG 도/NNG 조사/NNG 전화/NNG 를/JKO 받/VV 으시/EP 면/EC 매우/MAG 동의/NNG 하/XSV ㄴ다/EF 로/JKB 칭찬/NNG 하/XSV 어/EC 주/VX 시/EP 어요/EF 조금/NNG 은/JX 쌀쌀하/VA ㄴ/ETM 10/SN 월/NNB 의/JKG 첫주/NNG 이/VCP ㅂ니다/EF ./SF 환절/NNG 기/XSN 감기/NNG 조심/NNG 하/XSV 시/EP 고/EC 따듯하/VA ㄴ/ETM 차/NNG 와/JC 함께/MAG 건강/NNG 하/XSA ㄴ/ETM 한/MM 주/NNB 보내/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF XXX/SL 은행/NNG 모란/NNG 역/NNG XXX/SL 올림/NNG \n", - "\n", - "Cleaned Text: XXX/SL 고객/NNG 님/XSN 항상/MAG XXX/SL 은행/NNG 모란/NNG 역/NNG 지점/NNG 을/JKO 이용/NNG 하/XSV 어/EC 주/VX 시/EP 는/ETM 고객/NNG 님/XSN 께/JKB 감사/NNG 의/JKG 마음/NNG 을/JKO 전하/VV ㅂ니다/EF ./SF 혹시/MAG 업무/NNG 와/JKB 관련/NNG 하/XSV 어/EC 궁금하/VA ㄴ/ETM 점/NNG 이/JKS 있/VA 으시/EP 면/EC 이/MM 번호/NNG 로/JKB 연락/NNG 주/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF 성심껏/MAG 돕/VV 아/EC 드리/VX 겠/EP 
습니다/EF ./SF 또/MAG 혹시/MAG 고객/NNG 만족/NNG 도/NNG 조사/NNG 전화/NNG 를/JKO 받/VV 으시/EP 면/EC 매우/MAG 동의/NNG 하/XSV ㄴ다/EF 로/JKB 칭찬/NNG 하/XSV 어/EC 주/VX 시/EP 어요/EF 조금/NNG 은/JX 쌀쌀하/VA ㄴ/ETM 10/SN 월/NNB 의/JKG 첫주/NNG 이/VCP ㅂ니다/EF ./SF 환절/NNG 기/XSN 감기/NNG 조심/NNG 하/XSV 시/EP 고/EC 따듯하/VA ㄴ/ETM 차/NNG 와/JC 함께/MAG 건강/NNG 하/XSA ㄴ/ETM 한/MM 주/NNB 보내/VV 시/EP 기/ETN 바라/VV ㅂ니다/EF ./SF XXX/SL 은행/NNG 모란/NNG 역/NNG XXX/SL 올림/NNG \n", - "\n", - "Orig. Tokens: ['XXX/SL', '고객/NNG', '님/XSN', '항상/MAG', 'XXX/SL', '은행/NNG', '모란/NNG', '역/NNG', '지점/NNG', '을/JKO', '이용/NNG', '하/XSV', '어/EC', '주/VX', '시/EP', '는/ETM', '고객/NNG', '님/XSN', '께/JKB', '감사/NNG', '의/JKG', '마음/NNG', '을/JKO', '전하/VV', 'ㅂ니다/EF', './SF', '혹시/MAG', '업무/NNG', '와/JKB', '관련/NNG', '하/XSV', '어/EC', '궁금하/VA', 'ㄴ/ETM', '점/NNG', '이/JKS', '있/VA', '으시/EP', '면/EC', '이/MM', '번호/NNG', '로/JKB', '연락/NNG', '주/VV', '시/EP', '기/ETN', '바라/VV', 'ㅂ니다/EF', './SF', '성심껏/MAG', '돕/VV', '아/EC', '드리/VX', '겠/EP', '습니다/EF', './SF', '또/MAG', '혹시/MAG', '고객/NNG', '만족/NNG', '도/NNG', '조사/NNG', '전화/NNG', '를/JKO', '받/VV', '으시/EP', '면/EC', '매우/MAG', '동의/NNG', '하/XSV', 'ㄴ다/EF', '로/JKB', '칭찬/NNG', '하/XSV', '어/EC', '주/VX', '시/EP', '어요/EF', '조금/NNG', '은/JX', '쌀쌀하/VA', 'ㄴ/ETM', '10/SN', '월/NNB', '의/JKG', '첫주/NNG', '이/VCP', 'ㅂ니다/EF', './SF', '환절/NNG', '기/XSN', '감기/NNG', '조심/NNG', '하/XSV', '시/EP', '고/EC', '따듯하/VA', 'ㄴ/ETM', '차/NNG', '와/JC', '함께/MAG', '건강/NNG', '하/XSA', 'ㄴ/ETM', '한/MM', '주/NNB', '보내/VV', '시/EP', '기/ETN', '바라/VV', 'ㅂ니다/EF', './SF', 'XXX/SL', '은행/NNG', '모란/NNG', '역/NNG', 'XXX/SL', '올림/NNG']\n", - "\n", - "\tstripped accent+norm(NFD) Token : XXX/SL\n", - "\tchars : ['X', 'X', 'X', '/', 'S', 'L']\n", - "\tstripped accent+norm(NFD) Token : 고객/NNG\n", - "\tchars : ['ᄀ', 'ᅩ', 'ᄀ', 'ᅢ', 'ᆨ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 님/XSN\n", - "\tchars : ['ᄂ', 'ᅵ', 'ᆷ', '/', 'X', 'S', 'N']\n", - "\tstripped accent+norm(NFD) Token : 항상/MAG\n", - "\tchars : ['ᄒ', 'ᅡ', 'ᆼ', 'ᄉ', 'ᅡ', 'ᆼ', '/', 'M', 'A', 'G']\n", - "\tstripped accent+norm(NFD) Token : XXX/SL\n", - "\tchars : ['X', 'X', 'X', '/', 'S', 'L']\n", - "\tstripped accent+norm(NFD) Token : 은행/NNG\n", - "\tchars : ['ᄋ', 'ᅳ', 'ᆫ', 'ᄒ', 'ᅢ', 'ᆼ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 모란/NNG\n", - "\tchars : ['ᄆ', 'ᅩ', 'ᄅ', 'ᅡ', 'ᆫ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 역/NNG\n", - "\tchars : ['ᄋ', 'ᅧ', 'ᆨ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 지점/NNG\n", - "\tchars : ['ᄌ', 'ᅵ', 'ᄌ', 'ᅥ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 을/JKO\n", - "\tchars : ['ᄋ', 'ᅳ', 'ᆯ', '/', 'J', 'K', 'O']\n", - "\tstripped accent+norm(NFD) Token : 이용/NNG\n", - "\tchars : ['ᄋ', 'ᅵ', 'ᄋ', 'ᅭ', 'ᆼ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 하/XSV\n", - "\tchars : ['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']\n", - "\tstripped accent+norm(NFD) Token : 어/EC\n", - "\tchars : ['ᄋ', 'ᅥ', '/', 'E', 'C']\n", - "\tstripped accent+norm(NFD) Token : 주/VX\n", - "\tchars : ['ᄌ', 'ᅮ', '/', 'V', 'X']\n", - "\tstripped accent+norm(NFD) Token : 시/EP\n", - "\tchars : ['ᄉ', 'ᅵ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 는/ETM\n", - "\tchars : ['ᄂ', 'ᅳ', 'ᆫ', '/', 'E', 'T', 'M']\n", - "\tstripped accent+norm(NFD) Token : 고객/NNG\n", - "\tchars : ['ᄀ', 'ᅩ', 'ᄀ', 'ᅢ', 'ᆨ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 님/XSN\n", - "\tchars : ['ᄂ', 'ᅵ', 'ᆷ', '/', 'X', 'S', 'N']\n", - "\tstripped accent+norm(NFD) Token : 께/JKB\n", - "\tchars : ['ᄁ', 'ᅦ', '/', 'J', 'K', 'B']\n", - "\tstripped accent+norm(NFD) Token : 감사/NNG\n", - "\tchars : 
['ᄀ', 'ᅡ', 'ᆷ', 'ᄉ', 'ᅡ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 의/JKG\n", - "\tchars : ['ᄋ', 'ᅴ', '/', 'J', 'K', 'G']\n", - "\tstripped accent+norm(NFD) Token : 마음/NNG\n", - "\tchars : ['ᄆ', 'ᅡ', 'ᄋ', 'ᅳ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 을/JKO\n", - "\tchars : ['ᄋ', 'ᅳ', 'ᆯ', '/', 'J', 'K', 'O']\n", - "\tstripped accent+norm(NFD) Token : 전하/VV\n", - "\tchars : ['ᄌ', 'ᅥ', 'ᆫ', 'ᄒ', 'ᅡ', '/', 'V', 'V']\n", - "\tstripped accent+norm(NFD) Token : ㅂ니다/EF\n", - "\tchars : ['ㅂ', 'ᄂ', 'ᅵ', 'ᄃ', 'ᅡ', '/', 'E', 'F']\n", - "\tstripped accent+norm(NFD) Token : ./SF\n", - "\tchars : ['.', '/', 'S', 'F']\n", - "\tstripped accent+norm(NFD) Token : 혹시/MAG\n", - "\tchars : ['ᄒ', 'ᅩ', 'ᆨ', 'ᄉ', 'ᅵ', '/', 'M', 'A', 'G']\n", - "\tstripped accent+norm(NFD) Token : 업무/NNG\n", - "\tchars : ['ᄋ', 'ᅥ', 'ᆸ', 'ᄆ', 'ᅮ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 와/JKB\n", - "\tchars : ['ᄋ', 'ᅪ', '/', 'J', 'K', 'B']\n", - "\tstripped accent+norm(NFD) Token : 관련/NNG\n", - "\tchars : ['ᄀ', 'ᅪ', 'ᆫ', 'ᄅ', 'ᅧ', 'ᆫ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 하/XSV\n", - "\tchars : ['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']\n", - "\tstripped accent+norm(NFD) Token : 어/EC\n", - "\tchars : ['ᄋ', 'ᅥ', '/', 'E', 'C']\n", - "\tstripped accent+norm(NFD) Token : 궁금하/VA\n", - "\tchars : ['ᄀ', 'ᅮ', 'ᆼ', 'ᄀ', 'ᅳ', 'ᆷ', 'ᄒ', 'ᅡ', '/', 'V', 'A']\n", - "\tstripped accent+norm(NFD) Token : ㄴ/ETM\n", - "\tchars : ['ㄴ', '/', 'E', 'T', 'M']\n", - "\tstripped accent+norm(NFD) Token : 점/NNG\n", - "\tchars : ['ᄌ', 'ᅥ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 이/JKS\n", - "\tchars : ['ᄋ', 'ᅵ', '/', 'J', 'K', 'S']\n", - "\tstripped accent+norm(NFD) Token : 있/VA\n", - "\tchars : ['ᄋ', 'ᅵ', 'ᆻ', '/', 'V', 'A']\n", - "\tstripped accent+norm(NFD) Token : 으시/EP\n", - "\tchars : ['ᄋ', 'ᅳ', 'ᄉ', 'ᅵ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 면/EC\n", - "\tchars : ['ᄆ', 'ᅧ', 'ᆫ', '/', 'E', 'C']\n", - "\tstripped accent+norm(NFD) Token : 이/MM\n", - "\tchars : ['ᄋ', 'ᅵ', '/', 'M', 'M']\n", - "\tstripped accent+norm(NFD) Token : 번호/NNG\n", - "\tchars : ['ᄇ', 'ᅥ', 'ᆫ', 'ᄒ', 'ᅩ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 로/JKB\n", - "\tchars : ['ᄅ', 'ᅩ', '/', 'J', 'K', 'B']\n", - "\tstripped accent+norm(NFD) Token : 연락/NNG\n", - "\tchars : ['ᄋ', 'ᅧ', 'ᆫ', 'ᄅ', 'ᅡ', 'ᆨ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 주/VV\n", - "\tchars : ['ᄌ', 'ᅮ', '/', 'V', 'V']\n", - "\tstripped accent+norm(NFD) Token : 시/EP\n", - "\tchars : ['ᄉ', 'ᅵ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 기/ETN\n", - "\tchars : ['ᄀ', 'ᅵ', '/', 'E', 'T', 'N']\n", - "\tstripped accent+norm(NFD) Token : 바라/VV\n", - "\tchars : ['ᄇ', 'ᅡ', 'ᄅ', 'ᅡ', '/', 'V', 'V']\n", - "\tstripped accent+norm(NFD) Token : ㅂ니다/EF\n", - "\tchars : ['ㅂ', 'ᄂ', 'ᅵ', 'ᄃ', 'ᅡ', '/', 'E', 'F']\n", - "\tstripped accent+norm(NFD) Token : ./SF\n", - "\tchars : ['.', '/', 'S', 'F']\n", - "\tstripped accent+norm(NFD) Token : 성심껏/MAG\n", - "\tchars : ['ᄉ', 'ᅥ', 'ᆼ', 'ᄉ', 'ᅵ', 'ᆷ', 'ᄁ', 'ᅥ', 'ᆺ', '/', 'M', 'A', 'G']\n", - "\tstripped accent+norm(NFD) Token : 돕/VV\n", - "\tchars : ['ᄃ', 'ᅩ', 'ᆸ', '/', 'V', 'V']\n", - "\tstripped accent+norm(NFD) Token : 아/EC\n", - "\tchars : ['ᄋ', 'ᅡ', '/', 'E', 'C']\n", - "\tstripped accent+norm(NFD) Token : 드리/VX\n", - "\tchars : ['ᄃ', 'ᅳ', 'ᄅ', 'ᅵ', '/', 'V', 'X']\n", - "\tstripped accent+norm(NFD) Token : 겠/EP\n", - "\tchars : ['ᄀ', 'ᅦ', 'ᆻ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 
습니다/EF\n", - "\tchars : ['ᄉ', 'ᅳ', 'ᆸ', 'ᄂ', 'ᅵ', 'ᄃ', 'ᅡ', '/', 'E', 'F']\n", - "\tstripped accent+norm(NFD) Token : ./SF\n", - "\tchars : ['.', '/', 'S', 'F']\n", - "\tstripped accent+norm(NFD) Token : 또/MAG\n", - "\tchars : ['ᄄ', 'ᅩ', '/', 'M', 'A', 'G']\n", - "\tstripped accent+norm(NFD) Token : 혹시/MAG\n", - "\tchars : ['ᄒ', 'ᅩ', 'ᆨ', 'ᄉ', 'ᅵ', '/', 'M', 'A', 'G']\n", - "\tstripped accent+norm(NFD) Token : 고객/NNG\n", - "\tchars : ['ᄀ', 'ᅩ', 'ᄀ', 'ᅢ', 'ᆨ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 만족/NNG\n", - "\tchars : ['ᄆ', 'ᅡ', 'ᆫ', 'ᄌ', 'ᅩ', 'ᆨ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 도/NNG\n", - "\tchars : ['ᄃ', 'ᅩ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 조사/NNG\n", - "\tchars : ['ᄌ', 'ᅩ', 'ᄉ', 'ᅡ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 전화/NNG\n", - "\tchars : ['ᄌ', 'ᅥ', 'ᆫ', 'ᄒ', 'ᅪ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 를/JKO\n", - "\tchars : ['ᄅ', 'ᅳ', 'ᆯ', '/', 'J', 'K', 'O']\n", - "\tstripped accent+norm(NFD) Token : 받/VV\n", - "\tchars : ['ᄇ', 'ᅡ', 'ᆮ', '/', 'V', 'V']\n", - "\tstripped accent+norm(NFD) Token : 으시/EP\n", - "\tchars : ['ᄋ', 'ᅳ', 'ᄉ', 'ᅵ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 면/EC\n", - "\tchars : ['ᄆ', 'ᅧ', 'ᆫ', '/', 'E', 'C']\n", - "\tstripped accent+norm(NFD) Token : 매우/MAG\n", - "\tchars : ['ᄆ', 'ᅢ', 'ᄋ', 'ᅮ', '/', 'M', 'A', 'G']\n", - "\tstripped accent+norm(NFD) Token : 동의/NNG\n", - "\tchars : ['ᄃ', 'ᅩ', 'ᆼ', 'ᄋ', 'ᅴ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 하/XSV\n", - "\tchars : ['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']\n", - "\tstripped accent+norm(NFD) Token : ㄴ다/EF\n", - "\tchars : ['ㄴ', 'ᄃ', 'ᅡ', '/', 'E', 'F']\n", - "\tstripped accent+norm(NFD) Token : 로/JKB\n", - "\tchars : ['ᄅ', 'ᅩ', '/', 'J', 'K', 'B']\n", - "\tstripped accent+norm(NFD) Token : 칭찬/NNG\n", - "\tchars : ['ᄎ', 'ᅵ', 'ᆼ', 'ᄎ', 'ᅡ', 'ᆫ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 하/XSV\n", - "\tchars : ['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']\n", - "\tstripped accent+norm(NFD) Token : 어/EC\n", - "\tchars : ['ᄋ', 'ᅥ', '/', 'E', 'C']\n", - "\tstripped accent+norm(NFD) Token : 주/VX\n", - "\tchars : ['ᄌ', 'ᅮ', '/', 'V', 'X']\n", - "\tstripped accent+norm(NFD) Token : 시/EP\n", - "\tchars : ['ᄉ', 'ᅵ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 어요/EF\n", - "\tchars : ['ᄋ', 'ᅥ', 'ᄋ', 'ᅭ', '/', 'E', 'F']\n", - "\tstripped accent+norm(NFD) Token : 조금/NNG\n", - "\tchars : ['ᄌ', 'ᅩ', 'ᄀ', 'ᅳ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 은/JX\n", - "\tchars : ['ᄋ', 'ᅳ', 'ᆫ', '/', 'J', 'X']\n", - "\tstripped accent+norm(NFD) Token : 쌀쌀하/VA\n", - "\tchars : ['ᄊ', 'ᅡ', 'ᆯ', 'ᄊ', 'ᅡ', 'ᆯ', 'ᄒ', 'ᅡ', '/', 'V', 'A']\n", - "\tstripped accent+norm(NFD) Token : ㄴ/ETM\n", - "\tchars : ['ㄴ', '/', 'E', 'T', 'M']\n", - "\tstripped accent+norm(NFD) Token : 10/SN\n", - "\tchars : ['1', '0', '/', 'S', 'N']\n", - "\tstripped accent+norm(NFD) Token : 월/NNB\n", - "\tchars : ['ᄋ', 'ᅯ', 'ᆯ', '/', 'N', 'N', 'B']\n", - "\tstripped accent+norm(NFD) Token : 의/JKG\n", - "\tchars : ['ᄋ', 'ᅴ', '/', 'J', 'K', 'G']\n", - "\tstripped accent+norm(NFD) Token : 첫주/NNG\n", - "\tchars : ['ᄎ', 'ᅥ', 'ᆺ', 'ᄌ', 'ᅮ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 이/VCP\n", - "\tchars : ['ᄋ', 'ᅵ', '/', 'V', 'C', 'P']\n", - "\tstripped accent+norm(NFD) Token : ㅂ니다/EF\n", - "\tchars : ['ㅂ', 'ᄂ', 'ᅵ', 'ᄃ', 'ᅡ', '/', 'E', 'F']\n", - "\tstripped accent+norm(NFD) Token : ./SF\n", - "\tchars : ['.', '/', 'S', 'F']\n", - "\tstripped 
accent+norm(NFD) Token : 환절/NNG\n", - "\tchars : ['ᄒ', 'ᅪ', 'ᆫ', 'ᄌ', 'ᅥ', 'ᆯ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 기/XSN\n", - "\tchars : ['ᄀ', 'ᅵ', '/', 'X', 'S', 'N']\n", - "\tstripped accent+norm(NFD) Token : 감기/NNG\n", - "\tchars : ['ᄀ', 'ᅡ', 'ᆷ', 'ᄀ', 'ᅵ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 조심/NNG\n", - "\tchars : ['ᄌ', 'ᅩ', 'ᄉ', 'ᅵ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 하/XSV\n", - "\tchars : ['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']\n", - "\tstripped accent+norm(NFD) Token : 시/EP\n", - "\tchars : ['ᄉ', 'ᅵ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 고/EC\n", - "\tchars : ['ᄀ', 'ᅩ', '/', 'E', 'C']\n", - "\tstripped accent+norm(NFD) Token : 따듯하/VA\n", - "\tchars : ['ᄄ', 'ᅡ', 'ᄃ', 'ᅳ', 'ᆺ', 'ᄒ', 'ᅡ', '/', 'V', 'A']\n", - "\tstripped accent+norm(NFD) Token : ㄴ/ETM\n", - "\tchars : ['ㄴ', '/', 'E', 'T', 'M']\n", - "\tstripped accent+norm(NFD) Token : 차/NNG\n", - "\tchars : ['ᄎ', 'ᅡ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 와/JC\n", - "\tchars : ['ᄋ', 'ᅪ', '/', 'J', 'C']\n", - "\tstripped accent+norm(NFD) Token : 함께/MAG\n", - "\tchars : ['ᄒ', 'ᅡ', 'ᆷ', 'ᄁ', 'ᅦ', '/', 'M', 'A', 'G']\n", - "\tstripped accent+norm(NFD) Token : 건강/NNG\n", - "\tchars : ['ᄀ', 'ᅥ', 'ᆫ', 'ᄀ', 'ᅡ', 'ᆼ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 하/XSA\n", - "\tchars : ['ᄒ', 'ᅡ', '/', 'X', 'S', 'A']\n", - "\tstripped accent+norm(NFD) Token : ㄴ/ETM\n", - "\tchars : ['ㄴ', '/', 'E', 'T', 'M']\n", - "\tstripped accent+norm(NFD) Token : 한/MM\n", - "\tchars : ['ᄒ', 'ᅡ', 'ᆫ', '/', 'M', 'M']\n", - "\tstripped accent+norm(NFD) Token : 주/NNB\n", - "\tchars : ['ᄌ', 'ᅮ', '/', 'N', 'N', 'B']\n", - "\tstripped accent+norm(NFD) Token : 보내/VV\n", - "\tchars : ['ᄇ', 'ᅩ', 'ᄂ', 'ᅢ', '/', 'V', 'V']\n", - "\tstripped accent+norm(NFD) Token : 시/EP\n", - "\tchars : ['ᄉ', 'ᅵ', '/', 'E', 'P']\n", - "\tstripped accent+norm(NFD) Token : 기/ETN\n", - "\tchars : ['ᄀ', 'ᅵ', '/', 'E', 'T', 'N']\n", - "\tstripped accent+norm(NFD) Token : 바라/VV\n", - "\tchars : ['ᄇ', 'ᅡ', 'ᄅ', 'ᅡ', '/', 'V', 'V']\n", - "\tstripped accent+norm(NFD) Token : ㅂ니다/EF\n", - "\tchars : ['ㅂ', 'ᄂ', 'ᅵ', 'ᄃ', 'ᅡ', '/', 'E', 'F']\n", - "\tstripped accent+norm(NFD) Token : ./SF\n", - "\tchars : ['.', '/', 'S', 'F']\n", - "\tstripped accent+norm(NFD) Token : XXX/SL\n", - "\tchars : ['X', 'X', 'X', '/', 'S', 'L']\n", - "\tstripped accent+norm(NFD) Token : 은행/NNG\n", - "\tchars : ['ᄋ', 'ᅳ', 'ᆫ', 'ᄒ', 'ᅢ', 'ᆼ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 모란/NNG\n", - "\tchars : ['ᄆ', 'ᅩ', 'ᄅ', 'ᅡ', 'ᆫ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : 역/NNG\n", - "\tchars : ['ᄋ', 'ᅧ', 'ᆨ', '/', 'N', 'N', 'G']\n", - "\tstripped accent+norm(NFD) Token : XXX/SL\n", - "\tchars : ['X', 'X', 'X', '/', 'S', 'L']\n", - "\tstripped accent+norm(NFD) Token : 올림/NNG\n", - "\tchars : ['ᄋ', 'ᅩ', 'ᆯ', 'ᄅ', 'ᅵ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "\n", - "Basic_split_tokens : ['XXX/SL', '고객/NNG', '님/XSN', '항상/MAG', 'XXX/SL', '은행/NNG', '모란/NNG', '역/NNG', '지점/NNG', '을/JKO', '이용/NNG', '하/XSV', '어/EC', '주/VX', '시/EP', '는/ETM', '고객/NNG', '님/XSN', '께/JKB', '감사/NNG', '의/JKG', '마음/NNG', '을/JKO', '전하/VV', 'ㅂ니다/EF', './SF', '혹시/MAG', '업무/NNG', '와/JKB', '관련/NNG', '하/XSV', '어/EC', '궁금하/VA', 'ㄴ/ETM', '점/NNG', '이/JKS', '있/VA', '으시/EP', '면/EC', '이/MM', '번호/NNG', '로/JKB', '연락/NNG', '주/VV', '시/EP', '기/ETN', '바라/VV', 'ㅂ니다/EF', './SF', '성심껏/MAG', '돕/VV', '아/EC', '드리/VX', '겠/EP', '습니다/EF', './SF', '또/MAG', '혹시/MAG', '고객/NNG', '만족/NNG', '도/NNG', '조사/NNG', 
'전화/NNG', '를/JKO', '받/VV', '으시/EP', '면/EC', '매우/MAG', '동의/NNG', '하/XSV', 'ㄴ다/EF', '로/JKB', '칭찬/NNG', '하/XSV', '어/EC', '주/VX', '시/EP', '어요/EF', '조금/NNG', '은/JX', '쌀쌀하/VA', 'ㄴ/ETM', '10/SN', '월/NNB', '의/JKG', '첫주/NNG', '이/VCP', 'ㅂ니다/EF', './SF', '환절/NNG', '기/XSN', '감기/NNG', '조심/NNG', '하/XSV', '시/EP', '고/EC', '따듯하/VA', 'ㄴ/ETM', '차/NNG', '와/JC', '함께/MAG', '건강/NNG', '하/XSA', 'ㄴ/ETM', '한/MM', '주/NNB', '보내/VV', '시/EP', '기/ETN', '바라/VV', 'ㅂ니다/EF', './SF', 'XXX/SL', '은행/NNG', '모란/NNG', '역/NNG', 'XXX/SL', '올림/NNG']\n", - "\n", - "Basic_output Tokens: ['XXX/SL', '고객/NNG', '님/XSN', '항상/MAG', 'XXX/SL', '은행/NNG', '모란/NNG', '역/NNG', '지점/NNG', '을/JKO', '이용/NNG', '하/XSV', '어/EC', '주/VX', '시/EP', '는/ETM', '고객/NNG', '님/XSN', '께/JKB', '감사/NNG', '의/JKG', '마음/NNG', '을/JKO', '전하/VV', 'ㅂ니다/EF', './SF', '혹시/MAG', '업무/NNG', '와/JKB', '관련/NNG', '하/XSV', '어/EC', '궁금하/VA', 'ㄴ/ETM', '점/NNG', '이/JKS', '있/VA', '으시/EP', '면/EC', '이/MM', '번호/NNG', '로/JKB', '연락/NNG', '주/VV', '시/EP', '기/ETN', '바라/VV', 'ㅂ니다/EF', './SF', '성심껏/MAG', '돕/VV', '아/EC', '드리/VX', '겠/EP', '습니다/EF', './SF', '또/MAG', '혹시/MAG', '고객/NNG', '만족/NNG', '도/NNG', '조사/NNG', '전화/NNG', '를/JKO', '받/VV', '으시/EP', '면/EC', '매우/MAG', '동의/NNG', '하/XSV', 'ㄴ다/EF', '로/JKB', '칭찬/NNG', '하/XSV', '어/EC', '주/VX', '시/EP', '어요/EF', '조금/NNG', '은/JX', '쌀쌀하/VA', 'ㄴ/ETM', '10/SN', '월/NNB', '의/JKG', '첫주/NNG', '이/VCP', 'ㅂ니다/EF', './SF', '환절/NNG', '기/XSN', '감기/NNG', '조심/NNG', '하/XSV', '시/EP', '고/EC', '따듯하/VA', 'ㄴ/ETM', '차/NNG', '와/JC', '함께/MAG', '건강/NNG', '하/XSA', 'ㄴ/ETM', '한/MM', '주/NNB', '보내/VV', '시/EP', '기/ETN', '바라/VV', 'ㅂ니다/EF', './SF', 'XXX/SL', '은행/NNG', '모란/NNG', '역/NNG', 'XXX/SL', '올림/NNG']\n", - "\n", - "************** START GREEDY LONGEST MATCH FIRST ALGORITHM **************\n", - "\t ['XXX/SL_']\n", - "\t\t\tXXX/SL_\n", - "\t\t\tXXX/SL\n", - "\t\t\tXXX/S\n", - "\t\t\tXXX/\n", - "\t\t\tXXX\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", - "\t\t\tXX/SL_\n", - "\t\t\tXX/SL\n", - "\t\t\tXX/S\n", - "\t\t\tXX/\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", - "\t\t\tX/SL_\r", - "\t\t\tX/SL_ --|> Exists in vocab_file.\n", - "\t ['고객/NNG_']\n", - "\t\t\t고객/NNG_\r", - "\t\t\t고객/NNG_ --|> Exists in vocab_file.\n", - "\t ['님/XSN_']\n", - "\t\t\t님/XSN_\r", - "\t\t\t님/XSN_ --|> Exists in vocab_file.\n", - "\t ['항상/MAG_']\n", - "\t\t\t항상/MAG_\r", - "\t\t\t항상/MAG_ --|> Exists in vocab_file.\n", - "\t ['XXX/SL_']\n", - "\t\t\tXXX/SL_\n", - "\t\t\tXXX/SL\n", - "\t\t\tXXX/S\n", - "\t\t\tXXX/\n", - "\t\t\tXXX\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", - "\t\t\tXX/SL_\n", - "\t\t\tXX/SL\n", - "\t\t\tXX/S\n", - "\t\t\tXX/\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", - "\t\t\tX/SL_\r", - "\t\t\tX/SL_ --|> Exists in vocab_file.\n", - "\t ['은행/NNG_']\n", - "\t\t\t은행/NNG_\r", - "\t\t\t은행/NNG_ --|> Exists in vocab_file.\n", - "\t ['모란/NNG_']\n", - "\t\t\t모란/NNG_\n", - "\t\t\t모란/NNG\n", - "\t\t\t모란/NN\n", - "\t\t\t모란/N\n", - "\t\t\t모란/\n", - "\t\t\t모란\n", - "\t\t\t모라\n", - "\t\t\t모ᄅ\n", - "\t\t\t모\r", - "\t\t\t모 --|> Exists in vocab_file.\n", - "\t\t\t란/NNG_\r", - "\t\t\t란/NNG_ --|> Exists in vocab_file.\n", - "\t ['역/NNG_']\n", - "\t\t\t역/NNG_\r", - "\t\t\t역/NNG_ --|> Exists in vocab_file.\n", - "\t ['지점/NNG_']\n", - "\t\t\t지점/NNG_\r", - "\t\t\t지점/NNG_ --|> Exists in vocab_file.\n", - "\t ['을/JKO_']\n", - "\t\t\t을/JKO_\r", - "\t\t\t을/JKO_ --|> Exists in vocab_file.\n", - "\t ['이용/NNG_']\n", - "\t\t\t이용/NNG_\r", - "\t\t\t이용/NNG_ --|> Exists in vocab_file.\n", - "\t ['하/XSV_']\n", - 
"\t\t\t하/XSV_\r", - "\t\t\t하/XSV_ --|> Exists in vocab_file.\n", - "\t ['어/EC_']\n", - "\t\t\t어/EC_\r", - "\t\t\t어/EC_ --|> Exists in vocab_file.\n", - "\t ['주/VX_']\n", - "\t\t\t주/VX_\r", - "\t\t\t주/VX_ --|> Exists in vocab_file.\n", - "\t ['시/EP_']\n", - "\t\t\t시/EP_\r", - "\t\t\t시/EP_ --|> Exists in vocab_file.\n", - "\t ['는/ETM_']\n", - "\t\t\t는/ETM_\r", - "\t\t\t는/ETM_ --|> Exists in vocab_file.\n", - "\t ['고객/NNG_']\n", - "\t\t\t고객/NNG_\r", - "\t\t\t고객/NNG_ --|> Exists in vocab_file.\n", - "\t ['님/XSN_']\n", - "\t\t\t님/XSN_\r", - "\t\t\t님/XSN_ --|> Exists in vocab_file.\n", - "\t ['께/JKB_']\n", - "\t\t\t께/JKB_\r", - "\t\t\t께/JKB_ --|> Exists in vocab_file.\n", - "\t ['감사/NNG_']\n", - "\t\t\t감사/NNG_\r", - "\t\t\t감사/NNG_ --|> Exists in vocab_file.\n", - "\t ['의/JKG_']\n", - "\t\t\t의/JKG_\r", - "\t\t\t의/JKG_ --|> Exists in vocab_file.\n", - "\t ['마음/NNG_']\n", - "\t\t\t마음/NNG_\r", - "\t\t\t마음/NNG_ --|> Exists in vocab_file.\n", - "\t ['을/JKO_']\n", - "\t\t\t을/JKO_\r", - "\t\t\t을/JKO_ --|> Exists in vocab_file.\n", - "\t ['전하/VV_']\n", - "\t\t\t전하/VV_\r", - "\t\t\t전하/VV_ --|> Exists in vocab_file.\n", - "\t ['ㅂ니다/EF_']\n", - "\t\t\tㅂ니다/EF_\r", - "\t\t\tㅂ니다/EF_ --|> Exists in vocab_file.\n", - "\t ['./SF_']\n", - "\t\t\t./SF_\r", - "\t\t\t./SF_ --|> Exists in vocab_file.\n", - "\t ['혹시/MAG_']\n", - "\t\t\t혹시/MAG_\r", - "\t\t\t혹시/MAG_ --|> Exists in vocab_file.\n", - "\t ['업무/NNG_']\n", - "\t\t\t업무/NNG_\r", - "\t\t\t업무/NNG_ --|> Exists in vocab_file.\n", - "\t ['와/JKB_']\n", - "\t\t\t와/JKB_\r", - "\t\t\t와/JKB_ --|> Exists in vocab_file.\n", - "\t ['관련/NNG_']\n", - "\t\t\t관련/NNG_\r", - "\t\t\t관련/NNG_ --|> Exists in vocab_file.\n", - "\t ['하/XSV_']\n", - "\t\t\t하/XSV_\r", - "\t\t\t하/XSV_ --|> Exists in vocab_file.\n", - "\t ['어/EC_']\n", - "\t\t\t어/EC_\r", - "\t\t\t어/EC_ --|> Exists in vocab_file.\n", - "\t ['궁금하/VA_']\n", - "\t\t\t궁금하/VA_\r", - "\t\t\t궁금하/VA_ --|> Exists in vocab_file.\n", - "\t ['ㄴ/ETM_']\n", - "\t\t\tㄴ/ETM_\r", - "\t\t\tㄴ/ETM_ --|> Exists in vocab_file.\n", - "\t ['점/NNG_']\n", - "\t\t\t점/NNG_\r", - "\t\t\t점/NNG_ --|> Exists in vocab_file.\n", - "\t ['이/JKS_']\n", - "\t\t\t이/JKS_\r", - "\t\t\t이/JKS_ --|> Exists in vocab_file.\n", - "\t ['있/VA_']\n", - "\t\t\t있/VA_\r", - "\t\t\t있/VA_ --|> Exists in vocab_file.\n", - "\t ['으시/EP_']\n", - "\t\t\t으시/EP_\r", - "\t\t\t으시/EP_ --|> Exists in vocab_file.\n", - "\t ['면/EC_']\n", - "\t\t\t면/EC_\r", - "\t\t\t면/EC_ --|> Exists in vocab_file.\n", - "\t ['이/MM_']\n", - "\t\t\t이/MM_\r", - "\t\t\t이/MM_ --|> Exists in vocab_file.\n", - "\t ['번호/NNG_']\n", - "\t\t\t번호/NNG_\r", - "\t\t\t번호/NNG_ --|> Exists in vocab_file.\n", - "\t ['로/JKB_']\n", - "\t\t\t로/JKB_\r", - "\t\t\t로/JKB_ --|> Exists in vocab_file.\n", - "\t ['연락/NNG_']\n", - "\t\t\t연락/NNG_\r", - "\t\t\t연락/NNG_ --|> Exists in vocab_file.\n", - "\t ['주/VV_']\n", - "\t\t\t주/VV_\r", - "\t\t\t주/VV_ --|> Exists in vocab_file.\n", - "\t ['시/EP_']\n", - "\t\t\t시/EP_\r", - "\t\t\t시/EP_ --|> Exists in vocab_file.\n", - "\t ['기/ETN_']\n", - "\t\t\t기/ETN_\r", - "\t\t\t기/ETN_ --|> Exists in vocab_file.\n", - "\t ['바라/VV_']\n", - "\t\t\t바라/VV_\r", - "\t\t\t바라/VV_ --|> Exists in vocab_file.\n", - "\t ['ㅂ니다/EF_']\n", - "\t\t\tㅂ니다/EF_\r", - "\t\t\tㅂ니다/EF_ --|> Exists in vocab_file.\n", - "\t ['./SF_']\n", - "\t\t\t./SF_\r", - "\t\t\t./SF_ --|> Exists in vocab_file.\n", - "\t ['성심껏/MAG_']\n", - "\t\t\t성심껏/MAG_\n", - "\t\t\t성심껏/MAG\n", - "\t\t\t성심껏/MA\n", - "\t\t\t성심껏/M\n", - "\t\t\t성심껏/\n", - "\t\t\t성심껏\n", - "\t\t\t성심꺼\n", - "\t\t\t성심ᄁ\n", - "\t\t\t성심\n", - "\t\t\t성시\n", - "\t\t\t성ᄉ\n", - 
"\t\t\t성\r", - "\t\t\t성 --|> Exists in vocab_file.\n", - "\t\t\t심껏/MAG_\n", - "\t\t\t심껏/MAG\n", - "\t\t\t심껏/MA\n", - "\t\t\t심껏/M\n", - "\t\t\t심껏/\n", - "\t\t\t심껏\n", - "\t\t\t심꺼\n", - "\t\t\t심ᄁ\n", - "\t\t\t심\r", - "\t\t\t심 --|> Exists in vocab_file.\n", - "\t\t\t껏/MAG_\r", - "\t\t\t껏/MAG_ --|> Exists in vocab_file.\n", - "\t ['돕/VV_']\n", - "\t\t\t돕/VV_\r", - "\t\t\t돕/VV_ --|> Exists in vocab_file.\n", - "\t ['아/EC_']\n", - "\t\t\t아/EC_\r", - "\t\t\t아/EC_ --|> Exists in vocab_file.\n", - "\t ['드리/VX_']\n", - "\t\t\t드리/VX_\r", - "\t\t\t드리/VX_ --|> Exists in vocab_file.\n", - "\t ['겠/EP_']\n", - "\t\t\t겠/EP_\r", - "\t\t\t겠/EP_ --|> Exists in vocab_file.\n", - "\t ['습니다/EF_']\n", - "\t\t\t습니다/EF_\r", - "\t\t\t습니다/EF_ --|> Exists in vocab_file.\n", - "\t ['./SF_']\n", - "\t\t\t./SF_\r", - "\t\t\t./SF_ --|> Exists in vocab_file.\n", - "\t ['또/MAG_']\n", - "\t\t\t또/MAG_\r", - "\t\t\t또/MAG_ --|> Exists in vocab_file.\n", - "\t ['혹시/MAG_']\n", - "\t\t\t혹시/MAG_\r", - "\t\t\t혹시/MAG_ --|> Exists in vocab_file.\n", - "\t ['고객/NNG_']\n", - "\t\t\t고객/NNG_\r", - "\t\t\t고객/NNG_ --|> Exists in vocab_file.\n", - "\t ['만족/NNG_']\n", - "\t\t\t만족/NNG_\r", - "\t\t\t만족/NNG_ --|> Exists in vocab_file.\n", - "\t ['도/NNG_']\n", - "\t\t\t도/NNG_\r", - "\t\t\t도/NNG_ --|> Exists in vocab_file.\n", - "\t ['조사/NNG_']\n", - "\t\t\t조사/NNG_\r", - "\t\t\t조사/NNG_ --|> Exists in vocab_file.\n", - "\t ['전화/NNG_']\n", - "\t\t\t전화/NNG_\r", - "\t\t\t전화/NNG_ --|> Exists in vocab_file.\n", - "\t ['를/JKO_']\n", - "\t\t\t를/JKO_\r", - "\t\t\t를/JKO_ --|> Exists in vocab_file.\n", - "\t ['받/VV_']\n", - "\t\t\t받/VV_\r", - "\t\t\t받/VV_ --|> Exists in vocab_file.\n", - "\t ['으시/EP_']\n", - "\t\t\t으시/EP_\r", - "\t\t\t으시/EP_ --|> Exists in vocab_file.\n", - "\t ['면/EC_']\n", - "\t\t\t면/EC_\r", - "\t\t\t면/EC_ --|> Exists in vocab_file.\n", - "\t ['매우/MAG_']\n", - "\t\t\t매우/MAG_\r", - "\t\t\t매우/MAG_ --|> Exists in vocab_file.\n", - "\t ['동의/NNG_']\n", - "\t\t\t동의/NNG_\r", - "\t\t\t동의/NNG_ --|> Exists in vocab_file.\n", - "\t ['하/XSV_']\n", - "\t\t\t하/XSV_\r", - "\t\t\t하/XSV_ --|> Exists in vocab_file.\n", - "\t ['ㄴ다/EF_']\n", - "\t\t\tㄴ다/EF_\r", - "\t\t\tㄴ다/EF_ --|> Exists in vocab_file.\n", - "\t ['로/JKB_']\n", - "\t\t\t로/JKB_\r", - "\t\t\t로/JKB_ --|> Exists in vocab_file.\n", - "\t ['칭찬/NNG_']\n", - "\t\t\t칭찬/NNG_\r", - "\t\t\t칭찬/NNG_ --|> Exists in vocab_file.\n", - "\t ['하/XSV_']\n", - "\t\t\t하/XSV_\r", - "\t\t\t하/XSV_ --|> Exists in vocab_file.\n", - "\t ['어/EC_']\n", - "\t\t\t어/EC_\r", - "\t\t\t어/EC_ --|> Exists in vocab_file.\n", - "\t ['주/VX_']\n", - "\t\t\t주/VX_\r", - "\t\t\t주/VX_ --|> Exists in vocab_file.\n", - "\t ['시/EP_']\n", - "\t\t\t시/EP_\r", - "\t\t\t시/EP_ --|> Exists in vocab_file.\n", - "\t ['어요/EF_']\n", - "\t\t\t어요/EF_\r", - "\t\t\t어요/EF_ --|> Exists in vocab_file.\n", - "\t ['조금/NNG_']\n", - "\t\t\t조금/NNG_\r", - "\t\t\t조금/NNG_ --|> Exists in vocab_file.\n", - "\t ['은/JX_']\n", - "\t\t\t은/JX_\r", - "\t\t\t은/JX_ --|> Exists in vocab_file.\n", - "\t ['쌀쌀하/VA_']\n", - "\t\t\t쌀쌀하/VA_\n", - "\t\t\t쌀쌀하/VA\n", - "\t\t\t쌀쌀하/V\n", - "\t\t\t쌀쌀하/\n", - "\t\t\t쌀쌀하\n", - "\t\t\t쌀쌀ᄒ\n", - "\t\t\t쌀쌀\n", - "\t\t\t쌀싸\n", - "\t\t\t쌀ᄊ\n", - "\t\t\t쌀\r", - "\t\t\t쌀 --|> Exists in vocab_file.\n", - "\t\t\t쌀하/VA_\n", - "\t\t\t쌀하/VA\n", - "\t\t\t쌀하/V\n", - "\t\t\t쌀하/\n", - "\t\t\t쌀하\n", - "\t\t\t쌀ᄒ\n", - "\t\t\t쌀\r", - "\t\t\t쌀 --|> Exists in vocab_file.\n", - "\t\t\t하/VA_\r", - "\t\t\t하/VA_ --|> Exists in vocab_file.\n", - "\t ['ㄴ/ETM_']\n", - "\t\t\tㄴ/ETM_\r", - "\t\t\tㄴ/ETM_ --|> Exists in vocab_file.\n", - "\t ['10/SN_']\n", - 
"\t\t\t10/SN_\r", - "\t\t\t10/SN_ --|> Exists in vocab_file.\n", - "\t ['월/NNB_']\n", - "\t\t\t월/NNB_\r", - "\t\t\t월/NNB_ --|> Exists in vocab_file.\n", - "\t ['의/JKG_']\n", - "\t\t\t의/JKG_\r", - "\t\t\t의/JKG_ --|> Exists in vocab_file.\n", - "\t ['첫주/NNG_']\n", - "\t\t\t첫주/NNG_\n", - "\t\t\t첫주/NNG\n", - "\t\t\t첫주/NN\n", - "\t\t\t첫주/N\n", - "\t\t\t첫주/\n", - "\t\t\t첫주\n", - "\t\t\t첫ᄌ\n", - "\t\t\t첫\r", - "\t\t\t첫 --|> Exists in vocab_file.\n", - "\t\t\t주/NNG_\r", - "\t\t\t주/NNG_ --|> Exists in vocab_file.\n", - "\t ['이/VCP_']\n", - "\t\t\t이/VCP_\r", - "\t\t\t이/VCP_ --|> Exists in vocab_file.\n", - "\t ['ㅂ니다/EF_']\n", - "\t\t\tㅂ니다/EF_\r", - "\t\t\tㅂ니다/EF_ --|> Exists in vocab_file.\n", - "\t ['./SF_']\n", - "\t\t\t./SF_\r", - "\t\t\t./SF_ --|> Exists in vocab_file.\n", - "\t ['환절/NNG_']\n", - "\t\t\t환절/NNG_\n", - "\t\t\t환절/NNG\n", - "\t\t\t환절/NN\n", - "\t\t\t환절/N\n", - "\t\t\t환절/\n", - "\t\t\t환절\n", - "\t\t\t환저\n", - "\t\t\t환ᄌ\n", - "\t\t\t환\r", - "\t\t\t환 --|> Exists in vocab_file.\n", - "\t\t\t절/NNG_\r", - "\t\t\t절/NNG_ --|> Exists in vocab_file.\n", - "\t ['기/XSN_']\n", - "\t\t\t기/XSN_\r", - "\t\t\t기/XSN_ --|> Exists in vocab_file.\n", - "\t ['감기/NNG_']\n", - "\t\t\t감기/NNG_\r", - "\t\t\t감기/NNG_ --|> Exists in vocab_file.\n", - "\t ['조심/NNG_']\n", - "\t\t\t조심/NNG_\r", - "\t\t\t조심/NNG_ --|> Exists in vocab_file.\n", - "\t ['하/XSV_']\n", - "\t\t\t하/XSV_\r", - "\t\t\t하/XSV_ --|> Exists in vocab_file.\n", - "\t ['시/EP_']\n", - "\t\t\t시/EP_\r", - "\t\t\t시/EP_ --|> Exists in vocab_file.\n", - "\t ['고/EC_']\n", - "\t\t\t고/EC_\r", - "\t\t\t고/EC_ --|> Exists in vocab_file.\n", - "\t ['따듯하/VA_']\n", - "\t\t\t따듯하/VA_\n", - "\t\t\t따듯하/VA\n", - "\t\t\t따듯하/V\n", - "\t\t\t따듯하/\n", - "\t\t\t따듯하\n", - "\t\t\t따듯ᄒ\n", - "\t\t\t따듯\n", - "\t\t\t따드\n", - "\t\t\t따ᄃ\n", - "\t\t\t따\r", - "\t\t\t따 --|> Exists in vocab_file.\n", - "\t\t\t듯하/VA_\r", - "\t\t\t듯하/VA_ --|> Exists in vocab_file.\n", - "\t ['ㄴ/ETM_']\n", - "\t\t\tㄴ/ETM_\r", - "\t\t\tㄴ/ETM_ --|> Exists in vocab_file.\n", - "\t ['차/NNG_']\n", - "\t\t\t차/NNG_\r", - "\t\t\t차/NNG_ --|> Exists in vocab_file.\n", - "\t ['와/JC_']\n", - "\t\t\t와/JC_\r", - "\t\t\t와/JC_ --|> Exists in vocab_file.\n", - "\t ['함께/MAG_']\n", - "\t\t\t함께/MAG_\r", - "\t\t\t함께/MAG_ --|> Exists in vocab_file.\n", - "\t ['건강/NNG_']\n", - "\t\t\t건강/NNG_\r", - "\t\t\t건강/NNG_ --|> Exists in vocab_file.\n", - "\t ['하/XSA_']\n", - "\t\t\t하/XSA_\r", - "\t\t\t하/XSA_ --|> Exists in vocab_file.\n", - "\t ['ㄴ/ETM_']\n", - "\t\t\tㄴ/ETM_\r", - "\t\t\tㄴ/ETM_ --|> Exists in vocab_file.\n", - "\t ['한/MM_']\n", - "\t\t\t한/MM_\r", - "\t\t\t한/MM_ --|> Exists in vocab_file.\n", - "\t ['주/NNB_']\n", - "\t\t\t주/NNB_\r", - "\t\t\t주/NNB_ --|> Exists in vocab_file.\n", - "\t ['보내/VV_']\n", - "\t\t\t보내/VV_\r", - "\t\t\t보내/VV_ --|> Exists in vocab_file.\n", - "\t ['시/EP_']\n", - "\t\t\t시/EP_\r", - "\t\t\t시/EP_ --|> Exists in vocab_file.\n", - "\t ['기/ETN_']\n", - "\t\t\t기/ETN_\r", - "\t\t\t기/ETN_ --|> Exists in vocab_file.\n", - "\t ['바라/VV_']\n", - "\t\t\t바라/VV_\r", - "\t\t\t바라/VV_ --|> Exists in vocab_file.\n", - "\t ['ㅂ니다/EF_']\n", - "\t\t\tㅂ니다/EF_\r", - "\t\t\tㅂ니다/EF_ --|> Exists in vocab_file.\n", - "\t ['./SF_']\n", - "\t\t\t./SF_\r", - "\t\t\t./SF_ --|> Exists in vocab_file.\n", - "\t ['XXX/SL_']\n", - "\t\t\tXXX/SL_\n", - "\t\t\tXXX/SL\n", - "\t\t\tXXX/S\n", - "\t\t\tXXX/\n", - "\t\t\tXXX\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", - "\t\t\tXX/SL_\n", - "\t\t\tXX/SL\n", - "\t\t\tXX/S\n", - "\t\t\tXX/\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", 
- "\t\t\tX/SL_\r", - "\t\t\tX/SL_ --|> Exists in vocab_file.\n", - "\t ['은행/NNG_']\n", - "\t\t\t은행/NNG_\r", - "\t\t\t은행/NNG_ --|> Exists in vocab_file.\n", - "\t ['모란/NNG_']\n", - "\t\t\t모란/NNG_\n", - "\t\t\t모란/NNG\n", - "\t\t\t모란/NN\n", - "\t\t\t모란/N\n", - "\t\t\t모란/\n", - "\t\t\t모란\n", - "\t\t\t모라\n", - "\t\t\t모ᄅ\n", - "\t\t\t모\r", - "\t\t\t모 --|> Exists in vocab_file.\n", - "\t\t\t란/NNG_\r", - "\t\t\t란/NNG_ --|> Exists in vocab_file.\n", - "\t ['역/NNG_']\n", - "\t\t\t역/NNG_\r", - "\t\t\t역/NNG_ --|> Exists in vocab_file.\n", - "\t ['XXX/SL_']\n", - "\t\t\tXXX/SL_\n", - "\t\t\tXXX/SL\n", - "\t\t\tXXX/S\n", - "\t\t\tXXX/\n", - "\t\t\tXXX\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", - "\t\t\tXX/SL_\n", - "\t\t\tXX/SL\n", - "\t\t\tXX/S\n", - "\t\t\tXX/\n", - "\t\t\tXX\n", - "\t\t\tX\r", - "\t\t\tX --|> Exists in vocab_file.\n", - "\t\t\tX/SL_\r", - "\t\t\tX/SL_ --|> Exists in vocab_file.\n", - "\t ['올림/NNG_']\n", - "\t\t\t올림/NNG_\r", - "\t\t\t올림/NNG_ --|> Exists in vocab_file.\n", - "\n", - "Total_split_tokens : ['X', 'X', 'X/SL_', '고객/NNG_', '님/XSN_', '항상/MAG_', 'X', 'X', 'X/SL_', '은행/NNG_', '모', '란/NNG_', '역/NNG_', '지점/NNG_', '을/JKO_', '이용/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '는/ETM_', '고객/NNG_', '님/XSN_', '께/JKB_', '감사/NNG_', '의/JKG_', '마음/NNG_', '을/JKO_', '전하/VV_', 'ㅂ니다/EF_', './SF_', '혹시/MAG_', '업무/NNG_', '와/JKB_', '관련/NNG_', '하/XSV_', '어/EC_', '궁금하/VA_', 'ㄴ/ETM_', '점/NNG_', '이/JKS_', '있/VA_', '으시/EP_', '면/EC_', '이/MM_', '번호/NNG_', '로/JKB_', '연락/NNG_', '주/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', '성', '심', '껏/MAG_', '돕/VV_', '아/EC_', '드리/VX_', '겠/EP_', '습니다/EF_', './SF_', '또/MAG_', '혹시/MAG_', '고객/NNG_', '만족/NNG_', '도/NNG_', '조사/NNG_', '전화/NNG_', '를/JKO_', '받/VV_', '으시/EP_', '면/EC_', '매우/MAG_', '동의/NNG_', '하/XSV_', 'ㄴ다/EF_', '로/JKB_', '칭찬/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '어요/EF_', '조금/NNG_', '은/JX_', '쌀', '쌀', '하/VA_', 'ㄴ/ETM_', '10/SN_', '월/NNB_', '의/JKG_', '첫', '주/NNG_', '이/VCP_', 'ㅂ니다/EF_', './SF_', '환', '절/NNG_', '기/XSN_', '감기/NNG_', '조심/NNG_', '하/XSV_', '시/EP_', '고/EC_', '따', '듯하/VA_', 'ㄴ/ETM_', '차/NNG_', '와/JC_', '함께/MAG_', '건강/NNG_', '하/XSA_', 'ㄴ/ETM_', '한/MM_', '주/NNB_', '보내/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', 'X', 'X', 'X/SL_', '은행/NNG_', '모', '란/NNG_', '역/NNG_', 'X', 'X', 'X/SL_', '올림/NNG_']\n" - ] - } - ], - "source": [ - "# FullTokenizer.tokenize(); End2End Tokenizer\n", - "text = copy.copy(return_result) # text 초기화\n", - "print('************** START TOKENING MORPHLOGY **************')\n", - "\n", - "# BasicTokenizer.tokenize()\n", - "print('\\nOrigin Text: ', text)\n", - "# text = convert_to_unicode(text)\n", - "text = _clean_text(text)\n", - "print('\\nCleaned Text: ', text)\n", - "# fn: whitespace_tokenize()\n", - "orig_tokens = whitespace_tokenize(text)\n", - "print('\\nOrig. 
Tokens: ', orig_tokens, end='\\n\\n')\n", - "Basic_split_tokens = []\n", - "for token in orig_tokens:\n", - " if (do_lower_case) and (token not in never_split):\n", - "# token = token.lower()\n", - " # fn: _run_strip_accents\n", - " t = unicodedata.normalize(\"NFD\", token)\n", - " # https://gist.github.com/Pusnow/aa865fa21f9557fa58d691a8b79f8a6d\n", - " # 모든 음절을 정준 분해(Canonical Decomposition)시킴\n", - " # '각'을 'ㄱ+ㅏ+ㄱ'으로 저장(출력되는 값은 동일)\n", - " output = []\n", - " for char in t:\n", - " cat = unicodedata.category(char)\n", - " if cat == \"Mn\":\n", - " # unicode category가 \"Mark, Nonspacing\"일 경우 pass\n", - " continue\n", - " output.append(char)\n", - " token = \"\".join(output)\n", - " print('\\tstripped accent+norm(NFD) Token : '+t)\n", - " # fn: _run_split_on_punc()\n", - " if text in never_split:\n", - " token = [text]\n", - " else:\n", - " chars = list(token)\n", - " i, start_new_word = 0, True\n", - " output = []\n", - " print('\\tchars : '+str(chars))\n", - " while i < len(chars):\n", - " char = chars[i]\n", - " if _is_punctuation(char):\n", - " # 공백이면 [\" \"]를 추가하고 새로운 단어로 시작\n", - " output.append([char])\n", - " start_new_word = True\n", - " else:\n", - " # 공백이 아닐 경우,\n", - " if start_new_word:\n", - " # 새로운 문자로 시작할 경우 빈 리스트 추가.\n", - " output.append([])\n", - " # 해당 단어부터 시작하도록 start_new_word는 False로 setting.\n", - " start_new_word = False\n", - " # 위에 추가한 빈 리스트에 각각 character를 채워넣음\n", - " output[-1].append(char)\n", - " i += 1\n", - " token = [\"\".join(x) for x in output]\n", - " Basic_split_tokens.extend(token)\n", - "print('\\nBasic_split_tokens : ', Basic_split_tokens)\n", - "Basic_output_tokens = whitespace_tokenize((' '.join(Basic_split_tokens)).strip())\n", - "print('\\nBasic_output Tokens: ', Basic_output_tokens, end='\\n\\n')\n", - "\n", - "Total_split_tokens = [] # 최종 tokenize 결과 저장\n", - "print('************** START GREEDY LONGEST MATCH FIRST ALGORITHM **************')\n", - "for tokens in Basic_output_tokens:\n", - " tokens += '_' # adding '_'\n", - " # WordpieceTokenizer.tokenize()\n", - " unk_token = \"[UNK]\"\n", - " max_input_chars_per_word = 100\n", - " # greedy longest-match-first algorithm to perform tokenization\n", - " # using the given vocabulary\n", - " tokens = convert_to_unicode(tokens)\n", - " WP_output_tokens = []\n", - " # fn: whitespace_tokenize\n", - " tokens = whitespace_tokenize(tokens)\n", - " # start lmf algorithm!\n", - " print('\\t', tokens)\n", - " for token in tokens:\n", - " chars = list(token)\n", - " if len(chars) > max_input_chars_per_word: # 100\n", - " # max word로 설정한 글자 수를 넘길 경우, UNK 처리\n", - " WP_output_tokens.append(unk_token)\n", - " continue\n", - " is_bad = False\n", - " start = 0\n", - " sub_tokens = []\n", - " while start < len(chars):\n", - " end = len(chars)\n", - " cur_substr = None\n", - " # 첫번째 글자부터 천천히 vocab에 있는 단어인지 체크\n", - " while start < end:\n", - " substr = \"\".join(chars[start:end])\n", - " # do_lower_case == True일 경우에\n", - " # 위에서 Canonical Decomposition 과정을 거쳤기 때문에\n", - " # 이를 다시 Composition해줘야 vocab의 단어와 비교 가능하다.\n", - " substr = unicodedata.normalize('NFC', substr)\n", - " print('\\t\\t\\t'+substr, end='')\n", - " if substr in vocab:\n", - " # 만약 해당 단어가 vocab에 있다면 해당 단어로 break\n", - " cur_substr = substr\n", - " print('\\r\\t\\t\\t{:<15}{}'.format(\n", - " cur_substr, '--|> Exists in vocab_file.'))\n", - " break\n", - " end -= 1\n", - " print()\n", - " # 만일 못찾았을 경우, (1)로 가서 [UNK] 처리.\n", - " if cur_substr is None:\n", - " is_bad = True\n", - " break\n", - " sub_tokens.append(cur_substr)\n", - " # 어미를 추가하기 위해 
start에 end값을 할당\n", - " start = end\n", - " if is_bad: # --- (1)\n", - " WP_output_tokens.append(unk_token)\n", - " else:\n", - " # 정상적으로 끝났다면 sub_token을 결과값에 할당\n", - " WP_output_tokens.extend(sub_tokens)\n", - " for sub_token in WP_output_tokens:\n", - " Total_split_tokens.append(sub_token)\n", - "print('\\nTotal_split_tokens : ', Total_split_tokens)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "단어 사전에 있는대로 짤라버린다." - ] - }, - { - "cell_type": "code", - "execution_count": 178, - "metadata": {}, - "outputs": [], - "source": [ - "tokens_a = Total_split_tokens " - ] - }, - { - "cell_type": "code", - "execution_count": 181, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['X', 'X', 'X/SL_', '고객/NNG_', '님/XSN_', '항상/MAG_', 'X', 'X', 'X/SL_', '은행/NNG_', '모', '란/NNG_', '역/NNG_', '지점/NNG_', '을/JKO_', '이용/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '는/ETM_', '고객/NNG_', '님/XSN_', '께/JKB_', '감사/NNG_', '의/JKG_', '마음/NNG_', '을/JKO_', '전하/VV_', 'ㅂ니다/EF_', './SF_', '혹시/MAG_', '업무/NNG_', '와/JKB_', '관련/NNG_', '하/XSV_', '어/EC_', '궁금하/VA_', 'ㄴ/ETM_', '점/NNG_', '이/JKS_', '있/VA_', '으시/EP_', '면/EC_', '이/MM_', '번호/NNG_', '로/JKB_', '연락/NNG_', '주/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', '성', '심', '껏/MAG_', '돕/VV_', '아/EC_', '드리/VX_', '겠/EP_', '습니다/EF_', './SF_', '또/MAG_', '혹시/MAG_', '고객/NNG_', '만족/NNG_', '도/NNG_', '조사/NNG_', '전화/NNG_', '를/JKO_', '받/VV_', '으시/EP_', '면/EC_', '매우/MAG_', '동의/NNG_', '하/XSV_', 'ㄴ다/EF_', '로/JKB_', '칭찬/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '어요/EF_', '조금/NNG_', '은/JX_', '쌀', '쌀', '하/VA_', 'ㄴ/ETM_', '10/SN_', '월/NNB_', '의/JKG_', '첫', '주/NNG_', '이/VCP_', 'ㅂ니다/EF_', './SF_', '환', '절/NNG_', '기/XSN_', '감기/NNG_', '조심/NNG_', '하/XSV_', '시/EP_', '고/EC_', '따', '듯하/VA_', 'ㄴ/ETM_', '차/NNG_', '와/JC_', '함께/MAG_', '건강/NNG_', '하/XSA_', 'ㄴ/ETM_', '한/MM_', '주/NNB_', '보내/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', 'X', 'X', 'X/SL_', '은행/NNG_', '모', '란/NNG_', '역/NNG_', 'X', 'X', 'X/SL_', '올림/NNG_']\n", - "cutting\n", - "['X', 'X', 'X/SL_', '고객/NNG_', '님/XSN_', '항상/MAG_', 'X', 'X', 'X/SL_', '은행/NNG_', '모', '란/NNG_', '역/NNG_', '지점/NNG_', '을/JKO_', '이용/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '는/ETM_', '고객/NNG_', '님/XSN_', '께/JKB_', '감사/NNG_', '의/JKG_', '마음/NNG_', '을/JKO_', '전하/VV_', 'ㅂ니다/EF_', './SF_', '혹시/MAG_', '업무/NNG_', '와/JKB_', '관련/NNG_', '하/XSV_', '어/EC_', '궁금하/VA_', 'ㄴ/ETM_', '점/NNG_', '이/JKS_', '있/VA_', '으시/EP_', '면/EC_', '이/MM_', '번호/NNG_', '로/JKB_', '연락/NNG_', '주/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', '성', '심', '껏/MAG_', '돕/VV_', '아/EC_', '드리/VX_', '겠/EP_', '습니다/EF_', './SF_', '또/MAG_', '혹시/MAG_', '고객/NNG_', '만족/NNG_', '도/NNG_', '조사/NNG_', '전화/NNG_', '를/JKO_', '받/VV_', '으시/EP_', '면/EC_', '매우/MAG_', '동의/NNG_', '하/XSV_', 'ㄴ다/EF_', '로/JKB_', '칭찬/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '어요/EF_', '조금/NNG_', '은/JX_', '쌀', '쌀', '하/VA_', 'ㄴ/ETM_', '10/SN_', '월/NNB_', '의/JKG_', '첫', '주/NNG_', '이/VCP_', 'ㅂ니다/EF_', './SF_', '환', '절/NNG_', '기/XSN_', '감기/NNG_', '조심/NNG_', '하/XSV_', '시/EP_', '고/EC_', '따', '듯하/VA_', 'ㄴ/ETM_', '차/NNG_', '와/JC_', '함께/MAG_', '건강/NNG_', '하/XSA_', 'ㄴ/ETM_', '한/MM_', '주/NNB_', '보내/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', 'X', 'X']\n" - ] - } - ], - "source": [ - "print(tokens_a)\n", - "if len(tokens_a) > max_seq_length - 2:\n", - " print('cutting')\n", - " tokens_a = tokens_a[:max_seq_length-2]\n", - "print(tokens_a)" - ] - }, - { - "cell_type": "code", - "execution_count": 203, - "metadata": {}, - "outputs": [], - "source": [ - "tokens = 
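The long cell above re-implements the KorBERT `FullTokenizer` inline: basic cleaning, whitespace splitting and NFD accent stripping, then WordPiece's greedy longest-match-first lookup against the morpheme vocabulary (each morpheme gets a trailing `_` before matching). A compact, self-contained sketch of that second stage, assuming `vocab` is a set or dict of vocabulary entries:

```python
import unicodedata

def wordpiece_greedy(token, vocab, unk_token='[UNK]', max_chars=100):
    # Greedy longest-match-first: repeatedly take the longest prefix of the
    # remaining characters found in the vocab; if none is found, emit [UNK].
    chars = list(unicodedata.normalize('NFD', token))
    if len(chars) > max_chars:
        return [unk_token]
    sub_tokens, start = [], 0
    while start < len(chars):
        end, cur = len(chars), None
        while start < end:
            # Recompose to NFC so candidates match the precomposed vocab entries.
            substr = unicodedata.normalize('NFC', ''.join(chars[start:end]))
            if substr in vocab:
                cur = substr
                break
            end -= 1
        if cur is None:
            return [unk_token]
        sub_tokens.append(cur)
        start = end
    return sub_tokens

# e.g. wordpiece_greedy('첫주/NNG_', vocab) -> ['첫', '주/NNG_'] with the morp vocab above
```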
[]\n", - "segment_ids = []\n", - "tokens.append(\"[CLS]\")\n", - "segment_ids.append(0)\n", - "for token in tokens_a:\n", - " tokens.append(token)\n", - " segment_ids.append(0)\n", - "tokens.append(\"[SEP]\")\n", - "segment_ids.append(0)" - ] - }, - { - "cell_type": "code", - "execution_count": 204, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n" - ] - } - ], - "source": [ - "print(segment_ids)" - ] - }, - { - "cell_type": "code", - "execution_count": 206, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['[CLS]', 'X', 'X', 'X/SL_', '고객/NNG_', '님/XSN_', '항상/MAG_', 'X', 'X', 'X/SL_', '은행/NNG_', '모', '란/NNG_', '역/NNG_', '지점/NNG_', '을/JKO_', '이용/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '는/ETM_', '고객/NNG_', '님/XSN_', '께/JKB_', '감사/NNG_', '의/JKG_', '마음/NNG_', '을/JKO_', '전하/VV_', 'ㅂ니다/EF_', './SF_', '혹시/MAG_', '업무/NNG_', '와/JKB_', '관련/NNG_', '하/XSV_', '어/EC_', '궁금하/VA_', 'ㄴ/ETM_', '점/NNG_', '이/JKS_', '있/VA_', '으시/EP_', '면/EC_', '이/MM_', '번호/NNG_', '로/JKB_', '연락/NNG_', '주/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', '성', '심', '껏/MAG_', '돕/VV_', '아/EC_', '드리/VX_', '겠/EP_', '습니다/EF_', './SF_', '또/MAG_', '혹시/MAG_', '고객/NNG_', '만족/NNG_', '도/NNG_', '조사/NNG_', '전화/NNG_', '를/JKO_', '받/VV_', '으시/EP_', '면/EC_', '매우/MAG_', '동의/NNG_', '하/XSV_', 'ㄴ다/EF_', '로/JKB_', '칭찬/NNG_', '하/XSV_', '어/EC_', '주/VX_', '시/EP_', '어요/EF_', '조금/NNG_', '은/JX_', '쌀', '쌀', '하/VA_', 'ㄴ/ETM_', '10/SN_', '월/NNB_', '의/JKG_', '첫', '주/NNG_', '이/VCP_', 'ㅂ니다/EF_', './SF_', '환', '절/NNG_', '기/XSN_', '감기/NNG_', '조심/NNG_', '하/XSV_', '시/EP_', '고/EC_', '따', '듯하/VA_', 'ㄴ/ETM_', '차/NNG_', '와/JC_', '함께/MAG_', '건강/NNG_', '하/XSA_', 'ㄴ/ETM_', '한/MM_', '주/NNB_', '보내/VV_', '시/EP_', '기/ETN_', '바라/VV_', 'ㅂ니다/EF_', './SF_', 'X', 'X', '[SEP]']\n" - ] - } - ], - "source": [ - "print(tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 207, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2, 3047, 3047, 1496, 1291, 1123, 2547, 3047, 3047, 1496, 994, 315, 1692, 375, 3277, 11, 456, 9, 20, 129, 388, 22, 1291, 1123, 3353, 1308, 13, 588, 11, 276, 158, 7, 5865, 1579, 101, 266, 9, 20, 4511, 10, 187, 16, 38, 4506, 71, 80, 1883, 31, 2597, 359, 388, 49, 2019, 158, 7, 270, 855, 5181, 2544, 62, 4971, 124, 116, 7, 179, 5865, 1291, 2379, 356, 268, 823, 19, 78, 4506, 71, 1210, 2680, 9, 41, 31, 3998, 9, 20, 129, 388, 526, 4380, 21, 9212, 9212, 248, 10, 113, 60, 13, 4508, 211, 15, 158, 7, 1807, 1509, 284, 9869, 4315, 9, 388, 23, 2577, 9115, 10, 208, 56, 162, 1394, 42, 10, 92, 2227, 561, 388, 49, 2019, 158, 7, 3047, 3047, 3]\n" - ] - } - ], - "source": [ - "# convert_tokens_to_ids\n", - "def convert_by_vocab(vocab, items):\n", - " output = []\n", - " for item in items:\n", - " output.append(vocab[item])\n", - " return output\n", - "\n", - "input_ids = convert_by_vocab(vocab, tokens)\n", - "print(input_ids)" - ] - }, - { - "cell_type": "code", - "execution_count": 208, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n" - ] - } - ], - "source": [ - "# The mask has 1 for real tokens and 0 for padding tokens. Only real\n", - "# tokens are attended to.\n", - "input_mask = [1] * len(input_ids)\n", - "print(input_mask)" - ] - }, - { - "cell_type": "code", - "execution_count": 209, - "metadata": {}, - "outputs": [], - "source": [ - "# Zero-pad up to the sequence length.\n", - "while len(input_ids) < max_seq_length:\n", - " input_ids.append(0)\n", - " input_mask.append(0)\n", - " segment_ids.append(0)" - ] - }, - { - "cell_type": "code", - "execution_count": 210, - "metadata": {}, - "outputs": [], - "source": [ - "assert len(input_ids) == max_seq_length\n", - "assert len(input_mask) == max_seq_length\n", - "assert len(segment_ids) == max_seq_length" - ] - }, - { - "cell_type": "code", - "execution_count": 213, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 213, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "label_id = label_map[example.label]\n", - "label_id" - ] - }, - { - "cell_type": "code", - "execution_count": 216, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:tensorflow:*** Example ***\n", - "INFO:tensorflow:guid: train-6\n", - "INFO:tensorflow:tokens\" [CLS] X X X/SL_ 고객/NNG_ 님/XSN_ 항상/MAG_ X X X/SL_ 은행/NNG_ 모 란/NNG_ 역/NNG_ 지점/NNG_ 을/JKO_ 이용/NNG_ 하/XSV_ 어/EC_ 주/VX_ 시/EP_ 는/ETM_ 고객/NNG_ 님/XSN_ 께/JKB_ 감사/NNG_ 의/JKG_ 마음/NNG_ 을/JKO_ 전하/VV_ ㅂ니다/EF_ ./SF_ 혹시/MAG_ 업무/NNG_ 와/JKB_ 관련/NNG_ 하/XSV_ 어/EC_ 궁금하/VA_ ㄴ/ETM_ 점/NNG_ 이/JKS_ 있/VA_ 으시/EP_ 면/EC_ 이/MM_ 번호/NNG_ 로/JKB_ 연락/NNG_ 주/VV_ 시/EP_ 기/ETN_ 바라/VV_ ㅂ니다/EF_ ./SF_ 성 심 껏/MAG_ 돕/VV_ 아/EC_ 드리/VX_ 겠/EP_ 습니다/EF_ ./SF_ 또/MAG_ 혹시/MAG_ 고객/NNG_ 만족/NNG_ 도/NNG_ 조사/NNG_ 전화/NNG_ 를/JKO_ 받/VV_ 으시/EP_ 면/EC_ 매우/MAG_ 동의/NNG_ 하/XSV_ ㄴ다/EF_ 로/JKB_ 칭찬/NNG_ 하/XSV_ 어/EC_ 주/VX_ 시/EP_ 어요/EF_ 조금/NNG_ 은/JX_ 쌀 쌀 하/VA_ ㄴ/ETM_ 10/SN_ 월/NNB_ 의/JKG_ 첫 주/NNG_ 이/VCP_ ㅂ니다/EF_ ./SF_ 환 절/NNG_ 기/XSN_ 감기/NNG_ 조심/NNG_ 하/XSV_ 시/EP_ 고/EC_ 따 듯하/VA_ ㄴ/ETM_ 차/NNG_ 와/JC_ 함께/MAG_ 건강/NNG_ 하/XSA_ ㄴ/ETM_ 한/MM_ 주/NNB_ 보내/VV_ 시/EP_ 기/ETN_ 바라/VV_ ㅂ니다/EF_ ./SF_ X X [SEP]\n", - "INFO:tensorflow:input_ids: 2 3047 3047 1496 1291 1123 2547 3047 3047 1496 994 315 1692 375 3277 11 456 9 20 129 388 22 1291 1123 3353 1308 13 588 11 276 158 7 5865 1579 101 266 9 20 4511 10 187 16 38 4506 71 80 1883 31 2597 359 388 49 2019 158 7 270 855 5181 2544 62 4971 124 116 7 179 5865 1291 2379 356 268 823 19 78 4506 71 1210 2680 9 41 31 3998 9 20 129 388 526 4380 21 9212 9212 248 10 113 60 13 4508 211 15 158 7 1807 1509 284 9869 4315 9 388 23 2577 9115 10 208 56 162 1394 42 10 92 2227 561 388 49 2019 158 7 3047 3047 3\n", - "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", - "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0\n", - "INFO:tensorflow:label: 0 (id = 0)\n" - ] - } - ], - "source": [ - "tf.logging.info('*** Example ***')\n", - "tf.logging.info('guid: %s' % (example.guid))\n", - "tf.logging.info('tokens\" %s' % \" \".join(\n", - " [printable_text(x) for x in tokens]))\n", - "tf.logging.info('input_ids: %s' % \" \".join([str(x) for x in input_ids]))\n", - "tf.logging.info('input_mask: %s' % \" \".join([str(x) for x in input_mask]))\n", - "tf.logging.info('segment_ids: %s' % \" \".join([str(x) for x in segment_ids]))\n", - "tf.logging.info('label: %s (id = %d)' % (example.label, label_id))" - ] - }, - { - "cell_type": "code", - "execution_count": 217, - "metadata": {}, - "outputs": [], - "source": [ - "feature = InputFeatures(\n", - " input_ids=input_ids,\n", - " input_mask=input_mask,\n", - " segment_ids=segment_ids,\n", - " label_id=label_id,\n", - " is_real_example=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 223, - "metadata": {}, - "outputs": [], - "source": [ - "def create_int_feature(values):\n", - " f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))\n", - " return f" - ] - }, - { - "cell_type": "code", - "execution_count": 227, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "OrderedDict([('input_ids', int64_list {\n", - " value: 2\n", - " value: 3047\n", - " value: 3047\n", - " value: 1496\n", - " value: 1291\n", - " value: 1123\n", - " value: 2547\n", - " value: 3047\n", - " value: 3047\n", - " value: 1496\n", - " value: 994\n", - " value: 315\n", - " value: 1692\n", - " value: 375\n", - " value: 3277\n", - " value: 11\n", - " value: 456\n", - " value: 9\n", - " value: 20\n", - " value: 129\n", - " value: 388\n", - " value: 22\n", - " value: 1291\n", - " value: 1123\n", - " value: 3353\n", - " value: 1308\n", - " value: 13\n", - " value: 588\n", - " value: 11\n", - " value: 276\n", - " value: 158\n", - " value: 7\n", - " value: 5865\n", - " value: 1579\n", - " value: 101\n", - " value: 266\n", - " value: 9\n", - " value: 20\n", - " value: 4511\n", - " value: 10\n", - " value: 187\n", - " value: 16\n", - " value: 38\n", - " value: 4506\n", - " value: 71\n", - " value: 80\n", - " value: 1883\n", - " value: 31\n", - " value: 2597\n", - " value: 359\n", - " value: 388\n", - " value: 49\n", - " value: 2019\n", - " value: 158\n", - " value: 7\n", - " value: 270\n", - " value: 855\n", - " value: 5181\n", - " value: 2544\n", - " value: 62\n", - " value: 4971\n", - " value: 124\n", - " value: 116\n", - " value: 7\n", - " value: 179\n", - " value: 5865\n", - " value: 1291\n", - " value: 2379\n", - " value: 356\n", - " value: 268\n", - " value: 823\n", - " value: 19\n", - " value: 78\n", - " value: 4506\n", - " value: 71\n", - " value: 1210\n", - " value: 2680\n", - " value: 9\n", - " value: 41\n", - " value: 31\n", - " value: 3998\n", - " value: 9\n", - " value: 20\n", - " value: 129\n", - " value: 388\n", - " value: 526\n", - " value: 4380\n", - " value: 21\n", - " value: 9212\n", - " value: 9212\n", - " value: 248\n", - " value: 10\n", - " value: 113\n", - " value: 60\n", - " value: 13\n", - " value: 4508\n", - " value: 211\n", - " value: 15\n", - " value: 158\n", - " value: 7\n", - " value: 1807\n", - " value: 1509\n", - " value: 284\n", - " value: 9869\n", - " value: 4315\n", - " value: 9\n", - " value: 388\n", - " value: 23\n", - " value: 2577\n", - " value: 9115\n", - " value: 10\n", - " value: 208\n", - " value: 56\n", - " value: 162\n", - " value: 1394\n", - " 
value: 42\n", - " value: 10\n", - " value: 92\n", - " value: 2227\n", - " value: 561\n", - " value: 388\n", - " value: 49\n", - " value: 2019\n", - " value: 158\n", - " value: 7\n", - " value: 3047\n", - " value: 3047\n", - " value: 3\n", - "}\n", - "), ('input_mask', int64_list {\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - "}\n", - "), ('segment_ids', int64_list {\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " 
value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - "}\n", - "), ('label_ids', int64_list {\n", - " value: 0\n", - "}\n", - "), ('is_real_example', int64_list {\n", - " value: 1\n", - "}\n", - ")])\n" - ] - } - ], - "source": [ - "features = collections.OrderedDict()\n", - "\n", - "features['input_ids'] = create_int_feature(feature.input_ids)\n", - "features['input_mask'] = create_int_feature(feature.input_mask)\n", - "features['input_ids'] = create_int_feature(feature.input_ids)\n", - "features['segment_ids'] = create_int_feature(feature.segment_ids)\n", - "features['label_ids'] = create_int_feature([feature.label_id])\n", - "features['is_real_example'] = create_int_feature([int(feature.is_real_example)])\n", - "\n", - "print(features)" - ] - }, - { - "cell_type": "code", - "execution_count": 228, - "metadata": {}, - "outputs": [], - "source": [ - "tf_example = tf.train.Example(features=tf.train.Features(feature=features))" - ] - }, - { - "cell_type": "code", - "execution_count": 229, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "features {\n", - " feature {\n", - " key: \"input_ids\"\n", - " value {\n", - " int64_list {\n", - " value: 2\n", - " value: 3047\n", - " value: 3047\n", - " value: 1496\n", - " value: 1291\n", - " value: 1123\n", - " value: 2547\n", - " value: 3047\n", - " value: 3047\n", - " value: 1496\n", - " value: 994\n", - " value: 315\n", - " value: 1692\n", - " value: 375\n", - " value: 3277\n", - " value: 11\n", - " value: 456\n", - " value: 9\n", - " value: 20\n", - " value: 129\n", - " value: 388\n", - " value: 22\n", - " value: 1291\n", - " value: 1123\n", - " value: 3353\n", - " value: 1308\n", - " value: 13\n", - " value: 588\n", - " value: 11\n", - " value: 276\n", - " value: 158\n", - " value: 7\n", - " value: 5865\n", - " value: 1579\n", - " value: 101\n", - " value: 266\n", - " value: 9\n", - " value: 20\n", - " value: 4511\n", - " value: 10\n", - " value: 187\n", - " value: 16\n", - " value: 38\n", - " value: 4506\n", - " value: 71\n", - " value: 80\n", - " value: 1883\n", - " value: 31\n", - " value: 2597\n", - " value: 359\n", - " value: 388\n", - " value: 49\n", - " value: 2019\n", - " value: 158\n", - " value: 7\n", - " value: 270\n", - " value: 855\n", - " value: 5181\n", - " value: 2544\n", - " value: 62\n", - " value: 4971\n", - " value: 124\n", - " value: 116\n", - " value: 7\n", - " value: 179\n", - " value: 5865\n", - " value: 1291\n", - " 
value: 2379\n", - " value: 356\n", - " value: 268\n", - " value: 823\n", - " value: 19\n", - " value: 78\n", - " value: 4506\n", - " value: 71\n", - " value: 1210\n", - " value: 2680\n", - " value: 9\n", - " value: 41\n", - " value: 31\n", - " value: 3998\n", - " value: 9\n", - " value: 20\n", - " value: 129\n", - " value: 388\n", - " value: 526\n", - " value: 4380\n", - " value: 21\n", - " value: 9212\n", - " value: 9212\n", - " value: 248\n", - " value: 10\n", - " value: 113\n", - " value: 60\n", - " value: 13\n", - " value: 4508\n", - " value: 211\n", - " value: 15\n", - " value: 158\n", - " value: 7\n", - " value: 1807\n", - " value: 1509\n", - " value: 284\n", - " value: 9869\n", - " value: 4315\n", - " value: 9\n", - " value: 388\n", - " value: 23\n", - " value: 2577\n", - " value: 9115\n", - " value: 10\n", - " value: 208\n", - " value: 56\n", - " value: 162\n", - " value: 1394\n", - " value: 42\n", - " value: 10\n", - " value: 92\n", - " value: 2227\n", - " value: 561\n", - " value: 388\n", - " value: 49\n", - " value: 2019\n", - " value: 158\n", - " value: 7\n", - " value: 3047\n", - " value: 3047\n", - " value: 3\n", - " }\n", - " }\n", - " }\n", - " feature {\n", - " key: \"input_mask\"\n", - " value {\n", - " int64_list {\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " value: 1\n", - " }\n", - " }\n", - " }\n", - " feature {\n", - " key: \"is_real_example\"\n", - " value {\n", - " int64_list {\n", - 
" value: 1\n", - " }\n", - " }\n", - " }\n", - " feature {\n", - " key: \"label_ids\"\n", - " value {\n", - " int64_list {\n", - " value: 0\n", - " }\n", - " }\n", - " }\n", - " feature {\n", - " key: \"segment_ids\"\n", - " value {\n", - " int64_list {\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " value: 0\n", - " }\n", - " }\n", - " }\n", - "}" - ] - }, - "execution_count": 229, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tf_example" - ] - }, - { - "cell_type": "code", - "execution_count": 230, - "metadata": {}, - "outputs": [], - "source": [ - "writer.write(tf_example.SerializeToString())" - ] - }, - { - "cell_type": "code", - "execution_count": 231, - "metadata": {}, - "outputs": [], - "source": [ - "writer.close()" - ] - }, - { - "cell_type": "code", - "execution_count": 232, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'./output_dir/smishing/train.tf_record'" - ] - }, - "execution_count": 232, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_file" - ] - }, - { - "cell_type": "code", - "execution_count": 233, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:tensorflow:***** Running training *****\n", - "INFO:tensorflow: Num examples = 100\n", - "INFO:tensorflow: Batch size = 32\n", - "INFO:tensorflow: Num steps = 9\n" - ] - } - ], - "source": [ - 
"tf.logging.info(\"***** Running training *****\")\n", - "tf.logging.info(\" Num examples = %d\", len(train_examples))\n", - "tf.logging.info(\" Batch size = %d\", FLAGS.train_batch_size)\n", - "tf.logging.info(\" Num steps = %d\", num_train_steps)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Spacing안한 전체 데이터로 돌려보기!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "1. 데이터 준비" - ] - }, - { - "cell_type": "code", - "execution_count": 236, - "metadata": {}, - "outputs": [], - "source": [ - "processor = SmishingProcessor()\n", - "label_list = processor.get_labels()\n", - "\n", - "# get train samples\n", - "train_examples = processor.get_train_examples(dacon_path, 'train.tsv')\n", - "num_train_steps = int(\n", - " len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)\n", - "num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)" - ] - }, - { - "cell_type": "code", - "execution_count": 237, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:Estimator's model_fn (.model_fn at 0x00000219A36F36A8>) includes params argument, but params are not passed to Estimator.\n", - "WARNING:tensorflow:Using temporary folder as model directory: C:\\Users\\jinma\\AppData\\Local\\Temp\\tmpafvhq326\n", - "INFO:tensorflow:Using config: {'_model_dir': 'C:\\\\Users\\\\jinma\\\\AppData\\\\Local\\\\Temp\\\\tmpafvhq326', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true\n", - "graph_options {\n", - " rewrite_options {\n", - " meta_optimizer_iterations: ONE\n", - " }\n", - "}\n", - ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': , '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None, eval_training_input_configuration=2, experimental_host_call_every_n_steps=1), '_cluster': None}\n", - "INFO:tensorflow:_TPUContext: eval_on_tpu True\n", - "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n" - ] - } - ], - "source": [ - "# record ETRI model weights\n", - "FLAGS.init_checkpoint = path + 'model.ckpt'\n", - "\n", - "model_fn = model_fn_builder(\n", - " bert_config=bert_config,\n", - " num_labels=len(label_list), # 2\n", - " init_checkpoint=FLAGS.init_checkpoint, # None\n", - " learning_rate=FLAGS.learning_rate, # 5e-05\n", - " num_train_steps=num_train_steps, # 22195\n", - " num_warmup_steps=num_warmup_steps, # 2219\n", - " use_tpu=FLAGS.use_tpu, # False\n", - " use_one_hot_embeddings=FLAGS.use_tpu) # False\n", - "\n", - "# If TPU is not available, this will fall back to normal Estimator on CPU\n", - "# or GPU\n", - "estimator = tf.contrib.tpu.TPUEstimator(\n", - " use_tpu=FLAGS.use_tpu, # False\n", - " model_fn=model_fn,\n", - " config=run_config,\n", - " train_batch_size=FLAGS.train_batch_size, # 32\n", - " 
eval_batch_size=FLAGS.eval_batch_size, # 8\n", - " predict_batch_size=FLAGS.predict_batch_size # 8\n", - ")\n", - "\n", - "FLAGS.output_dir = './output_dir/smishing/'\n", - "\n", - "tf.gfile.MakeDirs(FLAGS.output_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": 240, - "metadata": {}, - "outputs": [], - "source": [ - "def file_based_convert_examples_to_features(\n", - " examples, label_lsit, max_seq_length, tokenizer, output_file):\n", - " \n", - " writer = tf.python_io.TFRecordWriter(output_file)\n", - "\n", - " for (ex_index, example) in enumerate(examples):\n", - " if ex_index % 10000 == 0:\n", - " tf.logging.info(\"Writing example %d of %d\" % (ex_index, len(examples)))\n", - "\n", - " feature = convert_single_example(ex_index, example, label_list,\n", - " max_seq_length, tokenizer)\n", - "\n", - " def create_int_feature(values):\n", - " f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))\n", - " return f\n", - "\n", - " features = collections.OrderedDict()\n", - " features[\"input_ids\"] = create_int_feature(feature.input_ids)\n", - " features[\"input_mask\"] = create_int_feature(feature.input_mask)\n", - " features[\"segment_ids\"] = create_int_feature(feature.segment_ids)\n", - " features[\"label_ids\"] = create_int_feature([feature.label_id])\n", - " features[\"is_real_example\"] = create_int_feature(\n", - " [int(feature.is_real_example)])\n", - "\n", - " tf_example = tf.train.Example(features=tf.train.Features(feature=features))\n", - " writer.write(tf_example.SerializeToString())\n", - " writer.close()" - ] - }, - { - "cell_type": "code", - "execution_count": 244, - "metadata": {}, - "outputs": [], - "source": [ - "train_file = os.path.join(FLAGS.output_dir, 'train_non_spacing.tf_record')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "file_based_convert_examples_to_features" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "basic", - "language": "python", - "name": "basic" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/src/advanced_transformers/__init__.py b/src/advanced_transformers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/activations.py b/src/advanced_transformers/activations.py new file mode 100644 index 0000000..d9ba0c6 --- /dev/null +++ b/src/advanced_transformers/activations.py @@ -0,0 +1,416 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
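+# The module below collects the activation functions used by the models in this repository:
+# several GELU variants copied from `transformers`, the integer-only IntGELU / QuantAct pair
+# adapted from I-BERT, and the string-keyed ACT2FN registry defined at the end of the file.
+# A minimal usage sketch (illustrative only; it assumes `src/` is on the import path):
+#
+#     import torch
+#     from advanced_transformers.activations import ACT2FN
+#
+#     act = ACT2FN["gelu_new"]         # registry values are module instances, not classes
+#     hidden = act(torch.randn(2, 8))  # apply the activation to a batch of hidden states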
+ + +import math + +import torch +from packaging import version +from torch import Tensor, nn + +from .quant_modules import ( + symmetric_linear_quantization_params, + SymmetricQuantFunction, + floor_ste, + FixedPointMul, +) + + +# Copied from transformers.activations.NewGELUActivation +class NewGELUActivation(nn.Module): + """ + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) + + +# Copied from transformers.activations.GELUActivation +class GELUActivation(nn.Module): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, use_gelu_python: bool = False): + super().__init__() + if version.parse(torch.__version__) < version.parse("1.4") or use_gelu_python: + self.act = self._gelu_python + else: + self.act = nn.functional.gelu + + def _gelu_python(self, input: Tensor) -> Tensor: + return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0))) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +# Copied from transformers.activations.FastGELUActivation +class FastGELUActivation(nn.Module): + """ + Applies GELU approximation that is slower than QuickGELU but more accurate. + See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + torch.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) + + +# Copied from transformers.activations.QuickGELUActivation +class QuickGELUActivation(nn.Module): + """ + Applies GELU approximation that is fast but somewhat inaccurate. + See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return input * torch.sigmoid(1.702 * input) + + +# Copied from transformers.activations.ClippedGELUActivation +class ClippedGELUActivation(nn.Module): + """ + Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purposes, as + it allows mapping negative values in the GeLU spectrum. For more information on this trick, please refer to + https://arxiv.org/abs/2004.09602. + Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when + initially created. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, min: float, max: float): + if min > max: + raise ValueError(f"min should be < max (got min: {min}, max: {max})") + + super().__init__() + self.min = min + self.max = max + + def forward(self, x: Tensor) -> Tensor: + # no module-level `gelu` helper is defined in this file, so call nn.functional.gelu directly + return torch.clip(nn.functional.gelu(x), self.min, self.max) + + +# Copied from transformers.models.bloom.modeling_bloom.bloom_gelu_forward +def bloom_gelu_forward(x): + """ + Custom bias GELU function. Adapted from Megatron-DeepSpeed code.
Here we use a simple implementation (inference) to + make the model jitable. + Args: + x (`torch.tensor`, *required*): + input hidden states + """ + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + + +# Copied from transformers.models.bloom.modeling_bloom.bloom_gelu_back +def bloom_gelu_back(g, x): + """ + gradient of tanh approximation of gelu gradient of actual gelu is: 0.5 * (1. + torch.erf(x * 0.70710678)) + + 0.3989423 * x * torch.exp(-0.5 * x * x) + Args: + g (`torch.tensor`, *required*): + gradient output tensor + x (`torch.tensor`, *required*): + input tensor + """ + x = x[0] # x is a tuple of 1 element, needs to unpack it first + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) + return ff * g + + +# Copied from transformers.models.bloom.modeling_bloom.GeLUFunction +class GeLUFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, input): + ctx.save_for_backward(input) + return bloom_gelu_forward(input) + + @staticmethod + def backward(ctx, grad_output): + input = ctx.saved_tensors + tmp = bloom_gelu_back(grad_output, input) + return tmp + + +# Copied from transformers.models.bloom.modeling_bloom.BloomGelu +class BloomGELUActivation(nn.Module): + """ + BloomBiasGelu wrapper function that makes use of the simple function on inference mode to make the model + torchscriptable and use the autograd function in training mode to get the accurate results of the gradients Partly + copied from Megatron-DeepSpeed code and adapted for our needs + See here why autograd functions are not torchscriptable: https://github.com/pytorch/pytorch/issues/22329 + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + if self.training: + return GeLUFunction.apply(x) + else: + return bloom_gelu_forward(x) + + +# Inspired by transformers.models.ibert.quant_modules.IntGELU +class IntGELU(nn.Module): + """ + Quantized version of `torch.nn.GELU`. Adds quantization-specific arguments on top of `torch.nn.GELU`. + Args: + quant_mode (`bool`, *optional*, defaults to `True`): + Whether or not the layer is quantized.
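+        Following the I-BERT recipe, `erf` is approximated by the second-order polynomial
+        sign(x) * (a * (min(|x|, -b) + b)**2 + c) with a = -0.2888, b = -1.769 and c = 1
+        (stored as c / a in `self.coeff` so it can be added before the final rescale by a),
+        and GELU(x) = x * 0.5 * (1 + erf(x / sqrt(2))) is then evaluated on the integer
+        representation together with a floating-point scaling factor (`self.k` ~ sqrt(2)).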
+ """ + + def __init__(self, quant_mode=True): + super().__init__() + self.quant_mode = quant_mode + + if not self.quant_mode: + self.activation_fn = nn.GELU() + + self.k = 1.4142 + self.const = 14 # dummy integer constant + self.coeff = [-0.2888, -1.769, 1] # a(x+b)**2 + c + self.coeff[2] /= self.coeff[0] + + def int_erf(self, x_int, scaling_factor): + b_int = torch.floor(self.coeff[1] / scaling_factor) + c_int = torch.floor(self.coeff[2] / scaling_factor**2) + sign = torch.sign(x_int) + + abs_int = torch.min(torch.abs(x_int), -b_int) + y_int = sign * ((abs_int + b_int) ** 2 + c_int) + scaling_factor = scaling_factor**2 * self.coeff[0] + + # avoid overflow + y_int = floor_ste.apply(y_int / 2**self.const) + scaling_factor = scaling_factor * 2**self.const + + return y_int, scaling_factor + + def forward(self, x, scaling_factor=None): + if not self.quant_mode: + return self.activation_fn(x), None + + x_int = x / scaling_factor + sigmoid_int, sigmoid_scaling_factor = self.int_erf(x_int, scaling_factor / self.k) + + shift_int = 1.0 // sigmoid_scaling_factor + + x_int = x_int * (sigmoid_int + shift_int) + scaling_factor = scaling_factor * sigmoid_scaling_factor / 2 + + return x_int * scaling_factor, scaling_factor + + +# Copied from transformers.models.ibert.quant_modules.QuantAct +class QuantAct(nn.Module): + """ + Quantizes the given activation. + Args: + activation_bit (`int`): + Bitwidth for the quantized activation. + act_range_momentum (`float`, *optional*, defaults to `0.95`): + Momentum for updating the activation quantization range. + per_channel (`bool`, *optional*, defaults to `False`): + Whether to or not use channel-wise quantization. + channel_len (`int`, *optional*): + Specify the channel length when set the *per_channel* True. + quant_mode (`bool`, *optional*, defaults to `False`): + Whether or not the layer is quantized. + """ + + def __init__( + self, + activation_bit, + act_range_momentum=0.95, + per_channel=False, + channel_len=None, + quant_mode=False + ): + super().__init__() + + self.activation_bit = activation_bit + self.act_range_momentum = act_range_momentum + self.quant_mode = quant_mode + self.per_channel = per_channel + self.percentile = False + self.act_function = SymmetricQuantFunction.apply + + if not self.per_channel: + self.register_buffer("x_min", torch.zeros(1)) + self.register_buffer("x_max", torch.zeros(1)) + self.register_buffer("act_scaling_factor", torch.zeros(1)) + self.x_min -= 1e-5 + self.x_max += 1e-5 + else: + raise NotImplementedError("per-channel mode is not currently supported for activation.") + + def __repr__(self): + return ( + f"{self.__class__.__name__}(activation_bit={self.activation_bit}, " + f"quant_mode: {self.quant_mode}, Act_min: {self.x_min.item():.2f}, " + f"Act_max: {self.x_max.item():.2f})" + ) + + def forward( + self, + x, + pre_act_scaling_factor=None, + identity=None, + identity_scaling_factor=None, + specified_min=None, + specified_max=None, + ): + + x_act = x if identity is None else identity + x + # collect running stats if training + if self.training: + assert not self.percentile, "percentile mode is not currently supported for activation." + assert not self.per_channel, "per-channel mode is not currently supported for activation." 
+ x_min = x_act.data.min() + x_max = x_act.data.max() + + assert ( + x_max.isnan().sum() == 0 and x_min.isnan().sum() == 0 + ), "NaN detected when computing min/max of the activation" + + # Initialization + if self.x_min.min() > -1.1e-5 and self.x_max.max() < 1.1e-5: + self.x_min = self.x_min + x_min + self.x_max = self.x_max + x_max + + # exponential moving average (EMA) + # use momentum to prevent the quantized values change greatly every iteration + elif self.act_range_momentum == -1: + self.x_min = torch.min(self.x_min, x_min) + self.x_max = torch.max(self.x_max, x_max) + else: + self.x_min = self.x_min * self.act_range_momentum + x_min * (1 - self.act_range_momentum) + self.x_max = self.x_max * self.act_range_momentum + x_max * (1 - self.act_range_momentum) + + if not self.quant_mode: + return x_act, None + + x_min = self.x_min if specified_min is None else specified_min + x_max = self.x_max if specified_max is None else specified_max + + self.act_scaling_factor = symmetric_linear_quantization_params( + self.activation_bit, x_min, x_max, per_channel=self.per_channel + ) + + if pre_act_scaling_factor is None: + # this is for the input quantization + quant_act_int = self.act_function(x, self.activation_bit, self.percentile, self.act_scaling_factor) + else: + quant_act_int = FixedPointMul.apply( + x, + pre_act_scaling_factor, + self.activation_bit, + self.act_scaling_factor, + identity, + identity_scaling_factor, + ) + + correct_output_scale = self.act_scaling_factor.view(-1) + + return quant_act_int * correct_output_scale, self.act_scaling_factor + + +# Copied from transformers.activations.SiLUActivation +class SiLUActivation(nn.Module): + """ + See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear + Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function + Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated + Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with + later. + """ + + def __init__(self): + super().__init__() + if version.parse(torch.__version__) < version.parse("1.7"): + self.act = self._silu_python + else: + self.act = nn.functional.silu + + def _silu_python(self, input: Tensor) -> Tensor: + return input * torch.sigmoid(input) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +# Copied from transformers.activations.MishActivation +class MishActivation(nn.Module): + """ + See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also + visit the official repository for the paper: https://github.com/digantamisra98/Mish + """ + + def __init__(self): + super().__init__() + if version.parse(torch.__version__) < version.parse("1.9"): + self.act = self._mish_python + else: + self.act = nn.functional.mish + + def _mish_python(self, input: Tensor) -> Tensor: + return input * torch.tanh(nn.functional.softplus(input)) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +# Copied from transformers.activations.LinearActivation +class LinearActivation(nn.Module): + """ + Applies the linear activation function, i.e. forwarding input directly to output. 
+ """ + + def forward(self, input: Tensor) -> Tensor: + return input + + +ACT2FN = { + "gelu": GELUActivation(), + "gelu_10": ClippedGELUActivation(-10, 10), + "gelu_fast": FastGELUActivation(), + "gelu_new": NewGELUActivation(), + "gelu_python": GELUActivation(use_gelu_python=True), + "linear": LinearActivation(), + "mish": MishActivation(), + "quick_gelu": QuickGELUActivation(), + "gelu_bloom": BloomGELUActivation(), + "relu": nn.ReLU(), + "sigmoid": nn.Sigmoid(), + "silu": SiLUActivation(), + "swish": SiLUActivation(), + "tanh": nn.Tanh(), + "gelu_int": IntGELU(quant_mode=False), + "gelu_int_quant": IntGELU(), + "act_8": QuantAct(activation_bit=8), + "act_8_quant": QuantAct(activation_bit=8, quant_mode=True), + "act_16": QuantAct(activation_bit=16), + "act_16_quant": QuantAct(activation_bit=16, quant_mode=True), + "act_22": QuantAct(activation_bit=22), + "act_22_quant": QuantAct(activation_bit=22, quant_mode=True), +} \ No newline at end of file diff --git a/src/advanced_transformers/processors/__init__.py b/src/advanced_transformers/processors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/quant_modules.py b/src/advanced_transformers/quant_modules.py new file mode 100644 index 0000000..041ff2e --- /dev/null +++ b/src/advanced_transformers/quant_modules.py @@ -0,0 +1,281 @@ +# coding=utf-8 +# Copyright 2021 The I-BERT Authors (Sehoon Kim, Amir Gholami, Zhewei Yao, +# Michael Mahoney, Kurt Keutzer - UC Berkeley) and The HuggingFace Inc. team. +# Copyright (c) 20121, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import decimal + +import numpy as np +import torch +from torch import nn +from torch.autograd import Function + + +# Copied from transformers.models.ibert.quant_modules.linear_quantize +def linear_quantize(input, scale, zero_point, inplace=False): + """ + Quantize single-precision input tensor to integers with the given scaling factor and zeropoint. + Args: + input (`torch.Tensor`): + Single-precision input tensor to be quantized. + scale (`torch.Tensor`): + Scaling factor for quantization. + zero_pint (`torch.Tensor`): + Shift for quantization. + inplace (`bool`, *optional*, defaults to `False`): + Whether to compute inplace or not. + Returns: + `torch.Tensor`: Linearly quantized value of *input* according to *scale* and *zero_point*. 
+ """ + # reshape scale and zeropoint for convolutional weights and activation + if len(input.shape) == 4: + scale = scale.view(-1, 1, 1, 1) + zero_point = zero_point.view(-1, 1, 1, 1) + # reshape scale and zeropoint for linear weights + elif len(input.shape) == 2: + scale = scale.view(-1, 1) + zero_point = zero_point.view(-1, 1) + else: + scale = scale.view(-1) + zero_point = zero_point.view(-1) + # quantized = float / scale + zero_point + if inplace: + input.mul_(1.0 / scale).add_(zero_point).round_() + return input + return torch.round(1.0 / scale * input + zero_point) + + +# Copied from transformers.models.ibert.quant_modules.symmetric_linear_quantization_params +def symmetric_linear_quantization_params(num_bits, saturation_min, saturation_max, per_channel=False): + """ + Compute the scaling factor with the given quantization range for symmetric quantization. + Args: + saturation_min (`torch.Tensor`): + Lower bound for quantization range. + saturation_max (`torch.Tensor`): + Upper bound for quantization range. + per_channel (`bool`, *optional*, defaults to `False`): + Whether to or not use channel-wise quantization. + Returns: + `torch.Tensor`: Scaling factor that linearly quantizes the given range between *saturation_min* and + *saturation_max*. + """ + # in this part, we do not need any gradient computation, + # in order to enforce this, we put torch.no_grad() + with torch.no_grad(): + n = 2 ** (num_bits - 1) - 1 + + if per_channel: + scale, _ = torch.max(torch.stack([saturation_min.abs(), saturation_max.abs()], dim=1), dim=1) + scale = torch.clamp(scale, min=1e-8) / n + + else: + scale = max(saturation_min.abs(), saturation_max.abs()) + scale = torch.clamp(scale, min=1e-8) / n + + return scale + + +# Copied from transformers.models.ibert.quant_modules.SymmetricQuantFunction +class SymmetricQuantFunction(Function): + """ + Class to quantize the given floating-point values using symmetric quantization with given range and bitwidth. + """ + + @staticmethod + def forward(ctx, x, k, percentile_mode, scale): + """ + Args: + x (`torch.Tensor`): + Floating point tensor to be quantized. + k (`int`): + Quantization bitwidth. + percentile_mode (`bool`): + Whether or not to use percentile calibration. + scale (`torch.Tensor`): + Pre-calculated scaling factor for *x*. Note that the current implementation of SymmetricQuantFunction + requires pre-calculated scaling factor. + Returns: + `torch.Tensor`: Symmetric-quantized value of *input*. 
+ """ + zero_point = torch.tensor(0.0).to(scale.device) + + n = 2 ** (k - 1) - 1 + new_quant_x = linear_quantize(x, scale, zero_point, inplace=False) + new_quant_x = torch.clamp(new_quant_x, -n, n - 1) + + ctx.scale = scale + return new_quant_x + + @staticmethod + def backward(ctx, grad_output): + + scale = ctx.scale + if len(grad_output.shape) == 4: + scale = scale.view(-1, 1, 1, 1) + # reshape scale and zeropoint for linear weights + elif len(grad_output.shape) == 2: + scale = scale.view(-1, 1) + else: + scale = scale.view(-1) + + return grad_output.clone() / scale, None, None, None, None + + +# Copied from transformers.models.ibert.quant_modules.floor_ste +class floor_ste(Function): + """ + Straight-through Estimator(STE) for torch.floor() + """ + + @staticmethod + def forward(ctx, x): + return torch.floor(x) + + @staticmethod + def backward(ctx, grad_output): + return grad_output.clone() + + +# Copied from transformers.models.ibert.quant_modules.round_ste +class round_ste(Function): + """ + Straight-through Estimator(STE) for torch.round() + """ + + @staticmethod + def forward(ctx, x): + return torch.round(x) + + @staticmethod + def backward(ctx, grad_output): + return grad_output.clone() + + +# Copied from transformers.models.ibert.quant_modules.batch_frexp +def batch_frexp(inputs, max_bit=31): + """ + Decompose the scaling factor into mantissa and twos exponent. + Args: + scaling_factor (`torch.Tensor`): + Target scaling factor to decompose. + Returns: + ``Tuple(torch.Tensor, torch.Tensor)`: mantisa and exponent + """ + + shape_of_input = inputs.size() + + # trans the input to be a 1-d tensor + inputs = inputs.view(-1) + + output_m, output_e = np.frexp(inputs.cpu().numpy()) + tmp_m = [] + for m in output_m: + int_m_shifted = int( + decimal.Decimal(m * (2**max_bit)).quantize(decimal.Decimal("1"), rounding=decimal.ROUND_HALF_UP) + ) + tmp_m.append(int_m_shifted) + output_m = np.array(tmp_m) + + output_e = float(max_bit) - output_e + + return ( + torch.from_numpy(output_m).to(inputs.device).view(shape_of_input), + torch.from_numpy(output_e).to(inputs.device).view(shape_of_input), + ) + + +# Copied from transformers.models.ibert.quant_modules.FixedPointMul +class FixedPointMul(Function): + """ + Function to perform fixed-point arithmetic that can match integer arithmetic on hardware. + Args: + pre_act (`torch.Tensor`): + Input tensor. + pre_act_scaling_factor (`torch.Tensor`): + Scaling factor of the input tensor *pre_act*. + bit_num (`int`): + Quantization bitwidth. + z_scaling_factor (`torch.Tensor`): + Scaling factor of the output tensor. + identity (`torch.Tensor`, *optional*): + Identity tensor, if exists. + identity_scaling_factor (`torch.Tensor`, *optional*): + Scaling factor of the identity tensor *identity*, if exists. + Returns: + `torch.Tensor`: Output tensor(*pre_act* if *identity* is not given, otherwise the addition of *pre_act* and + *identity*), whose scale is rescaled to *z_scaling_factor*. 
+ """ + + @staticmethod + def forward( + ctx, + pre_act, + pre_act_scaling_factor, + bit_num, + z_scaling_factor, + identity=None, + identity_scaling_factor=None, + ): + + if len(pre_act_scaling_factor.shape) == 3: + reshape = lambda x: x # noqa: E731 + else: + reshape = lambda x: x.view(1, 1, -1) # noqa: E731 + ctx.identity = identity + + n = 2 ** (bit_num - 1) - 1 + + with torch.no_grad(): + pre_act_scaling_factor = reshape(pre_act_scaling_factor) + if identity is not None: + identity_scaling_factor = reshape(identity_scaling_factor) + + ctx.z_scaling_factor = z_scaling_factor + + z_int = torch.round(pre_act / pre_act_scaling_factor) + _A = pre_act_scaling_factor.type(torch.double) + _B = (z_scaling_factor.type(torch.float)).type(torch.double) + new_scale = _A / _B + new_scale = reshape(new_scale) + + m, e = batch_frexp(new_scale) + + output = z_int.type(torch.double) * m.type(torch.double) + output = torch.round(output / (2.0**e)) + + if identity is not None: + # needs addition of identity activation + wx_int = torch.round(identity / identity_scaling_factor) + + _A = identity_scaling_factor.type(torch.double) + _B = (z_scaling_factor.type(torch.float)).type(torch.double) + new_scale = _A / _B + new_scale = reshape(new_scale) + + m1, e1 = batch_frexp(new_scale) + output1 = wx_int.type(torch.double) * m1.type(torch.double) + output1 = torch.round(output1 / (2.0**e1)) + + output = output1 + output + + return torch.clamp(output.type(torch.float), -n - 1, n) + + @staticmethod + def backward(ctx, grad_output): + identity_grad = None + if ctx.identity is not None: + identity_grad = grad_output.clone() / ctx.z_scaling_factor + return grad_output.clone() / ctx.z_scaling_factor, None, None, None, None, identity_grad, None \ No newline at end of file diff --git a/src/advanced_transformers/retrievers/__init__.py b/src/advanced_transformers/retrievers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/__init__.py b/src/advanced_transformers/transformers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/__init__.py b/src/advanced_transformers/transformers/components/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/attentions/__init__.py b/src/advanced_transformers/transformers/components/attentions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/feed_forward_networks/__init__.py b/src/advanced_transformers/transformers/components/feed_forward_networks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/heads/__init__.py b/src/advanced_transformers/transformers/components/heads/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/layer_norms/__init__.py b/src/advanced_transformers/transformers/components/layer_norms/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/positional_embeddings/__init__.py b/src/advanced_transformers/transformers/components/positional_embeddings/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/positional_embeddings/attention_with_linear_biases.py 
b/src/advanced_transformers/transformers/components/positional_embeddings/attention_with_linear_biases.py new file mode 100644 index 0000000..5da5abe --- /dev/null +++ b/src/advanced_transformers/transformers/components/positional_embeddings/attention_with_linear_biases.py @@ -0,0 +1,60 @@ +# coding=utf-8 +# Copyright (c) Facebook, Inc. and its affiliates. + +import math +from typing import List + +import torch + + +def get_slopes(n: int) -> List[int]: + def get_slopes_power_of_2(n: int): + start = 2 ** (-(2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio ** i for i in range(n)] + + # In the paper, we only train models that have 2^a heads for some a. + # This function has some good properties that only occur when the input is a power of 2. + # To maintain that even when the number of heads is not a power of 2, we use this workaround. + if math.log2(n).is_integer(): + return get_slopes_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return ( + get_slopes_power_of_2(closest_power_of_2) + + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2] + ) + + +if __name__ == "__main__": + # __init__ + bsz = 32 + seq_len = 17 + max_tokens = 512 + maxpos = 512 # tokens_per_sample + attn_heads = 16 # decoder_attention_heads + slopes = torch.Tensor(get_slopes(attn_heads)) + # In the next line, the part after the * is what constructs the diagonal matrix + # (right matrix in Figure 3 in the paper). + # If you run it you'll see that it doesn't exactly print out the same matrix as we have in Figure 3, + # but one where all rows are identical. + # This works because the softmax operation is invariant to translation, + # and our bias functions are always linear. + m = slopes.unsqueeze(1).unsqueeze(1) # head-specific slope fixed + positions = ( + torch.arange(maxpos).unsqueeze(0).unsqueeze(0).expand(attn_heads, -1, -1) + ) + alibi = m * positions # non-learned bias + alibi = alibi.view(attn_heads, 1, maxpos) + alibi = alibi.repeat(max_tokens // maxpos, 1, 1) # batch_size, 1, 1 + # extract_features_scriptable + # we move the mask construction `before layer operation` because its slightly more efficient + # self_attn_mask = self.buffered_future_mask(x) + + def fill_with_neg_inf(t): + """FP16-compatible function that fills a tensor with -inf.""" + return t.float().fill_(float("-inf")).type_as(t) + + _future_mask = torch.triu(fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1) + _future_mask = _future_mask + alibi + _future_mask = _future_mask[: bsz * attn_heads, :seq_len, :seq_len] diff --git a/src/advanced_transformers/transformers/components/positional_embeddings/axial_positional_embedding.py b/src/advanced_transformers/transformers/components/positional_embeddings/axial_positional_embedding.py new file mode 100644 index 0000000..946d385 --- /dev/null +++ b/src/advanced_transformers/transformers/components/positional_embeddings/axial_positional_embedding.py @@ -0,0 +1,79 @@ +# coding=utf-8 +# Copyright @lucidrains +# ref. 
https://github.com/lucidrains/axial-positional-embedding + +import torch +from torch import nn +from operator import mul +from functools import reduce + + +class AxialPositionalEmbedding(nn.Module): + def __init__(self, dim, axial_shape, axial_dims = None): + super().__init__() + + self.dim = dim + self.shape = axial_shape + self.max_seq_len = reduce(mul, axial_shape, 1) + + self.summed = axial_dims is None + axial_dims = ((dim,) * len(axial_shape)) if self.summed else axial_dims + + assert len(self.shape) == len(axial_dims), 'number of axial dimensions must equal the number of dimensions in the shape' + assert self.summed or not self.summed and sum(axial_dims) == dim, f'axial dimensions must sum up to the target dimension {dim}' + + self.weights = ParameterList(self, 'weights', len(axial_shape)) + + for ind, (shape, axial_dim) in enumerate(zip(self.shape, axial_dims)): + ax_shape = [1] * len(self.shape) + ax_shape[ind] = shape + ax_shape = (1, *ax_shape, axial_dim) + ax_emb = nn.Parameter(torch.zeros(ax_shape).normal_(0, 1)) + self.weights.append(ax_emb) + + def forward(self, x): + b, t, e = x.shape + assert (t <= self.max_seq_len), f'Sequence length ({t}) must be less than the maximum sequence length allowed ({self.max_seq_len})' + embs = [] + + for ax_emb in self.weights.to_list(): + axial_dim = ax_emb.shape[-1] + expand_shape = (b, *self.shape, axial_dim) + emb = ax_emb.expand(expand_shape).reshape(b, self.max_seq_len, axial_dim) + embs.append(emb) + + pos_emb = sum(embs) if self.summed else torch.cat(embs, dim=-1) + return pos_emb[:, :t].to(x) + +# a mock parameter list object until below issue is resolved +# https://github.com/pytorch/pytorch/issues/36035 +class ParameterList(object): + def __init__(self, kls, prefix, length): + self.ind = 0 + self.kls = kls + self.prefix = prefix + self.length = length + + def _keyname(self, prefix, ind): + return f'{prefix}_{ind}' + + def append(self, x): + setattr(self.kls, self._keyname(self.prefix, self.ind), x) + self.ind += 1 + + def to_list(self): + return [getattr(self.kls, self._keyname(self.prefix, i)) for i in range(self.length)] + +# Axial Positional Embedding for Images + +class AxialPositionalEmbeddingImage(nn.Module): + def __init__(self, dim, axial_shape, axial_dims = None): + super().__init__() + assert len(axial_shape) == 2, 'Axial shape must have 2 dimensions for images' + self.pos_emb = AxialPositionalEmbedding(dim, axial_shape, axial_dims) + + def forward(self, img): + b, c, h, w = img.shape + img = img.permute(0, 2, 3, 1).reshape(b, h * w, c) + pos_emb = self.pos_emb(img) + return pos_emb.reshape(b, h, w, c).permute(0, 3, 1, 2) diff --git a/src/advanced_transformers/transformers/components/positional_embeddings/relative_position_embedding.py b/src/advanced_transformers/transformers/components/positional_embeddings/relative_position_embedding.py new file mode 100644 index 0000000..e69de29 diff --git a/src/advanced_transformers/transformers/components/positional_embeddings/rotary_embedding.py b/src/advanced_transformers/transformers/components/positional_embeddings/rotary_embedding.py new file mode 100644 index 0000000..537fe54 --- /dev/null +++ b/src/advanced_transformers/transformers/components/positional_embeddings/rotary_embedding.py @@ -0,0 +1,62 @@ +# coding=utf-8 +# Copyright @lucidrains +# ref. 
https://github.com/lucidrains/rotary-embedding-torch + +from inspect import isfunction + +import torch +from torch import nn, einsum +from einops import rearrange, repeat + + +class RotaryEmbedding(nn.Module): + def __init__(self, theta: int, dim: int, learned_freq: bool): + super().__init__() + self.theta = theta + self.dim = dim + self.learned_freq = learned_freq + # inverse frequencies: 1 / theta ** (2i / dim) + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) + self.cache = dict() + + if learned_freq: + self.freqs = nn.Parameter(freqs) + else: + self.register_buffer("freqs", freqs) + + def forward(self, t, cache_key=None): + if cache_key is not None and cache_key in self.cache: + return self.cache[cache_key] + + if isfunction(t): + t = t() + + freqs = self.freqs + + freqs = einsum("..., f -> ... f", t.type(freqs.dtype), freqs) + freqs = repeat(freqs, "... n -> ... (n r)", r=2) + + if cache_key is not None: + self.cache[cache_key] = freqs + + return freqs + + @staticmethod + def apply_rotary_emb(freqs, t, start_index=0): + rot_dim = freqs.shape[-1] + end_index = start_index + rot_dim + assert rot_dim <= t.shape[-1], ( + f"feature dimension {t.shape[-1]} is not of sufficient " + f"size to rotate in all the positions {rot_dim}" + ) + t_left = t[..., :start_index] + t = t[..., start_index:end_index] + t_right = t[..., end_index:] + + def rotary_half(x): + x = rearrange(x, "... (d r) -> ... d r", r=2) + x1, x2 = x.unbind(dim=-1) + x = torch.stack((-x2, x1), dim=-1) + return rearrange(x, "... d r -> ... (d r)") + + t = (t * freqs.cos()) + (rotary_half(t) * freqs.sin()) + return torch.cat((t_left, t, t_right), dim=-1) diff --git a/src/advanced_transformers/transformers/components/positional_embeddings/sinusoidal_positional_embedding.py b/src/advanced_transformers/transformers/components/positional_embeddings/sinusoidal_positional_embedding.py new file mode 100644 index 0000000..a33ea2e --- /dev/null +++ b/src/advanced_transformers/transformers/components/positional_embeddings/sinusoidal_positional_embedding.py @@ -0,0 +1,94 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. + +import math +from typing import Any, Optional + +import torch +from torch import Tensor, nn + + +class SinusoidalPositionalEmbedding(nn.Embedding): + """ + This module produces sinusoidal positional embeddings of any length. + We don't want to save the weight of this embedding since it's not trained (deterministic) + and it can be huge. Padding symbols are ignored. + These embeddings get automatically extended in forward if more positions are needed. + """ + + def __init__(self, num_positions, embedding_dim, padding_idx): + self.make_weight(num_positions, embedding_dim, padding_idx) + + def make_weight(self, num_positions, embedding_dim, padding_idx): + weight = self.get_embedding(num_positions, embedding_dim, padding_idx) + if not hasattr(self, "weight"): + # in __init__ + super().__init__(num_positions, embedding_dim, padding_idx, _weight=weight) + else: + # in forward put the weights on the correct dtype and device of the param + weight = weight.to(dtype=self.weight.dtype, device=self.weight.device) + self.weight = nn.Parameter(weight) + self.weight.detach_() + self.weight.requires_grad = False + + @staticmethod + def get_embedding(num_embeddings, embedding_dim, padding_idx): + """ + Build sinusoidal embeddings. + This matches the implementation in tensor2tensor, + but differs slightly from the description in Section 3.5 of + "Attention Is All You Need". 
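+ Concretely, as implemented below: with half_dim = embedding_dim // 2 and inv_freq_i = 10000 ** (-i / (half_dim - 1)), + position pos is encoded as sin(pos * inv_freq_i) in the first half_dim channels and cos(pos * inv_freq_i) + in the last half_dim channels (the sin and cos blocks are concatenated rather than interleaved, a zero column + is appended when embedding_dim is odd, and the row at padding_idx is zeroed out).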
+ """ + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) + emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze( + 1 + ) * emb.unsqueeze(0) + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view( + num_embeddings, -1 + ) + if embedding_dim % 2 == 1: + # zero pad + emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) + if padding_idx is not None: + emb[padding_idx, :] = 0 + return emb + + @staticmethod + def make_positions(tensor, padding_idx: int): + """ + Replace non-padding symbols with their position numbers. + Position numbers begin at padding_idx+1. Padding symbols are ignored. + """ + # The series of casts and type-conversions here are carefully + # balanced to both work with ONNX export and XLA. In particular XLA + # prefers ints, cumsum defaults to output longs, and ONNX doesn't know + # how to handle the dtype kwarg in cumsum. + mask = tensor.ne(padding_idx).int() + return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx + + def forward( + self, + input, + incremental_state: Optional[Any] = None, + timestep: Optional[Tensor] = None, + ): + """Input is expected to be of size [bsz x seqlen].""" + bsz, seq_len = input.shape[:2] + max_pos = self.padding_idx + 1 + seq_len + + if max_pos > self.weight.size(0): + # expand embeddings if needed + self.make_weight(max_pos, self.embedding_dim, self.padding_idx) + + if incremental_state is not None: + # positions is the same for every token when decoding a single step + pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len + return self.weight[self.padding_idx + pos, :].expand(bsz, 1, -1) + + positions = self.make_positions(input, self.padding_idx) + # `super().forward` is + # (self.weight.index_select(0, positions.view(-1)) + # .view(bsz, seq_len, -1).detach()) + return super().forward(positions) diff --git a/src/advanced_transformers/transformers/components/residual_connections/__init__.py b/src/advanced_transformers/transformers/components/residual_connections/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tokenization.py b/tokenization.py deleted file mode 100644 index 2e75e3a..0000000 --- a/tokenization.py +++ /dev/null @@ -1,312 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# -# 형태소분석 기반 BERT를 위한 Tokenization Class -# 수정: joonho.lim -# 일자: 2019-05-23 -# -# 주석 및 새롭게 코드 수정 -# 작성자: MyungHoon Jin - -import collections -import re -import unicodedata -import six -import tensorflow as tf - -def convert_to_unicode(text): - # Python version이 3.x일 때, - # type(text)이 `bytes`일 경우, utf-8로 변환 - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - # Python version이 2.x일 때, - # type(text)이 `str`일 경우, utf-8로 변환 - elif six.PY2: - if isinstance(text, str): - return text.decode("utf-8", "ignore") - elif isinstance(text, unicode): - return text - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - # Python 3.x, 2.x만 허용! - else: - raise ValueError("Not running on Python2 or Python 3?") - -def printable_text(text): - if six.PY3: - if isinstance(text, str): - return text - elif isinstance(text, bytes): - return text.decode("utf-8", "ignore") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - elif six.PY2: - if isinstance(text, str): - return text - elif isinstance(text, unicode): - return text.encode("utf-8") - else: - raise ValueError("Unsupported string type: %s" % (type(text))) - else: - raise ValueError("Not running on Python2 or Python 3?") - -class BERTTokenizer: - """End 2 End Tokenizing NLU Embedding!""" - # from_pretrained method는 향후 추가! - def __init__(self, vocab_file, do_lower_case=False, max_len=None): - # ETRI에서 제공한 vocab file을 읽어오고 - # 역 방향의 사전을 정의한다. - self.vocab = self._load_vocab(vocab_file) - self.inv_vocab = {v: k for k, v in self.vocab.items()} - # End to End Tokenizer를 구축하기 위해 아래 두 Tokenizer를 할당한다. - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - self.max_len = max_len if max_len is not None else int(1e12) - - def tokenize(self, text): - split_tokens = [] - # End to End Tokenizing. - for token in self.basic_tokenizer.tokenize(text): - # ETRI Vocab 양식에 맞게 token 끝에 '_'를 붙여준다. - token += '_' - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - return split_tokens - - def convert_tokens_to_ids(self, tokens): - ids = _convert_by_vocab(self.vocab, tokens) - if len(ids) > self.max_len: - raise ValueError( - "Token indices sequence length is longer than the specified maximum " - " sequence length for this BERT model ({} > {}). Running this" - " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)) - return ids - - def convert_ids_to_tokens(self, ids): - return _convert_by_vocab(self.inv_vocab, ids) - - @staticmethod - def _load_vocab(vocab_file): - # 단어 사전을 저장할 OrderedDict 객체 생성 - vocab = collections.OrderedDict() - index = 0 - with tf.io.gfile.GFile(vocab_file, 'r') as reader: - while True: - # Binary Text를 unicode(utf-8)로 decode. 
- token = convert_to_unicode(reader.readline()) - if not token: break - if ((token.find('n_iters=') == 0) or - (token.find('max_length=') == 0)): - continue - token = token.split('\t')[0] - token = token.strip() - # 토큰과 해당 index를 기록 - vocab[token] = index - index += 1 - return vocab - - @staticmethod - def _convert_by_vocab(vocab, items): - """Converts a sequence of [tokens|ids] using the vocab.""" - output = [] - for item in items: - output.append(vocab[item]) - return output - -class BasicTokenizer: - - def __init__(self, do_lower_case=True): - self.do_lower_case = do_lower_case - - def tokenize(self, text): - text = convert_to_unicode(text) - text = self._clean_text(text) - - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case: - # 현재 input으로 '고객/NNG'와 같이 Part-of-speech가 이미 - # tagging되어있고 vocab은 '고객/NNG_'로 단어를 기록하고 있음. - # 여기서 `lower` 메서드를 사용하면 뒤의 tagging이 소문자로 - # 변환되어 값의 비교를 못하게 되므로 이를 주석처리. - - # token.lower() - - # 모든 음절을 정준 분해시키는 함수 - token = self._run_strip_accents(token) - # whitespacing이랑 다를게 무엇인지? - split_tokens.extend(self._run_split_on_punc(token)) - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, token): - """Strips accents from a piece of text.""" - token = unicodedata.normalize("NFD", token) - # https://gist.github.com/Pusnow/aa865fa21f9557fa58d691a8b79f8a6d - # 모든 음절을 정준 분해(Canonical Decomposition)시킴 - # '각'을 'ㄱ+ㅏ+ㄱ'으로 저장(출력되는 값은 동일) - output = [] - for char in token: - cat = unicodedata.category(char) - if cat == "Mn": - # unicode category가 "Mark, Nonspacing"일 경우 pass - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, token): - """Splits punctuation on a piece of text.""" - chars = list(token) - i, start_new_word = 0, True - output = [] - while i < len(chars): - char = chars[i] - if self._is_punctuation(char): - # 공백이면 [" "]을 추가하고 새로운 단어로 시작 - output.append([char]) - start_new_word = True - else: - # 공백이 아닐 경우, - if start_new_word: - # 새로운 단어로 시작할 경우에 빈 리스트 추가 - output.append([]) - # 해당 문자부터 시작하도록 start_new_word는 False로 setting. - start_new_word = False - # 위에 추가한 빈 리스트에 각각 character를 채워넣음 - output[-1].append(char) - i += 1 - return ["".join(x) for x in output] - - - def _clean_text(self, text): - output = [] # char을 저장할 list 생성 - for char in text: - # 텍스트에서 Char 단위로 출력 - cp = ord(char) - if cp == 0 or cp == 0xfffd or self._is_control(char): - # \x00이거나 �이거나 unicode cat.이 C로 시작할 경우 - # (개행문자 제외) output에 추가하지 않는다. 
- continue - if self._is_whitespace(char): - # 공백일 경우 " "으로 output에 추가 - output.append(" ") - else: - # 이 외의 경우 전부 output에 추가 - output.append(char) - # cleaning 작업을 거친 Text를 후처리하여 반환 - return "".join(output) - - # char 단위 함수들 - @staticmethod - def _is_whitespace(char): - if char == " " or char == '\t' or char == '\n' or char == '\r': - # 개행문자이거나 띄어쓰기면 True 반환 - return True - cat = unicodedata.category(char) - if cat == 'Zs': - # unicode category가 Space Seperator면 True 반환 - # https://www.compart.com/en/unicode/category/Zs - return True - # 이 외의 경우 전부 False 반환 - return False - - @staticmethod - def _is_control(char): - if char == "\t" or char == "\n" or char == "\r": - # 개행문자이면 False 반환 - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - # unicode category가 - # Cc(Control) - # Cf(format) - # Co(Private Use, is 0) - # Cs(Surrrogate, is 0)일 경우, True 반환 - # https://en.wikipedia.org/wiki/Control_character - return True - # 이 외의 경우 전부 False 반환 - return False - - @staticmethod - def _is_punctuation(char): - # 한국어 형태소 분석기이기 때문에 공백과 같은지 여부만 반환 - return char == ' ' - -class WordpieceTokenizer: - - def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """ - This uses a greedy longest-match-first algorithm to perform - tokenization using the given vocabulary. - """ - text = convert_to_unicode(text) - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - # max word로 설정한 글자 수를 넘길 경우, UNK 처리 - output_tokens.append(self.unk_token) - continue - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - # 첫번째 글자부터 천천히 vocab에 있는 단어인지 체크 - while start < end: - substr = "".join(chars[start:end]) - # do_lower_case == True일 경우에 - # 위에서 Canonical Decomposition 과정을 거쳤기 때문에 - # 이를 다시 Composition해줘야 vocab의 단어와 비교 가능하다. - substr = unicodedata.normalize("NFC", substr) - if substr in self.vocab: - # 만일 해당 단어가 vocab에 있다면 해당 단어로 break - cur_substr = substr - break - end -= 1 - # 만일 어떠한 단어랑도 매칭되지 않았다면, (1)로 가서 [UNK] 처리 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - # 어미, 혹은 다른 사전에 있는 단어를 찾기위해 start에 end값을 할당 - start = end - if is_bad: # --- (1) - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - -# text 단위 공백 처리 -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() # 양 사이드의 공백을 제거 - if not text: # 어떠한 값도 없을 시, 빈 list를 반환 - return [] - tokens = text.split() # 공백 단위로 쪼갠 list를 반환 - return tokens diff --git a/torch_bert/README.md b/torch_bert/README.md deleted file mode 100644 index 663bfe5..0000000 --- a/torch_bert/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# ETRI Pytorch version BERT code - -#### 20.04.20 (월) -- `huggingface.tokenizers`는 `Rust`로 작성 -- 때문에 Etri에서 제공한 Wordpiece Tokenizer는 직접 구현한 것으로 추정됨 -- 아니면 rust code를 python으로 포팅했거나 -- 혹은 tensorflow에서 사용한 version의 코드이거나 -- 아니네 이미 있네! fast version이냐 python이냐 차인가? 살펴보자 -- 추가적인 코드 작성할 필요 있음!! 없는 token! 
-- GeLU 구현 중 ERF 함수 - $\mathrm{erf}(x) = \frac{2}{\sqrt{\pi}} \int_{0}^{x} e^{-t^2} dt$ diff --git a/torch_bert/__init__.py b/torch_bert/__init__.py deleted file mode 100644 index 57825ab..0000000 --- a/torch_bert/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__all__ = ['tokenization_bert', 'configuration_bert', 'tokenization_utils'] diff --git a/torch_bert/__pycache__/file_utils.cpython-36.pyc b/torch_bert/__pycache__/file_utils.cpython-36.pyc deleted file mode 100644 index 4c30c04..0000000 Binary files a/torch_bert/__pycache__/file_utils.cpython-36.pyc and /dev/null differ diff --git a/torch_bert/__pycache__/tokenization_bert.cpython-36.pyc b/torch_bert/__pycache__/tokenization_bert.cpython-36.pyc deleted file mode 100644 index 7ea8396..0000000 Binary files a/torch_bert/__pycache__/tokenization_bert.cpython-36.pyc and /dev/null differ diff --git a/torch_bert/__pycache__/tokenization_utils.cpython-36.pyc b/torch_bert/__pycache__/tokenization_utils.cpython-36.pyc deleted file mode 100644 index 7fc7993..0000000 Binary files a/torch_bert/__pycache__/tokenization_utils.cpython-36.pyc and /dev/null differ diff --git a/torch_bert/activations.py b/torch_bert/activations.py deleted file mode 100644 index b95fc05..0000000 --- a/torch_bert/activations.py +++ /dev/null @@ -1,64 +0,0 @@ -# https://subinium.github.io/introduction-to-activation/ - -import logging -import math - -import torch -import torch.nn.functional as F - -logger = logging.getLogger(__name__) - - -def swish(x): - """https://arxiv.org/pdf/1710.05941v1.pdf""" - return x * torch.sigmoid(x) - - -def _gelu_python(x): - """ Original Implementation of the gelu activation function in Google Bert repo when initially created. - For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): - 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - This is now written in C in torch.nn.functional - Also see https://arxiv.org/abs/1606.08415 - """ - # torch.erf(input, out=None) -> Tensor - # Computes the error function of each element. - return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) - - -def gelu_new(x): - """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). - Also see https://arxiv.org/abs/1606.08415 - """ - return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - - -if torch.__version__ < "1.4.0": - gelu = _gelu_python -else: - gelu = F.gelu # 얘가 제일 빠름! - try: - import torch_xla - - logger.warning( - "The torch_xla package was detected in the python environment. PyTorch/XLA and JIT is untested," - " no activation function will be traced with JIT." - ) - except ImportError: - gelu_new = torch.jit.script(gelu_new) - -ACT2FN = { - 'relu': F.relu, - 'swish': swish, - 'gelu': gelu, - 'tanh': torch.tanh, - 'gelu_new': gelu_new -} - - -def get_activation(activation_string): - activation = ACT2FN.get(activation_string, None) - if activation is None: - raise KeyError(f"function {activation_string} not found " - "in ACT2FN mapping {list(ACT2FN.keys())}") - return activation diff --git a/torch_bert/configuration_bert.py b/torch_bert/configuration_bert.py deleted file mode 100644 index 51e89a8..0000000 --- a/torch_bert/configuration_bert.py +++ /dev/null @@ -1,518 +0,0 @@ - -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" BERT model configuration """ - -import copy -import json -import logging -import os -from typing import Dict, Optional, Tuple - -from file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_remote_url - -logger = logging.getLogger(__name__) - -BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", - "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", - "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", - "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", - "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", - "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", - "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", - "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", - "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", - "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", - "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", - "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", - "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", - "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", - "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", - "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json", - "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json", - "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json", - "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json", - "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json", - 
"bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json", - "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json", -} - -class PretrainedConfig(object): - r""" Base class for all configuration classes. - Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. - Note: - A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights. - It only affects the model's configuration. - Class attributes (overridden by derived classes): - - ``pretrained_config_archive_map``: a python ``dict`` with `shortcut names` (string) as keys and `url` (string) of associated pretrained model configurations as values. - - ``model_type``: a string that identifies the model type, that we serialize into the JSON file, and that we use to recreate the correct object in :class:`~transformers.AutoConfig`. - Args: - finetuning_task (:obj:`string` or :obj:`None`, `optional`, defaults to :obj:`None`): - Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint. - num_labels (:obj:`int`, `optional`, defaults to `2`): - Number of classes to use when the model is a classification model (sequences/tokens) - output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`): - Should the model returns attentions weights. - output_hidden_states (:obj:`string`, `optional`, defaults to :obj:`False`): - Should the model returns all hidden-states. - torchscript (:obj:`bool`, `optional`, defaults to :obj:`False`): - Is the model used with Torchscript (for PyTorch models). 
- """ - pretrained_config_archive_map = {} # type: Dict[str, str] - model_type = "" # type: str - - def __init__(self, **kwargs): - # Attributes with defaults - self.output_attentions = kwargs.pop("output_attentions", False) - self.output_hidden_states = kwargs.pop("output_hidden_states", False) - self.use_cache = kwargs.pop("use_cache", True) # Not used by all models - self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models - self.use_bfloat16 = kwargs.pop("use_bfloat16", False) - self.pruned_heads = kwargs.pop("pruned_heads", {}) - - # Is decoder is used in encoder-decoder models to differentiate encoder from decoder - self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False) - self.is_decoder = kwargs.pop("is_decoder", False) - - # Parameters for sequence generation - self.max_length = kwargs.pop("max_length", 20) - self.min_length = kwargs.pop("min_length", 0) - self.do_sample = kwargs.pop("do_sample", False) - self.early_stopping = kwargs.pop("early_stopping", False) - self.num_beams = kwargs.pop("num_beams", 1) - self.temperature = kwargs.pop("temperature", 1.0) - self.top_k = kwargs.pop("top_k", 50) - self.top_p = kwargs.pop("top_p", 1.0) - self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) - self.length_penalty = kwargs.pop("length_penalty", 1.0) - self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0) - self.bad_words_ids = kwargs.pop("bad_words_ids", None) - self.num_return_sequences = kwargs.pop("num_return_sequences", 1) - - # Fine-tuning task arguments - self.architectures = kwargs.pop("architectures", None) - self.finetuning_task = kwargs.pop("finetuning_task", None) - self.num_labels = kwargs.pop("num_labels", 2) - self.id2label = kwargs.pop("id2label", {i: "LABEL_{}".format(i) for i in range(self.num_labels)}) - self.id2label = dict((int(key), value) for key, value in self.id2label.items()) - self.label2id = kwargs.pop("label2id", dict(zip(self.id2label.values(), self.id2label.keys()))) - self.label2id = dict((key, int(value)) for key, value in self.label2id.items()) - - # Tokenizer arguments TODO: eventually tokenizer and models should share the same config - self.prefix = kwargs.pop("prefix", None) - self.bos_token_id = kwargs.pop("bos_token_id", None) - self.pad_token_id = kwargs.pop("pad_token_id", None) - self.eos_token_id = kwargs.pop("eos_token_id", None) - self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None) - - # task specific arguments - self.task_specific_params = kwargs.pop("task_specific_params", None) - - # TPU arguments - self.xla_device = kwargs.pop("xla_device", None) - - # Additional attributes without default values - for key, value in kwargs.items(): - try: - setattr(self, key, value) - except AttributeError as err: - logger.error("Can't set {} with value {} for {}".format(key, value, self)) - raise err - - @property - def num_labels(self): - return self._num_labels - - @num_labels.setter - def num_labels(self, num_labels): - self._num_labels = num_labels - self.id2label = {i: "LABEL_{}".format(i) for i in range(self.num_labels)} - self.id2label = dict((int(key), value) for key, value in self.id2label.items()) - self.label2id = dict(zip(self.id2label.values(), self.id2label.keys())) - self.label2id = dict((key, int(value)) for key, value in self.label2id.items()) - - def save_pretrained(self, save_directory): - """ - Save a configuration object to the directory `save_directory`, so that it - can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class 
method. - Args: - save_directory (:obj:`string`): - Directory where the configuration JSON file will be saved. - """ - assert os.path.isdir( - save_directory - ), "Saving path should be a directory where the model and configuration can be saved" - - # If we save using the predefined names, we can load using `from_pretrained` - output_config_file = os.path.join(save_directory, CONFIG_NAME) - - self.to_json_file(output_config_file, use_diff=True) - logger.info("Configuration saved in {}".format(output_config_file)) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig": - r""" - Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration. - Args: - pretrained_model_name_or_path (:obj:`string`): - either: - - a string with the `shortcut name` of a pre-trained model configuration to load from cache or - download, e.g.: ``bert-base-uncased``. - - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to - our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing a configuration file saved using the - :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - - a path or url to a saved configuration JSON `file`, e.g.: - ``./my_model_directory/configuration.json``. - cache_dir (:obj:`string`, `optional`): - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - kwargs (:obj:`Dict[str, any]`, `optional`): - The values in kwargs of any keys which are configuration attributes will be used to override the loaded - values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is - controlled by the `return_unused_kwargs` keyword parameter. - force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): - Force to (re-)download the model weights and configuration files and override the cached versions if they exist. - resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): - Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. - proxies (:obj:`Dict`, `optional`): - A dictionary of proxy servers to use by protocol or endpoint, e.g.: - :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` - The proxies are used on each request. - return_unused_kwargs: (`optional`) bool: - If False, then this function returns just the final configuration object. - If True, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs` is a - dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part - of kwargs which has not been used to update `config` and is otherwise ignored. - Returns: - :class:`PretrainedConfig`: An instance of a configuration object - Examples:: - # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a - # derived class: BertConfig - config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. - config = BertConfig.from_pretrained('./test/saved_model/') # E.g. 
config (or model) was saved using `save_pretrained('./test/saved_model/')` - config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') - config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) - assert config.output_attention == True - config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, - foo=False, return_unused_kwargs=True) - assert config.output_attention == True - assert unused_kwargs == {'foo': False} - """ - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - return cls.from_dict(config_dict, **kwargs) - - @classmethod - def get_config_dict( - cls, pretrained_model_name_or_path: str, pretrained_config_archive_map: Optional[Dict] = None, **kwargs - ) -> Tuple[Dict, Dict]: - """ - From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used - for instantiating a Config using `from_dict`. - Parameters: - pretrained_model_name_or_path (:obj:`string`): - The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. - pretrained_config_archive_map: (:obj:`Dict[str, str]`, `optional`) Dict: - A map of `shortcut names` to `url`. By default, will use the current class attribute. - Returns: - :obj:`Tuple[Dict, Dict]`: The dictionary that will be used to instantiate the configuration object. - """ - cache_dir = kwargs.pop("cache_dir", None) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", False) - - if pretrained_config_archive_map is None: - pretrained_config_archive_map = cls.pretrained_config_archive_map - - if pretrained_model_name_or_path in pretrained_config_archive_map: - config_file = pretrained_config_archive_map[pretrained_model_name_or_path] - elif os.path.isdir(pretrained_model_name_or_path): - config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) - elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): - config_file = pretrained_model_name_or_path - else: - config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME) - - try: - # Load from URL or cache if already cached - resolved_config_file = cached_path( - config_file, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - ) - # Load config dict - if resolved_config_file is None: - raise EnvironmentError - config_dict = cls._dict_from_json_file(resolved_config_file) - - except EnvironmentError: - if pretrained_model_name_or_path in pretrained_config_archive_map: - msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format( - config_file - ) - else: - msg = ( - "Can't load '{}'. Make sure that:\n\n" - "- '{}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" - "- or '{}' is the correct path to a directory containing a '{}' file\n\n".format( - pretrained_model_name_or_path, - pretrained_model_name_or_path, - pretrained_model_name_or_path, - CONFIG_NAME, - ) - ) - raise EnvironmentError(msg) - - except json.JSONDecodeError: - msg = ( - "Couldn't reach server at '{}' to download configuration file or " - "configuration file is not a valid JSON file. 
" - "Please check network or file content here: {}.".format(config_file, resolved_config_file) - ) - raise EnvironmentError(msg) - - if resolved_config_file == config_file: - logger.info("loading configuration file {}".format(config_file)) - else: - logger.info("loading configuration file {} from cache at {}".format(config_file, resolved_config_file)) - - return config_dict, kwargs - - @classmethod - def from_dict(cls, config_dict: Dict, **kwargs) -> "PretrainedConfig": - """ - Constructs a `Config` from a Python dictionary of parameters. - Args: - config_dict (:obj:`Dict[str, any]`): - Dictionary that will be used to instantiate the configuration object. Such a dictionary can be retrieved - from a pre-trained checkpoint by leveraging the :func:`~transformers.PretrainedConfig.get_config_dict` - method. - kwargs (:obj:`Dict[str, any]`): - Additional parameters from which to initialize the configuration object. - Returns: - :class:`PretrainedConfig`: An instance of a configuration object - """ - return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) - - config = cls(**config_dict) - - if hasattr(config, "pruned_heads"): - config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) - - # Update config with kwargs if needed - to_remove = [] - for key, value in kwargs.items(): - if hasattr(config, key): - setattr(config, key, value) - to_remove.append(key) - for key in to_remove: - kwargs.pop(key, None) - - logger.info("Model config %s", str(config)) - if return_unused_kwargs: - return config, kwargs - else: - return config - - @classmethod - def from_json_file(cls, json_file: str) -> "PretrainedConfig": - """ - Constructs a `Config` from the path to a json file of parameters. - Args: - json_file (:obj:`string`): - Path to the JSON file containing the parameters. - Returns: - :class:`PretrainedConfig`: An instance of a configuration object - """ - config_dict = cls._dict_from_json_file(json_file) - return cls(**config_dict) - - @classmethod - def _dict_from_json_file(cls, json_file: str): - with open(json_file, "r", encoding="utf-8") as reader: - text = reader.read() - return json.loads(text) - - def __eq__(self, other): - return self.__dict__ == other.__dict__ - - def __repr__(self): - return "{} {}".format(self.__class__.__name__, self.to_json_string()) - - def to_diff_dict(self): - """ - Removes all attributes from config which correspond to the default - config attributes for better readability and serializes to a Python - dictionary. - Returns: - :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, - """ - config_dict = self.to_dict() - - # get the default config dict - default_config_dict = PretrainedConfig().to_dict() - - serializable_config_dict = {} - - # only serialize values that differ from the default config - for key, value in config_dict.items(): - if key not in default_config_dict or value != default_config_dict[key]: - serializable_config_dict[key] = value - - return serializable_config_dict - - def to_dict(self): - """ - Serializes this instance to a Python dictionary. - Returns: - :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, - """ - output = copy.deepcopy(self.__dict__) - if hasattr(self.__class__, "model_type"): - output["model_type"] = self.__class__.model_type - return output - - def to_json_string(self, use_diff=True): - """ - Serializes this instance to a JSON string. 
- Args: - use_diff (:obj:`bool`): - If set to True, only the difference between the config instance and the default PretrainedConfig() is serialized to JSON string. - Returns: - :obj:`string`: String containing all the attributes that make up this configuration instance in JSON format. - """ - if use_diff is True: - config_dict = self.to_diff_dict() - else: - config_dict = self.to_dict() - return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" - - def to_json_file(self, json_file_path, use_diff=True): - """ - Save this instance to a json file. - Args: - json_file_path (:obj:`string`): - Path to the JSON file in which this configuration instance's parameters will be saved. - use_diff (:obj:`bool`): - If set to True, only the difference between the config instance and the default PretrainedConfig() is serialized to JSON file. - """ - with open(json_file_path, "w", encoding="utf-8") as writer: - writer.write(self.to_json_string(use_diff=use_diff)) - - def update(self, config_dict: Dict): - """ - Updates attributes of this class - with attributes from `config_dict`. - Args: - :obj:`Dict[str, any]`: Dictionary of attributes that shall be updated for this class. - """ - for key, value in config_dict.items(): - setattr(self, key, value) - - -class BertConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a :class:`~transformers.BertModel`. - It is used to instantiate an BERT model according to the specified arguments, defining the model - architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of - the BERT `bert-base-uncased `__ architecture. - Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used - to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` - for more information. - Args: - vocab_size (:obj:`int`, optional, defaults to 30522): - Vocabulary size of the BERT model. Defines the different tokens that - can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`. - hidden_size (:obj:`int`, optional, defaults to 768): - Dimensionality of the encoder layers and the pooler layer. - num_hidden_layers (:obj:`int`, optional, defaults to 12): - Number of hidden layers in the Transformer encoder. - num_attention_heads (:obj:`int`, optional, defaults to 12): - Number of attention heads for each attention layer in the Transformer encoder. - intermediate_size (:obj:`int`, optional, defaults to 3072): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): - The non-linear activation function (function or string) in the encoder and pooler. - If string, "gelu", "relu", "swish" and "gelu_new" are supported. - hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1): - The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): - The dropout ratio for the attention probabilities. - max_position_embeddings (:obj:`int`, optional, defaults to 512): - The maximum sequence length that this model might ever be used with. - Typically set this to something large just in case (e.g., 512 or 1024 or 2048). - type_vocab_size (:obj:`int`, optional, defaults to 2): - The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`. 
- initializer_range (:obj:`float`, optional, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): - The epsilon used by the layer normalization layers. - Example:: - from transformers import BertModel, BertConfig - # Initializing a BERT bert-base-uncased style configuration - configuration = BertConfig() - # Initializing a model from the bert-base-uncased style configuration - model = BertModel(configuration) - # Accessing the model configuration - configuration = model.config - Attributes: - pretrained_config_archive_map (Dict[str, str]): - A dictionary containing all the available pre-trained checkpoints. - """ - pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP - model_type = "bert" - - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12, - pad_token_id=0, - **kwargs - ): - super().__init__(pad_token_id=pad_token_id, **kwargs) - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - - -if __name__ == '__main__': - bergconfig = BertConfig() diff --git a/torch_bert/file_utils.py b/torch_bert/file_utils.py deleted file mode 100644 index 159e81a..0000000 --- a/torch_bert/file_utils.py +++ /dev/null @@ -1,496 +0,0 @@ -""" -Utilities for working with the local dataset cache. -This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp -Copyright by the AllenNLP authors. -""" - -import fnmatch -import json -import logging -import os -import shutil -import sys -import tarfile -import tempfile -from contextlib import contextmanager -from functools import partial, wraps -from hashlib import sha256 -from typing import Optional -from urllib.parse import urlparse -from zipfile import ZipFile, is_zipfile - -import boto3 -import requests -from botocore.config import Config -from botocore.exceptions import ClientError -from filelock import FileLock -from tqdm.auto import tqdm - -# from . 
import __version__ - - -logger = logging.getLogger(__name__) # pylint: disable=invalid-name - -try: - USE_TF = os.environ.get("USE_TF", "AUTO").upper() - USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() - if USE_TORCH in ("1", "ON", "YES", "AUTO") and USE_TF not in ("1", "ON", "YES"): - import torch - - _torch_available = True # pylint: disable=invalid-name - logger.info("PyTorch version {} available.".format(torch.__version__)) - else: - logger.info("Disabling PyTorch because USE_TF is set") - _torch_available = False -except ImportError: - _torch_available = False # pylint: disable=invalid-name - -# TensorFlow 안써용 -# try: -# USE_TF = os.environ.get("USE_TF", "AUTO").upper() -# USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() -# -# if USE_TF in ("1", "ON", "YES", "AUTO") and USE_TORCH not in ("1", "ON", "YES"): -# import tensorflow as tf -# -# assert hasattr(tf, "__version__") and int(tf.__version__[0]) >= 2 -# _tf_available = True # pylint: disable=invalid-name -# logger.info("TensorFlow version {} available.".format(tf.__version__)) -# else: -# logger.info("Disabling Tensorflow because USE_TORCH is set") -# _tf_available = False -# except (ImportError, AssertionError): -# _tf_available = False # pylint: disable=invalid-name - -try: - from torch.hub import _get_torch_home - - torch_cache_home = _get_torch_home() -except ImportError: - torch_cache_home = os.path.expanduser( - os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch")) - ) -default_cache_path = os.path.join(torch_cache_home, "transformers") - -try: - from pathlib import Path - - PYTORCH_PRETRAINED_BERT_CACHE = Path( - os.getenv("PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)) - ) -except (AttributeError, ImportError): - PYTORCH_PRETRAINED_BERT_CACHE = os.getenv( - "PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path) - ) - -PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility -TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility - -WEIGHTS_NAME = "pytorch_model.bin" -TF2_WEIGHTS_NAME = "tf_model.h5" -TF_WEIGHTS_NAME = "model.ckpt" -CONFIG_NAME = "config.json" -MODEL_CARD_NAME = "modelcard.json" - - -MULTIPLE_CHOICE_DUMMY_INPUTS = [[[0], [1]], [[0], [1]]] -DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] -DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] - -S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert" -CLOUDFRONT_DISTRIB_PREFIX = "https://d2ws9o8vfrpkyk.cloudfront.net" - - -def is_torch_available(): - return _torch_available - - -def is_tf_available(): - return _tf_available - - -def add_start_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") - return fn - - return docstring_decorator - - -def add_start_docstrings_to_callable(*docstr): - def docstring_decorator(fn): - class_name = ":class:`~transformers.{}`".format(fn.__qualname__.split(".")[0]) - intro = " The {} forward method, overrides the :func:`__call__` special method.".format(class_name) - note = r""" - .. note:: - Although the recipe for forward pass needs to be defined within - this function, one should call the :class:`Module` instance afterwards - instead of this since the former takes care of running the - pre and post processing steps while the latter silently ignores them. 
- """ - fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") - return fn - - return docstring_decorator - - -def add_end_docstrings(*docstr): - def docstring_decorator(fn): - fn.__doc__ = fn.__doc__ + "".join(docstr) - return fn - - return docstring_decorator - - -def is_remote_url(url_or_filename): - parsed = urlparse(url_or_filename) - return parsed.scheme in ("http", "https", "s3") - - -def hf_bucket_url(identifier, postfix=None, cdn=False) -> str: - endpoint = CLOUDFRONT_DISTRIB_PREFIX if cdn else S3_BUCKET_PREFIX - if postfix is None: - return "/".join((endpoint, identifier)) - else: - return "/".join((endpoint, identifier, postfix)) - - -def url_to_filename(url, etag=None): - """ - Convert `url` into a hashed filename in a repeatable way. - If `etag` is specified, append its hash to the url's, delimited - by a period. - If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name - so that TF 2.0 can identify it as a HDF5 file - (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) - """ - url_bytes = url.encode("utf-8") - url_hash = sha256(url_bytes) - filename = url_hash.hexdigest() - - if etag: - etag_bytes = etag.encode("utf-8") - etag_hash = sha256(etag_bytes) - filename += "." + etag_hash.hexdigest() - - if url.endswith(".h5"): - filename += ".h5" - - return filename - - -def filename_to_url(filename, cache_dir=None): - """ - Return the url and etag (which may be ``None``) stored for `filename`. - Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. - """ - if cache_dir is None: - cache_dir = TRANSFORMERS_CACHE - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - cache_path = os.path.join(cache_dir, filename) - if not os.path.exists(cache_path): - raise EnvironmentError("file {} not found".format(cache_path)) - - meta_path = cache_path + ".json" - if not os.path.exists(meta_path): - raise EnvironmentError("file {} not found".format(meta_path)) - - with open(meta_path, encoding="utf-8") as meta_file: - metadata = json.load(meta_file) - url = metadata["url"] - etag = metadata["etag"] - - return url, etag - - -def cached_path( - url_or_filename, - cache_dir=None, - force_download=False, - proxies=None, - resume_download=False, - user_agent=None, - extract_compressed_file=False, - force_extract=False, - local_files_only=False, -) -> Optional[str]: - """ - Given something that might be a URL (or might be a local path), - determine which. If it's a URL, download the file and cache it, and - return the path to the cached file. If it's already a local path, - make sure the file exists and then return the path. - Args: - cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). - force_download: if True, re-dowload the file even if it's already cached in the cache dir. - resume_download: if True, resume the download if incompletly recieved file is found. - user_agent: Optional string or dict that will be appended to the user-agent on remote requests. - extract_compressed_file: if True and the path point to a zip or tar file, extract the compressed - file in a folder along the archive. - force_extract: if True when extract_compressed_file is True and the archive was already extracted, - re-extract the archive and overide the folder where it was extracted. - Return: - None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). 
- Local path (string) otherwise - """ - if cache_dir is None: - cache_dir = TRANSFORMERS_CACHE - if isinstance(url_or_filename, Path): - url_or_filename = str(url_or_filename) - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - if is_remote_url(url_or_filename): - # URL, so get it from the cache (downloading if necessary) - output_path = get_from_cache( - url_or_filename, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - user_agent=user_agent, - local_files_only=local_files_only, - ) - elif os.path.exists(url_or_filename): - # File, and it exists. - output_path = url_or_filename - elif urlparse(url_or_filename).scheme == "": - # File, but it doesn't exist. - raise EnvironmentError("file {} not found".format(url_or_filename)) - else: - # Something unknown - raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) - - if extract_compressed_file: - if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path): - return output_path - - # Path where we extract compressed archives - # We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/" - output_dir, output_file = os.path.split(output_path) - output_extract_dir_name = output_file.replace(".", "-") + "-extracted" - output_path_extracted = os.path.join(output_dir, output_extract_dir_name) - - if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract: - return output_path_extracted - - # Prevent parallel extractions - lock_path = output_path + ".lock" - with FileLock(lock_path): - shutil.rmtree(output_path_extracted, ignore_errors=True) - os.makedirs(output_path_extracted) - if is_zipfile(output_path): - with ZipFile(output_path, "r") as zip_file: - zip_file.extractall(output_path_extracted) - zip_file.close() - elif tarfile.is_tarfile(output_path): - tar_file = tarfile.open(output_path) - tar_file.extractall(output_path_extracted) - tar_file.close() - else: - raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) - - return output_path_extracted - - return output_path - - -def split_s3_path(url): - """Split a full s3 path into the bucket name and path.""" - parsed = urlparse(url) - if not parsed.netloc or not parsed.path: - raise ValueError("bad s3 path {}".format(url)) - bucket_name = parsed.netloc - s3_path = parsed.path - # Remove '/' at beginning of path. - if s3_path.startswith("/"): - s3_path = s3_path[1:] - return bucket_name, s3_path - - -def s3_request(func): - """ - Wrapper function for s3 requests in order to create more helpful error - messages. 
- """ - - @wraps(func) - def wrapper(url, *args, **kwargs): - try: - return func(url, *args, **kwargs) - except ClientError as exc: - if int(exc.response["Error"]["Code"]) == 404: - raise EnvironmentError("file {} not found".format(url)) - else: - raise - - return wrapper - - -@s3_request -def s3_etag(url, proxies=None): - """Check ETag on S3 object.""" - s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) - bucket_name, s3_path = split_s3_path(url) - s3_object = s3_resource.Object(bucket_name, s3_path) - return s3_object.e_tag - - -@s3_request -def s3_get(url, temp_file, proxies=None): - """Pull a file directly from S3.""" - s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) - bucket_name, s3_path = split_s3_path(url) - s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) - - -def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None): - ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0]) - if is_torch_available(): - ua += "; torch/{}".format(torch.__version__) - if is_tf_available(): - ua += "; tensorflow/{}".format(tf.__version__) - if isinstance(user_agent, dict): - ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items()) - elif isinstance(user_agent, str): - ua += "; " + user_agent - headers = {"user-agent": ua} - if resume_size > 0: - headers["Range"] = "bytes=%d-" % (resume_size,) - response = requests.get(url, stream=True, proxies=proxies, headers=headers) - if response.status_code == 416: # Range not satisfiable - return - content_length = response.headers.get("Content-Length") - total = resume_size + int(content_length) if content_length is not None else None - progress = tqdm( - unit="B", - unit_scale=True, - total=total, - initial=resume_size, - desc="Downloading", - disable=bool(logger.getEffectiveLevel() == logging.NOTSET), - ) - for chunk in response.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - progress.update(len(chunk)) - temp_file.write(chunk) - progress.close() - - -def get_from_cache( - url, - cache_dir=None, - force_download=False, - proxies=None, - etag_timeout=10, - resume_download=False, - user_agent=None, - local_files_only=False, -) -> Optional[str]: - """ - Given a URL, look for the corresponding file in the local cache. - If it's not there, download it. Then return the path to the cached file. - Return: - None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). - Local path (string) otherwise - """ - if cache_dir is None: - cache_dir = TRANSFORMERS_CACHE - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - - os.makedirs(cache_dir, exist_ok=True) - - etag = None - if not local_files_only: - # Get eTag to add to filename, if it exists. - if url.startswith("s3://"): - etag = s3_etag(url, proxies=proxies) - else: - try: - response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout) - if response.status_code == 200: - etag = response.headers.get("ETag") - except (EnvironmentError, requests.exceptions.Timeout): - # etag is already None - pass - - filename = url_to_filename(url, etag) - - # get cache path to put the file - cache_path = os.path.join(cache_dir, filename) - - # etag is None = we don't have a connection, or url doesn't exist, or is otherwise inaccessible. 
- # try to get the last downloaded one - if etag is None: - if os.path.exists(cache_path): - return cache_path - else: - matching_files = [ - file - for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*") - if not file.endswith(".json") and not file.endswith(".lock") - ] - if len(matching_files) > 0: - return os.path.join(cache_dir, matching_files[-1]) - else: - # If files cannot be found and local_files_only=True, - # the models might've been found if local_files_only=False - # Notify the user about that - if local_files_only: - raise ValueError( - "Cannot find the requested files in the cached path and outgoing traffic has been" - " disabled. To enable model look-ups and downloads online, set 'local_files_only'" - " to False." - ) - return None - - # From now on, etag is not None. - if os.path.exists(cache_path) and not force_download: - return cache_path - - # Prevent parallel downloads of the same file with a lock. - lock_path = cache_path + ".lock" - with FileLock(lock_path): - - if resume_download: - incomplete_path = cache_path + ".incomplete" - - @contextmanager - def _resumable_file_manager(): - with open(incomplete_path, "a+b") as f: - yield f - - temp_file_manager = _resumable_file_manager - if os.path.exists(incomplete_path): - resume_size = os.stat(incomplete_path).st_size - else: - resume_size = 0 - else: - temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False) - resume_size = 0 - - # Download to temporary file, then copy to cache dir once finished. - # Otherwise you get corrupt cache entries if the download gets interrupted. - with temp_file_manager() as temp_file: - logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) - - # GET file object - if url.startswith("s3://"): - if resume_download: - logger.warn('Warning: resumable downloads are not implemented for "s3://" urls') - s3_get(url, temp_file, proxies=proxies) - else: - http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent) - - logger.info("storing %s in cache at %s", url, cache_path) - os.replace(temp_file.name, cache_path) - - logger.info("creating metadata file for %s", cache_path) - meta = {"url": url, "etag": etag} - meta_path = cache_path + ".json" - with open(meta_path, "w") as meta_file: - json.dump(meta, meta_file) - - return cache_path diff --git a/torch_bert/modeling_bert.py b/torch_bert/modeling_bert.py deleted file mode 100644 index 4a74b72..0000000 --- a/torch_bert/modeling_bert.py +++ /dev/null @@ -1,1369 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""PyTorch BERT model. 
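A minimal sketch of the commit step used by get_from_cache above: download into a temporary file, atomically move it into place, then write the JSON metadata sidecar. Paths and the downloaded payload are placeholders.

import json
import os
import tempfile

def commit_to_cache(cache_path, url, etag):
    with tempfile.NamedTemporaryFile(dir=os.path.dirname(cache_path) or ".", delete=False) as temp_file:
        temp_file.write(b"...downloaded bytes...")  # placeholder for the real HTTP/S3 download
    os.replace(temp_file.name, cache_path)          # atomic rename, so no corrupt cache entries
    with open(cache_path + ".json", "w") as meta_file:
        json.dump({"url": url, "etag": etag}, meta_file)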
""" - - -import logging -import math -import os - -import torch -from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss - -from .activations import gelu, gelu_new, swish -from .configuration_bert import BertConfig -from .file_utils import add_start_docstrings, add_start_docstrings_to_callable -from .modeling_utils import PreTrainedModel, prune_linear_layer - - -logger = logging.getLogger(__name__) - -BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { - "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", - "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", - "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", - "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", - "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", - "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", - "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", - "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", - "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", - "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", - "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", - "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", - "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", - "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", - "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", - "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin", - "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin", - "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin", - "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin", - "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin", - "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin", - "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/pytorch_model.bin", -} - - -def 
load_tf_weights_in_bert(model, config, tf_checkpoint_path): - """ Load tf checkpoints in a pytorch model. - """ - try: - import re - import numpy as np - import tensorflow as tf - except ImportError: - logger.error( - "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions." - ) - raise - tf_path = os.path.abspath(tf_checkpoint_path) - logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) - # Load weights from TF model - init_vars = tf.train.list_variables(tf_path) - names = [] - arrays = [] - for name, shape in init_vars: - logger.info("Loading TF weight {} with shape {}".format(name, shape)) - array = tf.train.load_variable(tf_path, name) - names.append(name) - arrays.append(array) - - for name, array in zip(names, arrays): - name = name.split("/") - # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v - # which are not required for using pretrained model - if any( - n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] - for n in name - ): - logger.info("Skipping {}".format("/".join(name))) - continue - pointer = model - for m_name in name: - if re.fullmatch(r"[A-Za-z]+_\d+", m_name): - scope_names = re.split(r"_(\d+)", m_name) - else: - scope_names = [m_name] - if scope_names[0] == "kernel" or scope_names[0] == "gamma": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "output_bias" or scope_names[0] == "beta": - pointer = getattr(pointer, "bias") - elif scope_names[0] == "output_weights": - pointer = getattr(pointer, "weight") - elif scope_names[0] == "squad": - pointer = getattr(pointer, "classifier") - else: - try: - pointer = getattr(pointer, scope_names[0]) - except AttributeError: - logger.info("Skipping {}".format("/".join(name))) - continue - if len(scope_names) >= 2: - num = int(scope_names[1]) - pointer = pointer[num] - if m_name[-11:] == "_embeddings": - pointer = getattr(pointer, "weight") - elif m_name == "kernel": - array = np.transpose(array) - try: - assert pointer.shape == array.shape - except AssertionError as e: - e.args += (pointer.shape, array.shape) - raise - logger.info("Initialize PyTorch weight {}".format(name)) - pointer.data = torch.from_numpy(array) - return model - - -def mish(x): - return x * torch.tanh(nn.functional.softplus(x)) - - -ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "mish": mish} - - -BertLayerNorm = torch.nn.LayerNorm - - -class BertEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings. 
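A small sketch of the name-or-callable activation dispatch that ACT2FN above supports; the hidden_act value here is illustrative, not taken from a real config.

import torch
import torch.nn as nn

def mish(x):
    return x * torch.tanh(nn.functional.softplus(x))

ACT2FN = {"relu": nn.functional.relu, "mish": mish}

hidden_act = "mish"  # config.hidden_act may be a string key or a callable
act_fn = ACT2FN[hidden_act] if isinstance(hidden_act, str) else hidden_act
print(act_fn(torch.tensor([-1.0, 0.0, 1.0])))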
- """ - - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - seq_length = input_shape[1] - device = input_ids.device if input_ids is not None else inputs_embeds.device - if position_ids is None: - position_ids = torch.arange(seq_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0).expand(input_shape) - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class BertSelfAttention(nn.Module): - def __init__(self, config): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads) - ) - self.output_attentions = config.output_attentions - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - ): - mixed_query_layer = self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. 
- if encoder_hidden_states is not None: - mixed_key_layer = self.key(encoder_hidden_states) - mixed_value_layer = self.value(encoder_hidden_states) - attention_mask = encoder_attention_mask - else: - mixed_key_layer = self.key(hidden_states) - mixed_value_layer = self.value(hidden_states) - - query_layer = self.transpose_for_scores(mixed_query_layer) - key_layer = self.transpose_for_scores(mixed_key_layer) - value_layer = self.transpose_for_scores(mixed_value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) - return outputs - - -class BertSelfOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertAttention(nn.Module): - def __init__(self, config): - super().__init__() - self.self = BertSelfAttention(config) - self.output = BertSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) - heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads - for head in heads: - # Compute how many pruned heads are before the head and move the index accordingly - head = head - sum(1 if h < head else 0 for h in self.pruned_heads) - mask[head] = 0 - mask = mask.view(-1).contiguous().eq(1) - index = torch.arange(len(mask))[mask].long() - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len(heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - 
head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - ): - self_outputs = self.self( - hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them - return outputs - - -class BertIntermediate(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class BertOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertLayer(nn.Module): - def __init__(self, config): - super().__init__() - self.attention = BertAttention(config) - self.is_decoder = config.is_decoder - if self.is_decoder: - self.crossattention = BertAttention(config) - self.intermediate = BertIntermediate(config) - self.output = BertOutput(config) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - ): - self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask) - attention_output = self_attention_outputs[0] - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights - - if self.is_decoder and encoder_hidden_states is not None: - cross_attention_outputs = self.crossattention( - attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights - - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - outputs = (layer_output,) + outputs - return outputs - - -class BertEncoder(nn.Module): - def __init__(self, config): - super().__init__() - self.output_attentions = config.output_attentions - self.output_hidden_states = config.output_hidden_states - self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - ): - all_hidden_states = () - all_attentions = () - for i, layer_module in enumerate(self.layer): - if self.output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_outputs = layer_module( - hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask - ) - hidden_states = layer_outputs[0] - - if self.output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - # Add last layer - if self.output_hidden_states: - all_hidden_states = 
all_hidden_states + (hidden_states,) - - outputs = (hidden_states,) - if self.output_hidden_states: - outputs = outputs + (all_hidden_states,) - if self.output_attentions: - outputs = outputs + (all_attentions,) - return outputs # last-layer hidden state, (all hidden states), (all attentions) - - -class BertPooler(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class BertPredictionHeadTransform(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class BertLMPredictionHead(nn.Module): - def __init__(self, config): - super().__init__() - self.transform = BertPredictionHeadTransform(config) - - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` - self.decoder.bias = self.bias - - def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class BertOnlyMLMHead(nn.Module): - def __init__(self, config): - super().__init__() - self.predictions = BertLMPredictionHead(config) - - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class BertOnlyNSPHead(nn.Module): - def __init__(self, config): - super().__init__() - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, pooled_output): - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score - - -class BertPreTrainingHeads(nn.Module): - def __init__(self, config): - super().__init__() - self.predictions = BertLMPredictionHead(config) - self.seq_relationship = nn.Linear(config.hidden_size, 2) - - def forward(self, sequence_output, pooled_output): - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - - -class BertPreTrainedModel(PreTrainedModel): - """ An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. 
- """ - - config_class = BertConfig - pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP - load_tf_weights = load_tf_weights_in_bert - base_model_prefix = "bert" - - def _init_weights(self, module): - """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - elif isinstance(module, BertLayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - -BERT_START_DOCSTRING = r""" - This model is a PyTorch `torch.nn.Module `_ sub-class. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general - usage and behavior. - Parameters: - config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. - Initializing with a config file does not load the weights associated with the model, only the configuration. - Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. -""" - -BERT_INPUTS_DOCSTRING = r""" - Args: - input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. - Indices can be obtained using :class:`transformers.BertTokenizer`. - See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.encode_plus` for details. - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Segment token indices to indicate first and second portions of the inputs. - Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` - corresponds to a `sentence B` token - `What are token type IDs? <../glossary.html#token-type-ids>`_ - position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1]``. - `What are position IDs? <../glossary.html#position-ids>`_ - head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): - Mask to nullify selected heads of the self-attention modules. - Mask values selected in ``[0, 1]``: - :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. 
- encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - if the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask - is used in the cross-attention if the model is configured as a decoder. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. -""" - - -@add_start_docstrings( - "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", - BERT_START_DOCSTRING, -) -class BertModel(BertPreTrainedModel): - """ - The model can behave as an encoder (with only self-attention) as well - as a decoder, in which case a layer of cross-attention is added between - the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani, - Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. - To behave as an decoder the model needs to be initialized with the - :obj:`is_decoder` argument of the configuration set to :obj:`True`; an - :obj:`encoder_hidden_states` is expected as an input to the forward pass. - .. _`Attention is all you need`: - https://arxiv.org/abs/1706.03762 - """ - - def __init__(self, config): - super().__init__(config) - self.config = config - - self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) - self.pooler = BertPooler(config) - - self.init_weights() - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """ Prunes heads of the model. - heads_to_prune: dict of {layer_num: list of heads to prune in this layer} - See base class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - ): - r""" - Return: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during pre-training. - This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. 
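The pooler_output caveat above suggests averaging the token states instead; a minimal masked mean-pooling sketch, with purely illustrative tensor shapes:

import torch

last_hidden_state = torch.randn(2, 7, 768)                   # [bsz, seq_len, hidden_size]
attention_mask = torch.tensor([[1] * 7, [1] * 4 + [0] * 3])  # [bsz, seq_len], 0 marks padding

mask = attention_mask.unsqueeze(-1).float()     # [bsz, seq_len, 1]
summed = (last_hidden_state * mask).sum(dim=1)  # zero out padded positions, then sum over tokens
counts = mask.sum(dim=1).clamp(min=1.0)         # number of real tokens, guarded against zero
sentence_embeddings = summed / counts           # [bsz, hidden_size]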
- hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - Examples:: - from transformers import BertModel, BertTokenizer - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertModel.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids) - last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple - """ - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - if attention_mask is None: - attention_mask = torch.ones(input_shape, device=device) - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
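A hedged sketch of what the extended-mask conversion typically does (the actual helper, get_extended_attention_mask, lives in modeling_utils): the [batch_size, seq_length] 0/1 padding mask is broadcast to [batch_size, 1, 1, seq_length] and turned into an additive mask of zeros and large negative values.

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])               # [bsz, seq_len]
extended = attention_mask[:, None, None, :].to(torch.float32)  # [bsz, 1, 1, seq_len]
extended = (1.0 - extended) * -10000.0                         # padded positions become -10000
print(extended)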
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape, self.device - ) - - # If a 2D ou 3D attention mask is provided for the cross-attention - # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - embedding_output = self.embeddings( - input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - ) - sequence_output = encoder_outputs[0] - pooled_output = self.pooler(sequence_output) - - outputs = (sequence_output, pooled_output,) + encoder_outputs[ - 1: - ] # add hidden_states and attentions if they are here - return outputs # sequence_output, pooled_output, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and - a `next sentence prediction (classification)` head. """, - BERT_START_DOCSTRING, -) -class BertForPreTraining(BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - masked_lm_labels=None, - next_sentence_label=None, - ): - r""" - masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): - Labels for computing the masked language modeling loss. - Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` - next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`): - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring) - Indices should be in ``[0, 1]``. - ``0`` indicates sequence B is a continuation of sequence A, - ``1`` indicates sequence B is a random sequence. 
- Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. - prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False - continuation before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - Examples:: - from transformers import BertTokenizer, BertForPreTraining - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForPreTraining.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids) - prediction_scores, seq_relationship_scores = outputs[:2] - """ - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - ) - - sequence_output, pooled_output = outputs[:2] - prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) - - outputs = (prediction_scores, seq_relationship_score,) + outputs[ - 2: - ] # add hidden states and attention if they are here - - if masked_lm_labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - total_loss = masked_lm_loss + next_sentence_loss - outputs = (total_loss,) + outputs - - return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) - - -@add_start_docstrings("""Bert Model with a `language modeling` head on top. 
""", BERT_START_DOCSTRING) -class BertForMaskedLM(BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - self.cls = BertOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - masked_lm_labels=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - lm_labels=None, - ): - r""" - masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Labels for computing the masked language modeling loss. - Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` - lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Labels for computing the left-to-right language modeling loss (next word prediction). - Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Masked language modeling loss. - ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided): - Next token prediction loss. - prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- Examples:: - from transformers import BertTokenizer, BertForMaskedLM - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForMaskedLM.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, masked_lm_labels=input_ids) - loss, prediction_scores = outputs[:2] - """ - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here - - # Although this may seem awkward, BertForMaskedLM supports two scenarios: - # 1. If a tensor that contains the indices of masked labels is provided, - # the cross-entropy is the MLM cross-entropy that measures the likelihood - # of predictions for masked words. - # 2. If `lm_labels` is provided we are in a causal scenario where we - # try to predict the next token for each input in the decoder. - if masked_lm_labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) - outputs = (masked_lm_loss,) + outputs - - if lm_labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - prediction_scores = prediction_scores[:, :-1, :].contiguous() - lm_labels = lm_labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1)) - outputs = (ltr_lm_loss,) + outputs - - return outputs # (masked_lm_loss), (ltr_lm_loss), prediction_scores, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, -) -class BertForNextSentencePrediction(BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - self.cls = BertOnlyNSPHead(config) - - self.init_weights() - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - next_sentence_label=None, - ): - r""" - next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) - Indices should be in ``[0, 1]``. - ``0`` indicates sequence B is a continuation of sequence A, - ``1`` indicates sequence B is a random sequence. - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): - Next sequence prediction (classification) loss. 
- seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): - Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - Examples:: - from transformers import BertTokenizer, BertForNextSentencePrediction - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids) - seq_relationship_scores = outputs[0] - """ - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - ) - - pooled_output = outputs[1] - - seq_relationship_score = self.cls(pooled_output) - - outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here - if next_sentence_label is not None: - loss_fct = CrossEntropyLoss() - next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - outputs = (next_sentence_loss,) + outputs - - return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of - the pooled output) e.g. for GLUE tasks. """, - BERT_START_DOCSTRING, -) -class BertForSequenceClassification(BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for computing the sequence classification/regression loss. - Indices should be in :obj:`[0, ..., config.num_labels - 1]`. - If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
- Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): - Classification (or regression if config.num_labels==1) loss. - logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - Examples:: - from transformers import BertTokenizer, BertForSequenceClassification - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForSequenceClassification.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, labels=labels) - loss, logits = outputs[:2] - """ - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - - if labels is not None: - if self.num_labels == 1: - # We are doing regression - loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - outputs = (loss,) + outputs - - return outputs # (loss), logits, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with a multiple choice classification head on top (a linear layer on top of - the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, - BERT_START_DOCSTRING, -) -class BertForMultipleChoice(BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, 1) - - self.init_weights() - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for computing the multiple choice classification loss. - Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension - of the input tensors. 
(see `input_ids` above) - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): - Classification loss. - classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): - `num_choices` is the second dimension of the input tensors. (see `input_ids` above). - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - Examples:: - from transformers import BertTokenizer, BertForMultipleChoice - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForMultipleChoice.from_pretrained('bert-base-uncased') - choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] - input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices - labels = torch.tensor(1).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, labels=labels) - loss, classification_scores = outputs[:2] - """ - num_choices = input_ids.shape[1] - - input_ids = input_ids.view(-1, input_ids.size(-1)) - attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None - position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - ) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, num_choices) - - outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here - - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) - outputs = (loss,) + outputs - - return outputs # (loss), reshaped_logits, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with a token classification head on top (a linear layer on top of - the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
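A shape sketch of the choice-flattening trick used by BertForMultipleChoice above: choices are folded into the batch dimension before the encoder and unfolded again when scoring. Sizes are illustrative.

import torch

bsz, num_choices, seq_len = 2, 4, 16
input_ids = torch.randint(0, 100, (bsz, num_choices, seq_len))

flat_input_ids = input_ids.view(-1, input_ids.size(-1))  # [bsz * num_choices, seq_len]
logits = torch.randn(bsz * num_choices, 1)               # stand-in for the classifier output
reshaped_logits = logits.view(-1, num_choices)           # back to [bsz, num_choices]
print(flat_input_ids.shape, reshaped_logits.shape)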
""", - BERT_START_DOCSTRING, -) -class BertForTokenClassification(BertPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): - Labels for computing the token classification loss. - Indices should be in ``[0, ..., config.num_labels - 1]``. - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : - Classification loss. - scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) - Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- Examples:: - from transformers import BertTokenizer, BertForTokenClassification - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForTokenClassification.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, labels=labels) - loss, scores = outputs[:2] - """ - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - ) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) - ) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - outputs = (loss,) + outputs - - return outputs # (loss), scores, (hidden_states), (attentions) - - -@add_start_docstrings( - """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - BERT_START_DOCSTRING, -) -class BertForQuestionAnswering(BertPreTrainedModel): - def __init__(self, config): - super(BertForQuestionAnswering, self).__init__(config) - self.num_labels = config.num_labels - - self.bert = BertModel(config) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - ): - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - Returns: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. 
- start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-start scores (before SoftMax). - end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): - Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - Examples:: - from transformers import BertTokenizer, BertForQuestionAnswering - import torch - tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') - question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" - encoding = tokenizer.encode_plus(question, text) - input_ids, token_type_ids = encoding["input_ids"], encoding["token_type_ids"] - start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) - all_tokens = tokenizer.convert_ids_to_tokens(input_ids) - answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) - assert answer == "a nice puppet" - """ - - outputs = self.bert( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - ) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - - outputs = (start_logits, end_logits,) + outputs[2:] - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - outputs = (total_loss,) + outputs - - return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) diff --git a/torch_bert/modeling_utils.py b/torch_bert/modeling_utils.py deleted file mode 100644 index d88e2a7..0000000 --- a/torch_bert/modeling_utils.py +++ /dev/null @@ -1,2022 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# When am I ever going to get through all 2000 lines of the code below ^^ -# hahaha -# -"""PyTorch BERT model.""" - -import logging -import os -from typing import Callable, Tuple - -import torch -from torch import Tensor, device, dtype, nn -from torch.nn import CrossEntropyLoss -from torch.nn import functional as F - -from activations import get_activation -from configuration_bert import PretrainedConfig -from file_utils import ( - DUMMY_INPUTS, - TF2_WEIGHTS_NAME, - TF_WEIGHTS_NAME, - WEIGHTS_NAME, - cached_path, - hf_bucket_url, - is_remote_url, -) - - -logger = logging.getLogger(__name__) - - -try: - from torch.nn import Identity -except ImportError: - # Older PyTorch compatibility - class Identity(nn.Module): - r"""A placeholder identity operator that is argument-insensitive. - """ - - def __init__(self, *args, **kwargs): - super().__init__() - - def forward(self, input): - return input - - -class ModuleUtilsMixin: - """ - A few utilities for torch.nn.Modules, to be used as a mixin. - """ - - def num_parameters(self, only_trainable: bool = False) -> int: - """ - Get number of (optionally, trainable) parameters in the module. - """ - params = filter(lambda x: x.requires_grad, self.parameters()) if only_trainable else self.parameters() - return sum(p.numel() for p in params) - - @staticmethod - def _hook_rss_memory_pre_forward(module, *args, **kwargs): - try: - import psutil - except (ImportError): - raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") - - process = psutil.Process(os.getpid()) - mem = process.memory_info() - module.mem_rss_pre_forward = mem.rss - return None - - @staticmethod - def _hook_rss_memory_post_forward(module, *args, **kwargs): - try: - import psutil - except (ImportError): - raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") - - process = psutil.Process(os.getpid()) - mem = process.memory_info() - module.mem_rss_post_forward = mem.rss - mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward - module.mem_rss_diff = mem_rss_diff + (module.mem_rss_diff if hasattr(module, "mem_rss_diff") else 0) - return None - - def add_memory_hooks(self): - """ Add a memory hook before and after each sub-module forward pass to record increase in memory consumption.
- Increase in memory consumption is stored in a `mem_rss_diff` attribute for each module and can be reset to zero with `model.reset_memory_hooks_state()` - """ - for module in self.modules(): - module.register_forward_pre_hook(self._hook_rss_memory_pre_forward) - module.register_forward_hook(self._hook_rss_memory_post_forward) - self.reset_memory_hooks_state() - - def reset_memory_hooks_state(self): - for module in self.modules(): - module.mem_rss_diff = 0 - module.mem_rss_post_forward = 0 - module.mem_rss_pre_forward = 0 - - @property - def device(self) -> device: - return next(self.parameters()).device - - @property - def dtype(self) -> dtype: - return next(self.parameters()).dtype - - def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: - """type: torch.Tensor -> torch.Tensor""" - if encoder_attention_mask.dim() == 3: - encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] - if encoder_attention_mask.dim() == 2: - encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] - # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition - # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow - # /transformer/transformer_layers.py#L270 - # encoder_extended_attention_mask = (encoder_extended_attention_mask == - # encoder_extended_attention_mask.transpose(-1, -2)) - encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility - encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 - return encoder_extended_attention_mask - - def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: tuple, device: device): - """Makes broadcastable attention mask and causal mask so that future and maked tokens are ignored. - Arguments: - attention_mask: torch.Tensor with 1 indicating tokens to ATTEND to - input_shape: tuple, shape of input_ids - device: torch.Device, usually self.device - Returns: - torch.Tensor with dtype of attention_mask.dtype - """ - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - if attention_mask.dim() == 3: - extended_attention_mask = attention_mask[:, None, :, :] - elif attention_mask.dim() == 2: - # Provided a padding mask of dimensions [batch_size, seq_length] - # - if the model is a decoder, apply a causal mask in addition to the padding mask - # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder: - batch_size, seq_length = input_shape - seq_ids = torch.arange(seq_length, device=device) - causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] - # causal and attention masks must have same type with pytorch version < 1.3 - causal_mask = causal_mask.to(attention_mask.dtype) - extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] - else: - extended_attention_mask = attention_mask[:, None, None, :] - else: - raise ValueError( - "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( - input_shape, attention_mask.shape - ) - ) - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. 
- # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 - return extended_attention_mask - - def get_head_mask(self, head_mask, num_hidden_layers): - """ - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - attention_probs has shape bsz x n_heads x N x N - Arguments: - head_mask: torch.Tensor or None: has shape [num_heads] or [num_hidden_layers x num_heads] - num_hidden_layers: int - Returns: - Tensor of shape shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - or list with [None] for each layer - """ - if head_mask is not None: - head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) - else: - head_mask = [None] * num_hidden_layers - - return head_mask - - def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): - """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" - if head_mask.dim() == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) - head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) - elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" - head_mask = head_mask.to(dtype=self.dtype) # switch to fload if need + fp16 compatibility - return head_mask - - -class PreTrainedModel(nn.Module, ModuleUtilsMixin): - r""" Base class for all models. - :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models - as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. - Class attributes (overridden by derived classes): - - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. - - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values. - - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: - - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`, - - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`, - - ``path``: a path (string) to the TensorFlow checkpoint. - - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. - """ - config_class = None - pretrained_model_archive_map = {} - base_model_prefix = "" - - @property - def dummy_inputs(self): - """ Dummy inputs to do a forward pass in the network. - Returns: - torch.Tensor with dummy inputs - """ - return {"input_ids": torch.tensor(DUMMY_INPUTS)} - - def __init__(self, config, *inputs, **kwargs): - super().__init__() - if not isinstance(config, PretrainedConfig): - raise ValueError( - "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. 
" - "To create a model from a pretrained model use " - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( - self.__class__.__name__, self.__class__.__name__ - ) - ) - # Save config in model - self.config = config - - @property - def base_model(self): - return getattr(self, self.base_model_prefix, self) - - def get_input_embeddings(self): - """ - Returns the model's input embeddings. - Returns: - :obj:`nn.Module`: - A torch module mapping vocabulary to hidden states. - """ - base_model = getattr(self, self.base_model_prefix, self) - if base_model is not self: - return base_model.get_input_embeddings() - else: - raise NotImplementedError - - def set_input_embeddings(self, value): - """ - Set model's input embeddings - Args: - value (:obj:`nn.Module`): - A module mapping vocabulary to hidden states. - """ - base_model = getattr(self, self.base_model_prefix, self) - if base_model is not self: - base_model.set_input_embeddings(value) - else: - raise NotImplementedError - - def get_output_embeddings(self): - """ - Returns the model's output embeddings. - Returns: - :obj:`nn.Module`: - A torch module mapping hidden states to vocabulary. - """ - return None # Overwrite for models with output embeddings - - def tie_weights(self): - """ - Tie the weights between the input embeddings and the output embeddings. - If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning - the weights instead. - """ - output_embeddings = self.get_output_embeddings() - if output_embeddings is not None: - self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) - - def _tie_or_clone_weights(self, output_embeddings, input_embeddings): - """ Tie or clone module weights depending of weither we are using TorchScript or not - """ - if self.config.torchscript: - output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone()) - else: - output_embeddings.weight = input_embeddings.weight - - if getattr(output_embeddings, "bias", None) is not None: - output_embeddings.bias.data = torch.nn.functional.pad( - output_embeddings.bias.data, - (0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],), - "constant", - 0, - ) - if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"): - output_embeddings.out_features = input_embeddings.num_embeddings - - def resize_token_embeddings(self, new_num_tokens=None): - """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. - Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. - Arguments: - new_num_tokens: (`optional`) int: - New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. - If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model. 
- Return: ``torch.nn.Embeddings`` - Pointer to the input tokens Embeddings Module of the model - """ - base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed - model_embeds = base_model._resize_token_embeddings(new_num_tokens) - if new_num_tokens is None: - return model_embeds - - # Update base model and current model config - self.config.vocab_size = new_num_tokens - base_model.vocab_size = new_num_tokens - - # Tie weights again if needed - self.tie_weights() - - return model_embeds - - def _resize_token_embeddings(self, new_num_tokens): - old_embeddings = self.get_input_embeddings() - new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) - self.set_input_embeddings(new_embeddings) - return self.get_input_embeddings() - - def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): - """ Build a resized Embedding Module from a provided token Embedding Module. - Increasing the size will add newly initialized vectors at the end - Reducing the size will remove vectors from the end - Args: - new_num_tokens: (`optional`) int - New number of tokens in the embedding matrix. - Increasing the size will add newly initialized vectors at the end - Reducing the size will remove vectors from the end - If not provided or None: return the provided token Embedding Module. - Return: ``torch.nn.Embeddings`` - Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None - """ - if new_num_tokens is None: - return old_embeddings - - old_num_tokens, old_embedding_dim = old_embeddings.weight.size() - if old_num_tokens == new_num_tokens: - return old_embeddings - - # Build new embeddings - new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) - new_embeddings.to(old_embeddings.weight.device) - - # initialize all new embeddings (in particular added tokens) - self._init_weights(new_embeddings) - - # Copy token embeddings from the previous weights - num_tokens_to_copy = min(old_num_tokens, new_num_tokens) - new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] - - return new_embeddings - - def init_weights(self): - """ Initialize and prunes weights if needed. """ - # Initialize weights - self.apply(self._init_weights) - - # Prune heads if needed - if self.config.pruned_heads: - self.prune_heads(self.config.pruned_heads) - - # Tie weights if needed - self.tie_weights() - - def prune_heads(self, heads_to_prune): - """ Prunes heads of the base model. - Arguments: - heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). - E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. - """ - # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads - for layer, heads in heads_to_prune.items(): - union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) - self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON - - self.base_model._prune_heads(heads_to_prune) - - def save_pretrained(self, save_directory): - """ Save a model and its configuration file to a directory, so that it - can be re-loaded using the `:func:`~transformers.PreTrainedModel.from_pretrained`` class method. - Arguments: - save_directory: directory to which to save. 
- """ - assert os.path.isdir( - save_directory - ), "Saving path should be a directory where the model and configuration can be saved" - - # Only save the model itself if we are using distributed training - model_to_save = self.module if hasattr(self, "module") else self - - # Attach architecture to the config - model_to_save.config.architectures = [model_to_save.__class__.__name__] - - # If we save using the predefined names, we can load using `from_pretrained` - output_model_file = os.path.join(save_directory, WEIGHTS_NAME) - - if getattr(self.config, "xla_device", False): - import torch_xla.core.xla_model as xm - - if xm.is_master_ordinal(): - # Save configuration file - model_to_save.config.save_pretrained(save_directory) - # xm.save takes care of saving only from master - xm.save(model_to_save.state_dict(), output_model_file) - else: - model_to_save.config.save_pretrained(save_directory) - torch.save(model_to_save.state_dict(), output_model_file) - - logger.info("Model weights saved in {}".format(output_model_file)) - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - r"""Instantiate a pretrained pytorch model from a pre-trained model configuration. - The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated) - To train the model, you should first set it back in training mode with ``model.train()`` - The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. - It is up to you to train those weights with a downstream fine-tuning task. - The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. - Parameters: - pretrained_model_name_or_path: either: - - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. - - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``) - model_args: (`optional`) Sequence of positional arguments: - All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) one of: - - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or - - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()` - Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. 
- - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. - state_dict: (`optional`) dict: - an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. - This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. - cache_dir: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - force_download: (`optional`) boolean, default False: - Force to (re-)download the model weights and configuration files and override the cached versions if they exists. - resume_download: (`optional`) boolean, default False: - Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - output_loading_info: (`optional`) boolean: - Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. - kwargs: (`optional`) Remaining dictionary of keyword arguments: - Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. - Examples:: - # For example purposes. Not runnable. - model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. - model = BertModel.from_pretrained('./test/saved_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` - model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading - assert model.config.output_attention == True - # Loading from a TF checkpoint file instead of a PyTorch model (slower) - config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') - model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) - """ - config = kwargs.pop("config", None) - state_dict = kwargs.pop("state_dict", None) - cache_dir = kwargs.pop("cache_dir", None) - from_tf = kwargs.pop("from_tf", False) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - output_loading_info = kwargs.pop("output_loading_info", False) - local_files_only = kwargs.pop("local_files_only", False) - - # Load config if we don't provide a configuration - if not isinstance(config, PretrainedConfig): - config_path = config if config is not None else pretrained_model_name_or_path - config, model_kwargs = cls.config_class.from_pretrained( - config_path, - *model_args, - cache_dir=cache_dir, - return_unused_kwargs=True, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - **kwargs, - ) - else: - model_kwargs = kwargs - - # Load model - if pretrained_model_name_or_path is not None: - if pretrained_model_name_or_path in cls.pretrained_model_archive_map: - archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path] - elif os.path.isdir(pretrained_model_name_or_path): - if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")): - # Load from a TF 1.0 checkpoint - archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index") - elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): - # Load from a TF 2.0 checkpoint - archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) - elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): - # Load from a PyTorch checkpoint - archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) - else: - raise EnvironmentError( - "Error no file named {} found in directory {} or `from_tf` set to False".format( - [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], - pretrained_model_name_or_path, - ) - ) - elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): - archive_file = pretrained_model_name_or_path - elif os.path.isfile(pretrained_model_name_or_path + ".index"): - assert ( - from_tf - ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( - pretrained_model_name_or_path + ".index" - ) - archive_file = pretrained_model_name_or_path + ".index" - else: - archive_file = hf_bucket_url( - pretrained_model_name_or_path, postfix=(TF2_WEIGHTS_NAME if from_tf else WEIGHTS_NAME), - ) - - # redirect to the cache, if necessary - try: - resolved_archive_file = cached_path( - archive_file, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - ) - except EnvironmentError: - if pretrained_model_name_or_path in cls.pretrained_model_archive_map: - msg = "Couldn't reach server at '{}' to download pretrained 
weights.".format(archive_file) - else: - msg = ( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url to model weight files named one of {} but " - "couldn't find any such file at this path or url.".format( - pretrained_model_name_or_path, - ", ".join(cls.pretrained_model_archive_map.keys()), - archive_file, - [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME], - ) - ) - raise EnvironmentError(msg) - - if resolved_archive_file == archive_file: - logger.info("loading weights file {}".format(archive_file)) - else: - logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file)) - else: - resolved_archive_file = None - - # Instantiate model. - model = cls(config, *model_args, **model_kwargs) - - if state_dict is None and not from_tf: - try: - state_dict = torch.load(resolved_archive_file, map_location="cpu") - except Exception: - raise OSError( - "Unable to load weights from pytorch checkpoint file. " - "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. " - ) - - missing_keys = [] - unexpected_keys = [] - error_msgs = [] - - if from_tf: - if resolved_archive_file.endswith(".index"): - # Load from a TensorFlow 1.X checkpoint - provided by original authors - model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index' - else: - # Load from our TensorFlow 2.0 checkpoints - try: - from transformers import load_tf2_checkpoint_in_pytorch_model - - model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True) - except ImportError: - logger.error( - "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " - "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." - ) - raise - else: - # Convert old format to new format if needed from a PyTorch state_dict - old_keys = [] - new_keys = [] - for key in state_dict.keys(): - new_key = None - if "gamma" in key: - new_key = key.replace("gamma", "weight") - if "beta" in key: - new_key = key.replace("beta", "bias") - if new_key: - old_keys.append(key) - new_keys.append(new_key) - for old_key, new_key in zip(old_keys, new_keys): - state_dict[new_key] = state_dict.pop(old_key) - - # copy state_dict so _load_from_state_dict can modify it - metadata = getattr(state_dict, "_metadata", None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants - # so we need to apply the function recursively. - def load(module: nn.Module, prefix=""): - local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict( - state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs, - ) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + ".") - - # Make sure we are able to load base models as well as derived models (with heads) - start_prefix = "" - model_to_load = model - has_prefix_module = any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()) - if not hasattr(model, cls.base_model_prefix) and has_prefix_module: - start_prefix = cls.base_model_prefix + "." 
- if hasattr(model, cls.base_model_prefix) and not has_prefix_module: - model_to_load = getattr(model, cls.base_model_prefix) - - load(model_to_load, prefix=start_prefix) - - if model.__class__.__name__ != model_to_load.__class__.__name__: - base_model_state_dict = model_to_load.state_dict().keys() - head_model_state_dict_without_base_prefix = [ - key.split(cls.base_model_prefix + ".")[-1] for key in model.state_dict().keys() - ] - - missing_keys.extend(head_model_state_dict_without_base_prefix - base_model_state_dict) - - if len(missing_keys) > 0: - logger.info( - "Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, missing_keys - ) - ) - if len(unexpected_keys) > 0: - logger.info( - "Weights from pretrained model not used in {}: {}".format( - model.__class__.__name__, unexpected_keys - ) - ) - if len(error_msgs) > 0: - raise RuntimeError( - "Error(s) in loading state_dict for {}:\n\t{}".format( - model.__class__.__name__, "\n\t".join(error_msgs) - ) - ) - model.tie_weights() # make sure token embedding weights are still tied if needed - - # Set model in evaluation mode to deactivate DropOut modules by default - model.eval() - - if output_loading_info: - loading_info = { - "missing_keys": missing_keys, - "unexpected_keys": unexpected_keys, - "error_msgs": error_msgs, - } - return model, loading_info - - if hasattr(config, "xla_device") and config.xla_device: - import torch_xla.core.xla_model as xm - - model = xm.send_cpu_data_to_device(model, xm.xla_device()) - model = model.to(xm.xla_device()) - - return model - - def prepare_inputs_for_generation(self, input_ids, **kwargs): - return {"input_ids": input_ids} - - def prepare_scores_for_generation(self, scores, **kwargs): - return scores - - def _use_cache(self, outputs, use_cache): - """During generation, decide whether to pass the `past` variable to the next forward pass.""" - if len(outputs) <= 1 or use_cache is False: - return False - if hasattr(self.config, "mem_len") and self.config.mem_len == 0: - return False - return True - - def enforce_repetition_penalty_(self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty): - """repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858). """ - for i in range(batch_size * num_beams): - for previous_token in set(prev_output_tokens[i].tolist()): - # if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if lprobs[i, previous_token] < 0: - lprobs[i, previous_token] *= repetition_penalty - else: - lprobs[i, previous_token] /= repetition_penalty - - @torch.no_grad() - def generate( - self, - input_ids=None, - max_length=None, - min_length=None, - do_sample=None, - early_stopping=None, - num_beams=None, - temperature=None, - top_k=None, - top_p=None, - repetition_penalty=None, - bad_words_ids=None, - bos_token_id=None, - pad_token_id=None, - eos_token_id=None, - length_penalty=None, - no_repeat_ngram_size=None, - num_return_sequences=None, - attention_mask=None, - decoder_start_token_id=None, - use_cache=None, - ): - r""" Generates sequences for models with a LM head. The method currently supports greedy decoding, beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling. - Adapted in part from `Facebook's XLM beam search code`_. - .. 
_`Facebook's XLM beam search code`: - https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 - Parameters: - input_ids: (`optional`) `torch.LongTensor` of shape `(batch_size, sequence_length)` - The sequence used as a prompt for the generation. If `None` the method initializes - it as an empty `torch.LongTensor` of shape `(1,)`. - max_length: (`optional`) int - The max length of the sequence to be generated. Between `min_length` and infinity. Default to 20. - min_length: (`optional`) int - The min length of the sequence to be generated. Between 0 and infinity. Default to 0. - do_sample: (`optional`) bool - If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. - early_stopping: (`optional`) bool - if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. - num_beams: (`optional`) int - Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. - temperature: (`optional`) float - The value used to module the next token probabilities. Must be strictly positive. Default to 1.0. - top_k: (`optional`) int - The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. - top_p: (`optional`) float - The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. - repetition_penalty: (`optional`) float - The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. - pad_token_id: (`optional`) int - Padding token. Default to specicic model pad_token_id or None if it does not exist. - bos_token_id: (`optional`) int - BOS token. Defaults to `bos_token_id` as defined in the models config. - eos_token_id: (`optional`) int - EOS token. Defaults to `eos_token_id` as defined in the models config. - length_penalty: (`optional`) float - Exponential penalty to the length. Default to 1. - no_repeat_ngram_size: (`optional`) int - If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. - bad_words_ids: (`optional`) list of lists of int - `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. - num_return_sequences: (`optional`) int - The number of independently computed returned sequences for each element in the batch. Default to 1. - attention_mask (`optional`) obj: `torch.LongTensor` of same shape as `input_ids` - Mask to avoid performing attention on padding token indices. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - Defaults to `None`. - `What are attention masks? <../glossary.html#attention-mask>`__ - decoder_start_token_id=None: (`optional`) int - If an encoder-decoder model starts decoding with a different token than BOS. - Defaults to `None` and is changed to `BOS` later. - use_cache: (`optional`) bool - If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`. 
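Note: the `top_k` and `top_p` parameters documented above are applied through a `top_k_top_p_filtering` helper that this diff references later but does not show. A minimal sketch of the usual top-k / nucleus filtering logic, with function name and shapes assumed here for illustration:

    import torch
    import torch.nn.functional as F

    def top_k_top_p_filter(logits, top_k=0, top_p=1.0, filter_value=-float("inf")):
        """Mask logits outside the top-k / nucleus (top-p) set with filter_value."""
        logits = logits.clone()
        if top_k > 0:
            top_k = min(top_k, logits.size(-1))
            # Drop every token whose logit is below the k-th largest logit.
            kth_value = torch.topk(logits, top_k)[0][..., -1, None]
            logits[logits < kth_value] = filter_value
        if top_p < 1.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
            # Tokens past the top_p cumulative mass are dropped, but the first
            # token that crosses the threshold is always kept.
            sorted_mask = cumulative_probs > top_p
            sorted_mask[..., 1:] = sorted_mask[..., :-1].clone()
            sorted_mask[..., 0] = False
            mask = sorted_mask.scatter(dim=-1, index=sorted_indices, src=sorted_mask)
            logits[mask] = filter_value
        return logits

    # Usage: keep the 5 most likely tokens within the 0.9 nucleus, then sample.
    logits = torch.randn(2, 50)
    probs = F.softmax(top_k_top_p_filter(logits, top_k=5, top_p=0.9), dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)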
- Return: - output: `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)` - sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id` - Examples:: - tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer - model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. - outputs = model.generate(max_length=40) # do greedy decoding - print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) - tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer - model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. - input_context = 'The dog' - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context - outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' - for i in range(3): # 3 output sequences were generated - print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) - tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer - model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. - input_context = 'The dog' - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context - outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling - for i in range(3): # 3 output sequences were generated - print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) - tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer - model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. - input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context - outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences - print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) - tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer - model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. - input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl - bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] - input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context - outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated - """ - - # We cannot generate if the model does not have a LM head - if self.get_output_embeddings() is None: - raise AttributeError( - "You tried to generate sequences with a model that does not have a LM Head." - "Please use another model class (e.g. 
`OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`, `XLMWithLMHeadModel`, `BartForConditionalGeneration` )" - ) - - max_length = max_length if max_length is not None else self.config.max_length - min_length = min_length if min_length is not None else self.config.min_length - do_sample = do_sample if do_sample is not None else self.config.do_sample - early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping - use_cache = use_cache if use_cache is not None else self.config.use_cache - num_beams = num_beams if num_beams is not None else self.config.num_beams - temperature = temperature if temperature is not None else self.config.temperature - top_k = top_k if top_k is not None else self.config.top_k - top_p = top_p if top_p is not None else self.config.top_p - repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty - bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id - pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id - eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id - length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty - no_repeat_ngram_size = ( - no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size - ) - bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids - num_return_sequences = ( - num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences - ) - decoder_start_token_id = ( - decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id - ) - - if input_ids is not None: - batch_size = input_ids.shape[0] # overriden by the input batch_size - else: - batch_size = 1 - - assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer." - assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." - assert isinstance(do_sample, bool), "`do_sample` should be a boolean." - assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." - assert isinstance(use_cache, bool), "`use_cache` should be a boolean." - assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer." - assert temperature > 0, "`temperature` should be strictly positive." - assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." - assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." - assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." - assert input_ids is not None or ( - isinstance(bos_token_id, int) and bos_token_id >= 0 - ), "If input_ids is not defined, `bos_token_id` should be a positive integer." - assert pad_token_id is None or ( - isinstance(pad_token_id, int) and (pad_token_id >= 0) - ), "`pad_token_id` should be a positive integer." - assert (eos_token_id is None) or ( - isinstance(eos_token_id, int) and (eos_token_id >= 0) - ), "`eos_token_id` should be a positive integer." - assert length_penalty > 0, "`length_penalty` should be strictly positive." - assert ( - isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0 - ), "`no_repeat_ngram_size` should be a positive integer." 
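Note: the `enforce_repetition_penalty_` method defined earlier in this file multiplies negative scores and divides positive ones, so a previously generated token becomes less likely in either case. A small numeric illustration of that asymmetry (standalone sketch with made-up values, not the method itself):

    import torch

    def apply_repetition_penalty(logits, generated_ids, penalty):
        """Discourage already-generated tokens (CTRL-style penalty)."""
        logits = logits.clone()
        for i in range(logits.size(0)):
            for token in set(generated_ids[i].tolist()):
                if logits[i, token] < 0:
                    logits[i, token] *= penalty   # more negative -> less likely
                else:
                    logits[i, token] /= penalty   # smaller positive -> less likely
        return logits

    # With penalty=1.2: a score of 2.0 becomes ~1.67, a score of -2.0 becomes -2.4.
    scores = torch.tensor([[2.0, -2.0, 0.5]])
    print(apply_repetition_penalty(scores, torch.tensor([[0, 1]]), 1.2))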
- assert ( - isinstance(num_return_sequences, int) and num_return_sequences > 0 - ), "`num_return_sequences` should be a strictly positive integer." - assert ( - bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) - ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" - - if input_ids is None: - assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( - "you should either supply a context to complete as `input_ids` input " - "or a `bos_token_id` (integer >= 0) as a first token to start the generation." - ) - input_ids = torch.full( - (batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device, - ) - else: - assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)." - - # not allow to duplicate outputs when greedy decoding - if do_sample is False: - if num_beams == 1: - # no_beam_search greedy generation conditions - assert ( - num_return_sequences == 1 - ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" - - else: - # beam_search greedy generation conditions - assert ( - num_beams >= num_return_sequences - ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" - - # create attention mask if necessary - # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 - if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids): - attention_mask = input_ids.ne(pad_token_id).long() - elif attention_mask is None: - attention_mask = input_ids.new_ones(input_ids.shape) - - # set pad_token_id to eos_token_id if not set. 
Important that this is done after - # attention_mask is created - if pad_token_id is None and eos_token_id is not None: - logger.warning( - "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) - ) - pad_token_id = eos_token_id - - # current position and vocab size - vocab_size = self.config.vocab_size - - # set effective batch size and effective batch multiplier according to do_sample - if do_sample: - effective_batch_size = batch_size * num_return_sequences - effective_batch_mult = num_return_sequences - else: - effective_batch_size = batch_size - effective_batch_mult = 1 - - if self.config.is_encoder_decoder: - if decoder_start_token_id is None: - decoder_start_token_id = bos_token_id - - assert ( - decoder_start_token_id is not None - ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" - assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) - assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) - - # get encoder and store encoder outputs - encoder = self.get_encoder() - - encoder_outputs: tuple = encoder(input_ids, attention_mask=attention_mask) - - # Expand input ids if num_beams > 1 or num_return_sequences > 1 - if num_return_sequences > 1 or num_beams > 1: - input_ids_len = input_ids.shape[-1] - input_ids = input_ids.unsqueeze(1).expand(batch_size, effective_batch_mult * num_beams, input_ids_len) - attention_mask = attention_mask.unsqueeze(1).expand( - batch_size, effective_batch_mult * num_beams, input_ids_len - ) - - input_ids = input_ids.contiguous().view( - effective_batch_size * num_beams, input_ids_len - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) - attention_mask = attention_mask.contiguous().view( - effective_batch_size * num_beams, input_ids_len - ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) - - if self.config.is_encoder_decoder: - # create empty decoder_input_ids - input_ids = torch.full( - (effective_batch_size * num_beams, 1), - decoder_start_token_id, - dtype=torch.long, - device=next(self.parameters()).device, - ) - cur_len = 1 - - assert ( - batch_size == encoder_outputs[0].shape[0] - ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " - - # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) - expanded_batch_idxs = ( - torch.arange(batch_size) - .view(-1, 1) - .repeat(1, num_beams * effective_batch_mult) - .view(-1) - .to(input_ids.device) - ) - # expand encoder_outputs - encoder_outputs = (encoder_outputs[0].index_select(0, expanded_batch_idxs), *encoder_outputs[1:]) - - else: - encoder_outputs = None - cur_len = input_ids.shape[-1] - - if num_beams > 1: - output = self._generate_beam_search( - input_ids, - cur_len=cur_len, - max_length=max_length, - min_length=min_length, - do_sample=do_sample, - early_stopping=early_stopping, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - bos_token_id=bos_token_id, - pad_token_id=pad_token_id, - decoder_start_token_id=decoder_start_token_id, - eos_token_id=eos_token_id, - batch_size=effective_batch_size, - num_return_sequences=num_return_sequences, - length_penalty=length_penalty, - num_beams=num_beams, - vocab_size=vocab_size, - encoder_outputs=encoder_outputs, - 
attention_mask=attention_mask, - use_cache=use_cache, - ) - else: - output = self._generate_no_beam_search( - input_ids, - cur_len=cur_len, - max_length=max_length, - min_length=min_length, - do_sample=do_sample, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - bos_token_id=bos_token_id, - pad_token_id=pad_token_id, - decoder_start_token_id=decoder_start_token_id, - eos_token_id=eos_token_id, - batch_size=effective_batch_size, - encoder_outputs=encoder_outputs, - attention_mask=attention_mask, - use_cache=use_cache, - ) - - return output - - def _generate_no_beam_search( - self, - input_ids, - cur_len, - max_length, - min_length, - do_sample, - temperature, - top_k, - top_p, - repetition_penalty, - no_repeat_ngram_size, - bad_words_ids, - bos_token_id, - pad_token_id, - eos_token_id, - decoder_start_token_id, - batch_size, - encoder_outputs, - attention_mask, - use_cache, - ): - """ Generate sequences for each example without beam search (num_beams == 1). - All returned sequence are generated independantly. - """ - # length of generated sentences / unfinished sentences - unfinished_sents = input_ids.new(batch_size).fill_(1) - sent_lengths = input_ids.new(batch_size).fill_(max_length) - - past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models - - while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation( - input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache - ) - - outputs = self(**model_inputs) - next_token_logits = outputs[0][:, -1, :] - - # if model has past, then set the past variable to speed up decoding - if self._use_cache(outputs, use_cache): - past = outputs[1] - - # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) - if repetition_penalty != 1.0: - self.enforce_repetition_penalty_(next_token_logits, batch_size, 1, input_ids, repetition_penalty) - - if no_repeat_ngram_size > 0: - # calculate a list of banned tokens to prevent repetitively generating the same ngrams - # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) - for batch_idx in range(batch_size): - next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float("inf") - - if bad_words_ids is not None: - # calculate a list of banned tokens according to bad words - banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) - - for batch_idx in range(batch_size): - next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float("inf") - - # set eos token prob to zero if min_length is not reached - if eos_token_id is not None and cur_len < min_length: - next_token_logits[:, eos_token_id] = -float("inf") - - if do_sample: - # Temperature (higher temperature => more likely to sample low probability tokens) - if temperature != 1.0: - next_token_logits = next_token_logits / temperature - # Top-p/top-k filtering - next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) - # Sample - probs = F.softmax(next_token_logits, dim=-1) - next_token = torch.multinomial(probs, num_samples=1).squeeze(1) - else: - # Greedy decoding - next_token = torch.argmax(next_token_logits, dim=-1) - - # update generations and finished sentences - if eos_token_id is not None: - # pad finished sentences if eos_token_id exist 
- tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) - else: - tokens_to_add = next_token - - input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1) - - if eos_token_id is not None: - eos_in_sents = tokens_to_add == eos_token_id - # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length - is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul(eos_in_sents.long()).bool() - sent_lengths.masked_fill_(is_sents_unfinished_and_token_to_add_is_eos, cur_len + 1) - # unfinished_sents is set to zero if eos in sentence - unfinished_sents.mul_((~eos_in_sents).long()) - - # stop when there is a in each sentence, or if we exceed the maximul length - if unfinished_sents.max() == 0: - break - - # extend attention_mask for new generated input if only decoder - if self.config.is_encoder_decoder is False: - attention_mask = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - cur_len = cur_len + 1 - - # if there are different sentences lengths in the batch, some batches have to be padded - if sent_lengths.min().item() != sent_lengths.max().item(): - assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths" - # finished sents are filled with pad_token - decoded = input_ids.new(batch_size, sent_lengths.max().item()).fill_(pad_token_id) - else: - decoded = input_ids - - for hypo_idx, hypo in enumerate(input_ids): - decoded[hypo_idx, : sent_lengths[hypo_idx]] = hypo[: sent_lengths[hypo_idx]] - - return decoded - - def _generate_beam_search( - self, - input_ids, - cur_len, - max_length, - min_length, - do_sample, - early_stopping, - temperature, - top_k, - top_p, - repetition_penalty, - no_repeat_ngram_size, - bad_words_ids, - bos_token_id, - pad_token_id, - eos_token_id, - decoder_start_token_id, - batch_size, - num_return_sequences, - length_penalty, - num_beams, - vocab_size, - encoder_outputs, - attention_mask, - use_cache, - ): - """ Generate sequences for each example with beam search. 
- """ - - # generated hypotheses - generated_hyps = [ - BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) - for _ in range(batch_size) - ] - - # scores for each sentence in the beam - beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) - - # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times - if do_sample is False: - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) - - # cache compute states - past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models - - # done sentences - done = [False for _ in range(batch_size)] - - while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation( - input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache - ) - outputs = self(**model_inputs) # (batch_size * num_beams, cur_len, vocab_size) - next_token_logits = outputs[0][:, -1, :] # (batch_size * num_beams, vocab_size) - - # if model has past, then set the past variable to speed up decoding - if self._use_cache(outputs, use_cache): - past = outputs[1] - - # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) - if repetition_penalty != 1.0: - self.enforce_repetition_penalty_( - next_token_logits, batch_size, num_beams, input_ids, repetition_penalty, - ) - - if temperature != 1.0: - next_token_logits = next_token_logits / temperature - - scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size) - if self.config.is_encoder_decoder and do_sample is False: - # TODO (PVP) still a bit hacky here - there might be a better solutino - scores = self.prepare_scores_for_generation(scores, cur_len=cur_len, max_length=max_length) - - # set eos token prob to zero if min_length is not reached - if eos_token_id is not None and cur_len < min_length: - scores[:, eos_token_id] = -float("inf") - - if no_repeat_ngram_size > 0: - # calculate a list of banned tokens to prevent repetitively generating the same ngrams - num_batch_hypotheses = batch_size * num_beams - # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - banned_batch_tokens = calc_banned_ngram_tokens( - input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len - ) - for i, banned_tokens in enumerate(banned_batch_tokens): - scores[i, banned_tokens] = -float("inf") - - if bad_words_ids is not None: - # calculate a list of banned tokens according to bad words - banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) - - for i, banned_tokens in enumerate(banned_tokens): - scores[i, banned_tokens] = -float("inf") - - assert scores.shape == (batch_size * num_beams, vocab_size), "Shapes of scores: {} != {}".format( - scores.shape, (batch_size * num_beams, vocab_size) - ) - - if do_sample: - _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) - # Top-p/top-k filtering - _scores = top_k_top_p_filtering( - _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 - ) # (batch_size * num_beams, vocab_size) - # re-organize to group the beam together to sample from all beam_idxs - _scores = _scores.contiguous().view( - batch_size, num_beams * vocab_size - ) # (batch_size, num_beams * vocab_size) - - # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) - probs = 
F.softmax(_scores, dim=-1) - next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) # (batch_size, num_beams * 2) - # Compute next scores - next_scores = torch.gather(_scores, -1, next_tokens) # (batch_size, num_beams * 2) - # sort the sampled vector to make sure that the first num_beams samples are the best - next_scores, next_scores_indices = torch.sort(next_scores, descending=True, dim=1) - next_tokens = torch.gather(next_tokens, -1, next_scores_indices) # (batch_size, num_beams * 2) - - else: - next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) - - # re-organize to group the beam together (we are keeping top hypothesis accross beams) - next_scores = next_scores.view( - batch_size, num_beams * vocab_size - ) # (batch_size, num_beams * vocab_size) - - next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True) - - assert next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams) - - # next batch beam content - next_batch_beam = [] - - # for each sentence - for batch_idx in range(batch_size): - - # if we are done with this sentence - if done[batch_idx]: - assert ( - len(generated_hyps[batch_idx]) >= num_beams - ), "Batch can only be done if at least {} beams have been generated".format(num_beams) - assert ( - eos_token_id is not None and pad_token_id is not None - ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" - next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch - continue - - # next sentence beam content - next_sent_beam = [] - - # next tokens for this sentence - for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( - zip(next_tokens[batch_idx], next_scores[batch_idx]) - ): - # get beam and token IDs - beam_id = beam_token_id // vocab_size - token_id = beam_token_id % vocab_size - - effective_beam_id = batch_idx * num_beams + beam_id - # add to generated hypotheses if end of sentence or last iteration - if (eos_token_id is not None) and (token_id.item() == eos_token_id): - # if beam_token does not belong to top num_beams tokens, it should not be added - is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams - if is_beam_token_worse_than_top_num_beams: - continue - generated_hyps[batch_idx].add( - input_ids[effective_beam_id].clone(), beam_token_score.item(), - ) - else: - # add next predicted token if it is not eos_token - next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) - - # the beam for next step is full - if len(next_sent_beam) == num_beams: - break - - # Check if were done so that we can save a pad step if all(done) - done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( - next_scores[batch_idx].max().item(), cur_len=cur_len - ) - - # update next beam content - assert len(next_sent_beam) == num_beams, "Beam should always be full" - next_batch_beam.extend(next_sent_beam) - assert len(next_batch_beam) == num_beams * (batch_idx + 1) - - # stop when we are done with each sentence - if all(done): - break - - # sanity check / prepare next batch - assert len(next_batch_beam) == batch_size * num_beams - beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) - beam_tokens = input_ids.new([x[1] for x in next_batch_beam]) - beam_idx = input_ids.new([x[2] for x in next_batch_beam]) - - # re-order batch - input_ids = input_ids[beam_idx, :] - input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1) - # re-order internal states - if past 
is not None: - past = self._reorder_cache(past, beam_idx) - - # extend attention_mask for new generated input if only decoder - if self.config.is_encoder_decoder is False: - attention_mask = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - # update current length - cur_len = cur_len + 1 - - # finalize all open beam hypotheses and end to generated hypotheses - for batch_idx in range(batch_size): - if done[batch_idx]: - continue - - # test that beam scores match previously calculated scores if not eos and batch_idx not done - if eos_token_id is not None and all( - (token_id % vocab_size).item() is not eos_token_id for token_id in next_tokens[batch_idx] - ): - assert torch.all( - next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx] - ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( - next_scores[:, :num_beams][batch_idx], beam_scores.view(batch_size, num_beams)[batch_idx], - ) - - # need to add best num_beams hypotheses to generated hyps - for beam_id in range(num_beams): - effective_beam_id = batch_idx * num_beams + beam_id - final_score = beam_scores[effective_beam_id].item() - final_tokens = input_ids[effective_beam_id] - generated_hyps[batch_idx].add(final_tokens, final_score) - - # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch - output_batch_size = batch_size if do_sample else batch_size * num_return_sequences - output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences - - # select the best hypotheses - sent_lengths = input_ids.new(output_batch_size) - best = [] - - # retrieve best hypotheses - for i, hypotheses in enumerate(generated_hyps): - sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) - for j in range(output_num_return_sequences_per_batch): - effective_batch_idx = output_num_return_sequences_per_batch * i + j - best_hyp = sorted_hyps.pop()[1] - sent_lengths[effective_batch_idx] = len(best_hyp) - best.append(best_hyp) - - # shorter batches are filled with pad_token - if sent_lengths.min().item() != sent_lengths.max().item(): - assert pad_token_id is not None, "`Pad_token_id` has to be defined" - sent_max_len = min(sent_lengths.max().item() + 1, max_length) - decoded = input_ids.new(output_batch_size, sent_max_len).fill_(pad_token_id) - - # fill with hypothesis and eos_token_id if necessary - for i, hypo in enumerate(best): - decoded[i, : sent_lengths[i]] = hypo - if sent_lengths[i] < max_length: - decoded[i, sent_lengths[i]] = eos_token_id - else: - # none of the hypotheses have an eos_token - assert (len(hypo) == max_length for hypo in best) - decoded = torch.stack(best).type(torch.long).to(next(self.parameters()).device) - - return decoded - - # force one of token_ids to be generated by setting prob of all other tokens to 0. 
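A minimal usage sketch (not part of the diffed file): the public generate() entry point is what dispatches into the beam-search routine above once num_beams > 1. The checkpoint name and hyper-parameter values below are illustrative assumptions, not values taken from this code.

import torch
from transformers import BartForConditionalGeneration, BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")  # assumed available
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

text = "PyTorch is an open source machine learning framework."
input_ids = tokenizer.encode(text, return_tensors="pt")

with torch.no_grad():
    summary_ids = model.generate(
        input_ids,
        num_beams=4,             # num_beams > 1 selects the beam-search branch above
        max_length=40,
        min_length=5,
        no_repeat_ngram_size=3,  # enforced through calc_banned_ngram_tokens
        early_stopping=True,
    )
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))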
- def _force_token_ids_generation(self, scores, token_ids): - if isinstance(token_ids, int): - token_ids = [token_ids] - all_but_token_ids_mask = torch.tensor( - [x for x in range(self.config.vocab_size) if x not in token_ids], - dtype=torch.long, - device=next(self.parameters()).device, - ) - assert len(scores.shape) == 2, "scores should be of rank 2 with shape: [batch_size, vocab_size]" - scores[:, all_but_token_ids_mask] = -float("inf") - - @staticmethod - def _reorder_cache(past: Tuple, beam_idx: Tensor) -> Tuple[Tensor]: - return tuple(layer_past.index_select(1, beam_idx) for layer_past in past) - - -def calc_banned_ngram_tokens(prev_input_ids: Tensor, num_hypos: int, no_repeat_ngram_size: int, cur_len: int) -> None: - """Copied from fairseq for no_repeat_ngram in beam_search""" - if cur_len + 1 < no_repeat_ngram_size: - # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet - return [[] for _ in range(num_hypos)] - generated_ngrams = [{} for _ in range(num_hypos)] - for idx in range(num_hypos): - gen_tokens = prev_input_ids[idx].tolist() - generated_ngram = generated_ngrams[idx] - for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): - prev_ngram_tuple = tuple(ngram[:-1]) - generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] - - def _get_generated_ngrams(hypo_idx): - # Before decoding the next token, prevent decoding of ngrams that have already appeared - start_idx = cur_len + 1 - no_repeat_ngram_size - ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].tolist()) - return generated_ngrams[hypo_idx].get(ngram_idx, []) - - banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] - return banned_tokens - - -def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids): - banned_tokens = [] - - def _tokens_match(prev_tokens, tokens): - if len(tokens) == 0: - # if bad word tokens is just one token always ban it - return True - if len(tokens) > len(prev_input_ids): - # if bad word tokens are longer then prev input_ids they can't be equal - return False - - if prev_tokens[-len(tokens) :] == tokens: - # if tokens match - return True - else: - return False - - for prev_input_ids_slice in prev_input_ids: - banned_tokens_slice = [] - - for banned_token_seq in bad_words_ids: - assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( - bad_words_ids - ) - - if _tokens_match(prev_input_ids_slice.tolist(), banned_token_seq[:-1]) is False: - # if tokens do not match continue - continue - - banned_tokens_slice.append(banned_token_seq[-1]) - - banned_tokens.append(banned_tokens_slice) - - return banned_tokens - - -def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1): - """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering - Args: - logits: logits distribution shape (batch size, vocabulary size) - if top_k > 0: keep only top k tokens with highest probability (top-k filtering). - if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). - Nucleus filtering is described in Holtzman et al. 
(http://arxiv.org/abs/1904.09751) - Make sure we keep at least min_tokens_to_keep per batch example in the output - From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - """ - if top_k > 0: - top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check - # Remove all tokens with a probability less than the last token of the top-k - indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] - logits[indices_to_remove] = filter_value - - if top_p < 1.0: - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) - - # Remove tokens with cumulative probability above the threshold (token with 0 are kept) - sorted_indices_to_remove = cumulative_probs > top_p - if min_tokens_to_keep > 1: - # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) - sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 - # Shift the indices to the right to keep also the first token above the threshold - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - - # scatter sorted tensors to original indexing - indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) - logits[indices_to_remove] = filter_value - return logits - - -class BeamHypotheses(object): - def __init__(self, num_beams, max_length, length_penalty, early_stopping): - """ - Initialize n-best list of hypotheses. - """ - self.max_length = max_length - 1 # ignoring bos_token - self.length_penalty = length_penalty - self.early_stopping = early_stopping - self.num_beams = num_beams - self.beams = [] - self.worst_score = 1e9 - - def __len__(self): - """ - Number of hypotheses in the list. - """ - return len(self.beams) - - def add(self, hyp, sum_logprobs): - """ - Add a new hypothesis to the list. - """ - score = sum_logprobs / len(hyp) ** self.length_penalty - if len(self) < self.num_beams or score > self.worst_score: - self.beams.append((score, hyp)) - if len(self) > self.num_beams: - sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) - del self.beams[sorted_scores[0][1]] - self.worst_score = sorted_scores[1][0] - else: - self.worst_score = min(score, self.worst_score) - - def is_done(self, best_sum_logprobs, cur_len=None): - """ - If there are enough hypotheses and that none of the hypotheses being generated - can become better than the worst one in the heap, then we are done with this sentence. - """ - - if len(self) < self.num_beams: - return False - elif self.early_stopping: - return True - else: - if cur_len is None: - cur_len = self.max_length - cur_score = best_sum_logprobs / cur_len ** self.length_penalty - ret = self.worst_score >= cur_score - return ret - - -class Conv1D(nn.Module): - def __init__(self, nf, nx): - """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) - Basically works like a Linear layer but the weights are transposed - """ - super().__init__() - self.nf = nf - w = torch.empty(nx, nf) - nn.init.normal_(w, std=0.02) - self.weight = nn.Parameter(w) - self.bias = nn.Parameter(torch.zeros(nf)) - - def forward(self, x): - size_out = x.size()[:-1] + (self.nf,) - x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) - x = x.view(*size_out) - return x - - -class PoolerStartLogits(nn.Module): - """ Compute SQuAD start_logits from sequence hidden states. 
""" - - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, 1) - - def forward(self, hidden_states, p_mask=None): - """ Args: - **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)` - invalid position mask such as query and special symbols (PAD, SEP, CLS) - 1.0 means token should be masked. - """ - x = self.dense(hidden_states).squeeze(-1) - - if p_mask is not None: - if next(self.parameters()).dtype == torch.float16: - x = x * (1 - p_mask) - 65500 * p_mask - else: - x = x * (1 - p_mask) - 1e30 * p_mask - - return x - - -class PoolerEndLogits(nn.Module): - """ Compute SQuAD end_logits from sequence hidden states and start token hidden state. - """ - - def __init__(self, config): - super().__init__() - self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) - self.activation = nn.Tanh() - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dense_1 = nn.Linear(config.hidden_size, 1) - - def forward(self, hidden_states, start_states=None, start_positions=None, p_mask=None): - """ Args: - One of ``start_states``, ``start_positions`` should be not None. - If both are set, ``start_positions`` overrides ``start_states``. - **start_states**: ``torch.LongTensor`` of shape identical to hidden_states - hidden states of the first tokens for the labeled span. - **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` - position of the first token for the labeled span: - **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` - Mask of invalid position such as query and special symbols (PAD, SEP, CLS) - 1.0 means token should be masked. - """ - assert ( - start_states is not None or start_positions is not None - ), "One of start_states, start_positions should be not None" - if start_positions is not None: - slen, hsz = hidden_states.shape[-2:] - start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) - start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) - start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) - - x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) - x = self.activation(x) - x = self.LayerNorm(x) - x = self.dense_1(x).squeeze(-1) - - if p_mask is not None: - if next(self.parameters()).dtype == torch.float16: - x = x * (1 - p_mask) - 65500 * p_mask - else: - x = x * (1 - p_mask) - 1e30 * p_mask - - return x - - -class PoolerAnswerClass(nn.Module): - """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """ - - def __init__(self, config): - super().__init__() - self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) - self.activation = nn.Tanh() - self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) - - def forward(self, hidden_states, start_states=None, start_positions=None, cls_index=None): - """ - Args: - One of ``start_states``, ``start_positions`` should be not None. - If both are set, ``start_positions`` overrides ``start_states``. - **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``. - hidden states of the first tokens for the labeled span. - **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` - position of the first token for the labeled span. - **cls_index**: torch.LongTensor of shape ``(batch_size,)`` - position of the CLS token. If None, take the last token. 
- note(Original repo): - no dependency on end_feature so that we can obtain one single `cls_logits` - for each sample - """ - hsz = hidden_states.shape[-1] - assert ( - start_states is not None or start_positions is not None - ), "One of start_states, start_positions should be not None" - if start_positions is not None: - start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) - start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) - - if cls_index is not None: - cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) - cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) - else: - cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) - - x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) - x = self.activation(x) - x = self.dense_1(x).squeeze(-1) - - return x - - -class SQuADHead(nn.Module): - r""" A SQuAD head inspired by XLNet. - Parameters: - config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model. - Inputs: - **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)`` - hidden states of sequence tokens - **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` - position of the first token for the labeled span. - **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` - position of the last token for the labeled span. - **cls_index**: torch.LongTensor of shape ``(batch_size,)`` - position of the CLS token. If None, take the last token. - **is_impossible**: ``torch.LongTensor`` of shape ``(batch_size,)`` - Whether the question has a possible answer in the paragraph or not. - **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` - Mask of invalid position such as query and special symbols (PAD, SEP, CLS) - 1.0 means token should be masked. - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: - Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. - **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)`` - Log probabilities for the top config.start_n_top start token possibilities (beam-search). - **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)`` - Indices for the top config.start_n_top start token possibilities (beam-search). - **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` - Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). - **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` - Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). 
- **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) - ``torch.FloatTensor`` of shape ``(batch_size,)`` - Log probabilities for the ``is_impossible`` label of the answers. - """ - - def __init__(self, config): - super().__init__() - self.start_n_top = config.start_n_top - self.end_n_top = config.end_n_top - - self.start_logits = PoolerStartLogits(config) - self.end_logits = PoolerEndLogits(config) - self.answer_class = PoolerAnswerClass(config) - - def forward( - self, hidden_states, start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None, - ): - outputs = () - - start_logits = self.start_logits(hidden_states, p_mask=p_mask) - - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, let's remove the dimension added by batch splitting - for x in (start_positions, end_positions, cls_index, is_impossible): - if x is not None and x.dim() > 1: - x.squeeze_(-1) - - # during training, compute the end logits based on the ground truth of the start position - end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) - - loss_fct = CrossEntropyLoss() - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - - if cls_index is not None and is_impossible is not None: - # Predict answerability from the representation of CLS and START - cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) - loss_fct_cls = nn.BCEWithLogitsLoss() - cls_loss = loss_fct_cls(cls_logits, is_impossible) - - # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss - total_loss += cls_loss * 0.5 - - outputs = (total_loss,) + outputs - - else: - # during inference, compute the end logits based on beam search - bsz, slen, hsz = hidden_states.size() - start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) - - start_top_log_probs, start_top_index = torch.topk( - start_log_probs, self.start_n_top, dim=-1 - ) # shape (bsz, start_n_top) - start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) - start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) - start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) - - hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( - start_states - ) # shape (bsz, slen, start_n_top, hsz) - p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None - end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) - end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) - - end_top_log_probs, end_top_index = torch.topk( - end_log_probs, self.end_n_top, dim=1 - ) # shape (bsz, end_n_top, start_n_top) - end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) - end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) - - start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) - cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) - - outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits,) + outputs - - # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits - # or (if labels are 
provided) (total_loss,) - return outputs - - -class SequenceSummary(nn.Module): - r""" Compute a single vector summary of a sequence hidden states according to various possibilities: - Args of the config class: - summary_type: - - 'last' => [default] take the last token hidden state (like XLNet) - - 'first' => take the first token hidden state (like Bert) - - 'mean' => take the mean of all tokens hidden states - - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) - - 'attn' => Not implemented now, use multi-head attention - summary_use_proj: Add a projection after the vector extraction - summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. - summary_activation: 'tanh' or another string => add an activation to the output, Other => no activation. Default - summary_first_dropout: Add a dropout before the projection and activation - summary_last_dropout: Add a dropout after the projection and activation - """ - - def __init__(self, config: PretrainedConfig): - super().__init__() - - self.summary_type = getattr(config, "summary_type", "last") - if self.summary_type == "attn": - # We should use a standard multi-head attention module with absolute positional embedding for that. - # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 - # We can probably just use the multi-head attention module of PyTorch >=1.1.0 - raise NotImplementedError - - self.summary = Identity() - if hasattr(config, "summary_use_proj") and config.summary_use_proj: - if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: - num_classes = config.num_labels - else: - num_classes = config.hidden_size - self.summary = nn.Linear(config.hidden_size, num_classes) - - activation_string = getattr(config, "summary_activation", None) - self.activation: Callable = (get_activation(activation_string) if activation_string else Identity()) - - self.first_dropout = Identity() - if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: - self.first_dropout = nn.Dropout(config.summary_first_dropout) - - self.last_dropout = Identity() - if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: - self.last_dropout = nn.Dropout(config.summary_last_dropout) - - def forward(self, hidden_states, cls_index=None): - """ hidden_states: float Tensor in shape [bsz, ..., seq_len, hidden_size], the hidden-states of the last layer. - cls_index: [optional] position of the classification token if summary_type == 'cls_index', - shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states. 
- if summary_type == 'cls_index' and cls_index is None: - we take the last token of the sequence as classification token - """ - if self.summary_type == "last": - output = hidden_states[:, -1] - elif self.summary_type == "first": - output = hidden_states[:, 0] - elif self.summary_type == "mean": - output = hidden_states.mean(dim=1) - elif self.summary_type == "cls_index": - if cls_index is None: - cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2] - 1, dtype=torch.long,) - else: - cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) - cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) - # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states - output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) - elif self.summary_type == "attn": - raise NotImplementedError - - output = self.first_dropout(output) - output = self.summary(output) - output = self.activation(output) - output = self.last_dropout(output) - - return output - - -def create_position_ids_from_input_ids(input_ids, padding_idx): - """ Replace non-padding symbols with their position numbers. Position numbers begin at - padding_idx+1. Padding symbols are ignored. This is modified from fairseq's - `utils.make_positions`. - :param torch.Tensor x: - :return torch.Tensor: - """ - # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. - mask = input_ids.ne(padding_idx).int() - incremental_indicies = torch.cumsum(mask, dim=1).type_as(mask) * mask - return incremental_indicies.long() + padding_idx - - -def prune_linear_layer(layer, index, dim=0): - """ Prune a linear layer (a model parameters) to keep only entries in index. - Return the pruned layer as a new layer with requires_grad=True. - Used to remove heads. - """ - index = index.to(layer.weight.device) - W = layer.weight.index_select(dim, index).clone().detach() - if layer.bias is not None: - if dim == 1: - b = layer.bias.clone().detach() - else: - b = layer.bias[index].clone().detach() - new_size = list(layer.weight.size()) - new_size[dim] = len(index) - new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) - new_layer.weight.requires_grad = False - new_layer.weight.copy_(W.contiguous()) - new_layer.weight.requires_grad = True - if layer.bias is not None: - new_layer.bias.requires_grad = False - new_layer.bias.copy_(b.contiguous()) - new_layer.bias.requires_grad = True - return new_layer - - -def prune_conv1d_layer(layer, index, dim=1): - """ Prune a Conv1D layer (a model parameters) to keep only entries in index. - A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed. - Return the pruned layer as a new layer with requires_grad=True. - Used to remove heads. 
- """ - index = index.to(layer.weight.device) - W = layer.weight.index_select(dim, index).clone().detach() - if dim == 0: - b = layer.bias.clone().detach() - else: - b = layer.bias[index].clone().detach() - new_size = list(layer.weight.size()) - new_size[dim] = len(index) - new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device) - new_layer.weight.requires_grad = False - new_layer.weight.copy_(W.contiguous()) - new_layer.weight.requires_grad = True - new_layer.bias.requires_grad = False - new_layer.bias.copy_(b.contiguous()) - new_layer.bias.requires_grad = True - return new_layer - - -def prune_layer(layer, index, dim=None): - """ Prune a Conv1D or nn.Linear layer (a model parameters) to keep only entries in index. - Return the pruned layer as a new layer with requires_grad=True. - Used to remove heads. - """ - if isinstance(layer, nn.Linear): - return prune_linear_layer(layer, index, dim=0 if dim is None else dim) - elif isinstance(layer, Conv1D): - return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim) - else: - raise ValueError("Can't prune layer of class {}".format(layer.__class__)) diff --git a/torch_bert/test.py b/torch_bert/test.py deleted file mode 100644 index 98ea690..0000000 --- a/torch_bert/test.py +++ /dev/null @@ -1,31 +0,0 @@ -class Test: - # kwargs pop test - 200420 - def __init__(self, **kwargs): - print(kwargs) - self.pos = kwargs.pop('pos', 5) - self.image = kwargs.pop('image', 'i love you') - - # class_method argument feeding test - 200420 - @classmethod - def from_pretrained(cls, *input, **kwargs): - return cls._from_pretrained(*input, **kwargs) - - @classmethod - def _from_pretrained(cls, pretrained_model_name, cache_dir=None, *input, **kwargs): - print(cls.prep) - print(pretrained_model_name) - print(cache_dir) - return None - -class BertTest(Test): - - prep = ['lol lol lol'] - - def __init__(self, **kwargs): - super().__init__() - - -if __name__ == '__main__': - B = BertTest() - B.from_pretrained('a') - print(5 * [0]) diff --git a/torch_bert/tokenization_bert.py b/torch_bert/tokenization_bert.py deleted file mode 100644 index f8f40ec..0000000 --- a/torch_bert/tokenization_bert.py +++ /dev/null @@ -1,495 +0,0 @@ -# https://mrcoding.tistory.com/entry/아톰에서-파이썬-스크립트-실행시-한글-깨짐현상-잡는-꿀팁 -import sys -import io -sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8') -sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8') - -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# -# 형태소분석 기반 BERT를 위한 Tokenization Class -# 수정: joonho.lim -# 일자: 2019-05-23 -# -# -# Morph와 Eojeol 버전 통합 -# 수정: MyungHoon.jin -# 일자: 2020-04-20 - -import collections -import logging -import os -import unicodedata -from typing import List, Optional - -from tokenization_utils import PretrainedTokenizer - -# Huggingface 소스 파일 -# VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} -# -# PRETRAINED_VOCAB_FILES_MAP = { -# "vocab_file": { -# "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", -# "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", -# "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", -# "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", -# "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", -# "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", -# "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", -# "bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", -# "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", -# "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", -# "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", -# "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", -# "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", -# "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", -# "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", -# "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", -# "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt", -# "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt", -# } -# } -# -# PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { -# "bert-base-uncased": 512, -# "bert-large-uncased": 512, -# "bert-base-cased": 512, -# "bert-large-cased": 512, -# "bert-base-multilingual-uncased": 512, -# "bert-base-multilingual-cased": 512, -# "bert-base-chinese": 512, -# "bert-base-german-cased": 512, -# "bert-large-uncased-whole-word-masking": 512, -# "bert-large-cased-whole-word-masking": 512, -# "bert-large-uncased-whole-word-masking-finetuned-squad": 512, -# "bert-large-cased-whole-word-masking-finetuned-squad": 512, -# "bert-base-cased-finetuned-mrpc": 512, -# "bert-base-german-dbmdz-cased": 512, -# "bert-base-german-dbmdz-uncased": 512, -# "bert-base-finnish-cased-v1": 512, -# "bert-base-finnish-uncased-v1": 512, -# 
"bert-base-dutch-cased": 512, -# } -# -# PRETRAINED_INIT_CONFIGURATION = { -# "bert-base-uncased": {"do_lower_case": True}, -# "bert-large-uncased": {"do_lower_case": True}, -# "bert-base-cased": {"do_lower_case": False}, -# "bert-large-cased": {"do_lower_case": False}, -# "bert-base-multilingual-uncased": {"do_lower_case": True}, -# "bert-base-multilingual-cased": {"do_lower_case": False}, -# "bert-base-chinese": {"do_lower_case": False}, -# "bert-base-german-cased": {"do_lower_case": False}, -# "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, -# "bert-large-cased-whole-word-masking": {"do_lower_case": False}, -# "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, -# "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, -# "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, -# "bert-base-german-dbmdz-cased": {"do_lower_case": False}, -# "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, -# "bert-base-finnish-cased-v1": {"do_lower_case": False}, -# "bert-base-finnish-uncased-v1": {"do_lower_case": True}, -# "bert-base-dutch-cased": {"do_lower_case": False}, -# } - -logger = logging.getLogger(__name__) - -PRETRAINED_VOCAB_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", -} -PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { - 'bert-base-uncased': 512, - 'bert-large-uncased': 512, - 'bert-base-cased': 512, - 'bert-large-cased': 512, - 'bert-base-multilingual-uncased': 512, - 'bert-base-multilingual-cased': 512, - 'bert-base-chinese': 512, -} -VOCAB_NAME = 'vocab.txt' - - -def load_vocab(vocab_file, encoding="utf-8"): - vocab = collections.OrderedDict() - index = 0 - # huggingface 코드에서는 단순하게 `.readlines()` 메서드로 구현 - with open(vocab_file, "r", encoding=encoding) as reader: - while True: - token = reader.readline() - # token = convert_to_unicode(token) - if not token: - break - # ETRI Vocab을 위한 코드 - if token.find('n_iters=') == 0 or token.find('max_length=') == 0: - continue - # index 1은 빈도수, 빈도수가 제일 높은 token부터 numbering - token = token.split('\t')[0].strip() - vocab[token] = index - index += 1 - return vocab - - -# text 단위 공백 처리 -def whitespace_tokenize(text): - """Run basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class BertTokenizer(PretrainedTokenizer): - - vocab_file_names = VOCAB_NAME - pretrained_vocab_files_map = PRETRAINED_VOCAB_ARCHIVE_MAP - # pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - max_model_input_sizes = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP - - def __init__(self, - vocab_file, - do_lower_case=False, - do_basic_tokenize=True, - never_split=None, - unk_token="[UNK]", - sep_token="[SEP]", - pad_token="[PAD]", - 
cls_token="[CLS]", - mask_token="[MASK]", - **kwargs): - super().__init__( - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - **kwargs, - ) - if not os.path.isfile(vocab_file): - raise ValueError( - "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) - ) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict( - [(ids, token) for token, ids in self.vocab.items()] - ) - self.do_basic_tokenize = do_basic_tokenize - if do_basic_tokenize: - self.basic_tokenizer = BasicTokenizer( - do_lower_case=do_lower_case, - never_split=never_split, - ) - self.wordpiece_tokenizer = WordpieceTokenizer( - vocab=self.vocab, - unk_token=self.unk_token - ) - - @property - def vocab_size(self): - return len(self.vocab) - - def get_vocab(self): - # added_tokens_encoder는 추가할 때 필요, default == {} - return dict(self.vocab, **self.added_tokens_encoder) - - def tokenize(self, text): - split_tokens = [] - if self.do_basic_tokenize: - for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - token += '_' # ETRI BERT에서의 차이점. - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - else: - split_tokens = self.wordpiece_tokenizer.tokenize(text) - return split_tokens - - def build_inputs_with_special_tokens(self, - token_ids_0: List[int], token_ids_1: Optional[List[int]]=None - ) -> List[int]: - """ - sequence 분류 task를 위한 model input build! - - single sequence: ``[CLS] A [SEP]`` - - pair of sequence: ``[CLS] A [SEP] B [SEP]`` - """ - cls = [self.cls_token_id] - sep = [self.sep_token_id] - if token_ids_1 is None: - return cls + token_ids_0 + sep - return cls + token_ids_0 + sep + token_ids_1 + sep - - def get_special_tokens_mask(self, - token_ids_0: List[int], token_ids_1: Optional[List[int]]=None, - already_has_special_tokens: bool=False) -> List[int]: - """ - special token이 추가되지 않은 list에서 sequence ids를 검색 - ``prepare_for_model``, ``encode_plus`` 메서드로 special tokens을 - 추가할 때 호출됨 - """ - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - - def create_token_type_ids_from_sequences(self, - token_ids_0: List[int], token_ids_1: Optional[List[int]]=None - ) -> List[int]: - """ - sequence pair 분류 문제를 위해 concat mask를 생성 - - :: - - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - - 만일 token_ids_1이 None이면 0으로 채워진 mask를 반환 - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] - - def save_vocabulary(self, vocab_path): - pass - - -class BasicTokenizer: - - """Run basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, do_lower_case=False, never_split=[], - tokenize_chinese_chars=True): - self.do_lower_case = do_lower_case - self.never_split = never_split - self.tokenize_chinese_chars = tokenize_chinese_chars - - def tokenize(self, text, never_split=[]): - never_split = self.never_split + never_split - text = self._clean_text(text) - # Chinese Char은 무시한다. - orig_token = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case and token not in self.never_split: - # 형태소 분석기를 사용할 경우 do_lower_case를 False로 설정할 것. - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - output_tokens = whitespace_tokenize(" ".join(split_token)) - return output_tokens - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] # char을 저장한 list 생성 - for char in text: - # 텍스트에서 char 단위로 출력 - cp = ord(char) - if cp == 0 or cp == 0xfffd or self._is_control(char): - # \x00이거나 �이거나 unicode cat.이 C로 시작할 경우 - # (개행문자 제외) output에 추가하지 않는다. 
- continue - if self._is_whitespace(char): - # 공백일 경우 " "으로 output에 추가 - output.append(" ") - else: - # 이 외의 경우 전부 output에 추가 - output.append(char) - # cleaning 작업을 거친 text를 후처리하여 반환 - return "".join(output) - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - # https://gist.github.com/Pusnow/aa865fa21f9557fa58d691a8b79f8a6d - # 모든 음절을 정준 분해(Canonical Decomposition)시킴 - # `각`을 `ㄱ+ㅏ+ㄱ`으로 저장(출력되는 값은 동일) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - # unicode category가 "Mark, Nonspacing"일 경우 pass - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - # 근데 사실상 whitespacing을 하고 ETIR가 _is_punctuation 함수를 - # 띄어쓰기만 검색하도록 만들어놔서 사실 의미없음 ㅇㅅㅇ - if never_split is not None and text in never_split: - return [text] - chars = list(text) - i, start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if self._is_punctuation(char): - # 구두점일 경우 [char}을 추가하고 새로운 단어로 시작 - output.append([char]) - start_new_word = True - else: - # 구두점이 아닐 경우 - if start_new_word: - # 새로운 단어로 시작할 경우에 빈 리스트 추가 - output.append([]) - # 해당 문자부터 시작하도록 start_new_word는 False로 setting - start_new_word = False - # 위에 추가한 빈 리스트에 각각 character를 채워넣음 - output[-1].append(char) - i += 1 - return ["".join(x) for x in output] - - # char 단위 함수들 ------------------------------------------------------ - @staticmethod - def _is_whitespace(char): - """Checks whether `chars` is a whitespace character""" - # \t, \n, \r은 technically control characters지만 - # whiteapce로 여기고 이를 처리 - if char == " " or char == '\t' or char == '\n' or char == '\r': - return True - cat = unicodedata.category(char) - if cat == 'Zs': - # unicode category가 Space Seperator면 True 반환 - return True - # 이 외의 경우 전부 False 반환 - return False - - @staticmethod - def _is_control(char): - """Checks whether `chars` is a control character""" - if char == "\t" or char == "\n" or char == "\r": - # \t, \n, \r을 우리는 whitespace로 처리함 - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - # unicode category가 - # Cc(Control) - # Cf(format) - # Co(Private Use, is 0) - # Cs(Surrrogate, is 0)일 경우, True 반환 - return True - # 이 외의 경우 전부 False 반환 - return False - - @staticmethod - def _is_punctuation(char): - """Checks whether `chars` is a punctuatoin character.""" - # 왜 때문인지 모르겠지만 ETRI에서 아래부분을 주석처리해버림 - # 구두점을 띄어쓰기만 고려? 흠... - return char == ' ' - - cp = ord(char) - # 모든 non-letter/number ASCII를 구두점으로 처리 - # "^", "$", "`"와 같은 char은 unicode에 없음 - # 그러나 이를 일관성있게 punctuation으로 처리하기 위해 아래와 같이 처리 - if ((cp >= 33 and cp <= 47) or - (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or - (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False - - -class WordpieceTokenizer: - - """Runs WordPiece tokenization""" - - def __init__(self, vocab, unk_token, max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """ - greedy longest-match-first algorithm을 사용하여 - 주어진 vocab으로 tokenization을 수행 - - 20.04.20 - - 여기에 기능 추가해야함!! -> 없는 토큰 추가 학습하도록 - - 미리 빼둬야함!! 
- """ - # text = convert_to_unicode(text) - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - # max word로 설정한 글자 수를 넘길 경우 [UNK] 처리 - output.tokens.append(self.unk_token) - continue - is_bad, start = False, 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - # 첫 번째 글자부터 천천히 vocab에 있는 단어인지 체크 - # 맨 처음에는 해당 token자체가 이미 있는지 체크! (때문에 longest) - while start < end: - substr = "".join(chars[start:end]) - # Canonical Decomposition 과정을 거쳤기 때문에 - # 이를 다시 Composition해줘야 vocab의 단어와 비교 가능 - substr = unicodedata.normalize("NFC", substr) - # - # if start > 0: - # substr = "##" + substr - if substr in self.vocab: - # 만일 해당 단어가 vocab에 있으면 해당 단어로 break - cur_substr = substr - break - end -= 1 - # 만일 어떠한 단어랑도 매칭되지 않았다면 (1)로 가서 [UNK] 처리 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - # 어미, 혹은 다른 사전에 있는 단어를 찾기 위해 start에 end값 할당 - start = end - if is_bad: # --- (1) - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -if __name__ == '__main__': - - file_path = "E:/KorBERT/1_bert_download_001_bert_morp_pytorch/001_bert_morp_pytorch" - vocab_file = file_path + '/vocab.korean_morp.list' - B = BertTokenizer(vocab_file=vocab_file, max_len=100000) - print(B.unk_token) - print(B.all_special_tokens) - print(B.max_len) - print(B.vocab[B.unk_token]) - print(B._convert_token_to_id('다/EF_')) - print(B.cls_token_id, B.sep_token_id) - print(B.cls_token) - print(B.vocab['모란/NNG_']) diff --git a/torch_bert/tokenization_morp.py b/torch_bert/tokenization_morp.py deleted file mode 100644 index 26b9f3c..0000000 --- a/torch_bert/tokenization_morp.py +++ /dev/null @@ -1,391 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# -# 형태소분석 기반 BERT를 위한 Tokenization Class -# 수정: joonho.lim -# 일자: 2019-05-23 -# -"""Tokenization classes.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import collections -import unicodedata -import os -import logging - -from .file_utils import cached_path - -logger = logging.getLogger(__name__) - -PRETRAINED_VOCAB_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", -} -PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { - 'bert-base-uncased': 512, - 'bert-large-uncased': 512, - 'bert-base-cased': 512, - 'bert-large-cased': 512, - 'bert-base-multilingual-uncased': 512, - 'bert-base-multilingual-cased': 512, - 'bert-base-chinese': 512, -} -VOCAB_NAME = 'vocab.txt' - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - index = 0 - with open(vocab_file, "r", encoding="utf-8") as reader: - while True: - token = reader.readline() - if not token: - break - - ### joonho.lim @ 2019-03-15 - if token.find('n_iters=') == 0 or token.find('max_length=') == 0 : - continue - token = token.split('\t')[0] - - token = token.strip() - vocab[token] = index - index += 1 - return vocab - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a peice of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class BertTokenizer(object): - """Runs end-to-end tokenization: punctuation splitting + wordpiece""" - - def __init__(self, vocab_file, do_lower_case=True, max_len=None, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): - if not os.path.isfile(vocab_file): - raise ValueError( - "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict( - [(ids, tok) for tok, ids in self.vocab.items()]) - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, - never_split=never_split) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - self.max_len = max_len if max_len is not None else int(1e12) - - def tokenize(self, text): - split_tokens = [] - for token in self.basic_tokenizer.tokenize(text): - ### joonho.lim @ 2019-03-15 - token += '_' - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - return split_tokens - - def convert_tokens_to_ids(self, tokens): - """Converts a sequence of tokens into ids using the vocab.""" - ids = [] - for token in tokens: - ids.append(self.vocab[token]) - if len(ids) > self.max_len: - raise ValueError( - "Token indices sequence length is longer than the specified maximum " - " sequence length for this BERT model ({} > {}). Running this" - " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) - ) - return ids - - def convert_ids_to_tokens(self, ids): - """Converts a sequence of ids in wordpiece tokens using the vocab.""" - tokens = [] - for i in ids: - tokens.append(self.ids_to_tokens[i]) - return tokens - - @classmethod - def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs): - """ - Instantiate a PreTrainedBertModel from a pre-trained model file. - Download and cache the pre-trained model file if needed. - """ - if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP: - vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name] - else: - vocab_file = pretrained_model_name - if os.path.isdir(vocab_file): - vocab_file = os.path.join(vocab_file, VOCAB_NAME) - # redirect to the cache, if necessary - try: - resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) - except FileNotFoundError: - logger.error( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name, - ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), - vocab_file)) - return None - if resolved_vocab_file == vocab_file: - logger.info("loading vocabulary file {}".format(vocab_file)) - else: - logger.info("loading vocabulary file {} from cache at {}".format( - vocab_file, resolved_vocab_file)) - if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: - # if we're using a pretrained model, ensure the tokenizer wont index sequences longer - # than the number of positional embeddings - max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name] - kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) - # Instantiate tokenizer. - tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) - return tokenizer - - -class BasicTokenizer(object): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, - do_lower_case=True, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): - """Constructs a BasicTokenizer. - - Args: - do_lower_case: Whether to lower case the input. 
- """ - self.do_lower_case = do_lower_case - self.never_split = never_split - - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = self._clean_text(text) - ### joonho.lim @ 2019-03-15 - # # # This was added on November 1st, 2018 for the multilingual and Chinese - # # # models. This is also applied to the English models now, but it doesn't - # # # matter since the English models were not trained on any Chinese data - # # # and generally don't have any Chinese data in them (there are Chinese - # # # characters in the vocabulary because Wikipedia does have some Chinese - # # # words in the English Wikipedia.). - # # text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case and token not in self.never_split: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - if text in self.never_split: - return [text] - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. 
- if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenization.""" - - def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """Tokenizes a piece of text into its word pieces. - - This uses a greedy longest-match-first algorithm to perform tokenization - using the given vocabulary. - - For example: - input = "unaffable" - output = ["un", "##aff", "##able"] - - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer`. - - Returns: - A list of wordpiece tokens. - """ - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - ### joonho.lim @ 2019-03-15 - # if start > 0: - # substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically contorl characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False - - -def _is_punctuation(char): - ### joonho.lim @ 2019-03-15 - return char == ' ' - - # """Checks whether `chars` is a punctuation character.""" - # cp = ord(char) - # # We treat all non-letter/number ASCII as punctuation. - # # Characters such as "^", "$", and "`" are not in the Unicode - # # Punctuation class but we treat them as punctuation anyways, for - # # consistency. 
- # if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - # (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - # return True - # cat = unicodedata.category(char) - # if cat.startswith("P"): - # return True - # return False diff --git a/torch_bert/tokenization_utils.py b/torch_bert/tokenization_utils.py deleted file mode 100644 index 7922076..0000000 --- a/torch_bert/tokenization_utils.py +++ /dev/null @@ -1,363 +0,0 @@ -# ref: https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_utils.py -from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union -import logging -from file_utils import cached_path - -logger = logging.getLogger(__name__) - -class SpecialTokenMixin: - - """Token에 관련된 행동들을 Handling""" - - SPECIAL_TOKENS_ATTRIBUTES = [ - "bos_token", - "eos_token", - "unk_token", - "sep_token", - "pad_token", - "cls_token", - "mask_token", - "additional_special_tokens", - ] - - def __init__(self, **kwargs): - self._bos_token = None - self._eos_token = None - self._unk_token = None - self._sep_token = None - self._pad_token = None - self._cls_token = None - self._mask_token = None - self._pad_token_type_id = 0 - self._additional_special_tokens = [] - - for key, value in kwargs.items(): - if key in self.SPECIAL_TOKENS_ATTRIBUTES: - if key == "additional_special_tokens": - assert isinstance(value, (list, tuple)) and \ - all(isinstance(t, str) for t in value) - # elif isinstance(value, AddedTokenFast): - # setattr(self, key, str(value)) - elif isinstance(value, str): - setattr(self, key, value) - else: - raise TypeError( - "special token {} has to be either str or AddedTokenFast but got: {}".format(key, type(value)) - ) - - @property - def bos_token(self): - """ Beginning of sentence token (string). Log an error if used while not having been set. """ - if self._bos_token is None: - logger.error("Using bos_token, but it is not set yet.") - return self._bos_token - - @property - def eos_token(self): - """ End of sentence token (string). Log an error if used while not having been set. """ - if self._eos_token is None: - logger.error("Using eos_token, but it is not set yet.") - return self._eos_token - - @property - def unk_token(self): - """ Unknown token (string). Log an error if used while not having been set. """ - if self._unk_token is None: - logger.error("Using unk_token, but it is not set yet.") - return self._unk_token - - @property - def sep_token(self): - """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ - if self._sep_token is None: - logger.error("Using sep_token, but it is not set yet.") - return self._sep_token - - @property - def pad_token(self): - """ Padding token (string). Log an error if used while not having been set. """ - if self._pad_token is None: - logger.error("Using pad_token, but it is not set yet.") - return self._pad_token - - @property - def cls_token(self): - """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ - if self._cls_token is None: - logger.error("Using cls_token, but it is not set yet.") - return self._cls_token - - @property - def mask_token(self): - """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. 
""" - if self._mask_token is None: - logger.error("Using mask_token, but it is not set yet.") - return self._mask_token - - @property - def additional_special_tokens(self): - """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """ - if self._additional_special_tokens is None: - logger.error("Using additional_special_tokens, but it is not set yet.") - return self._additional_special_tokens - - def _maybe_update_backend(self, value): - """ To be overriden by derived class if a backend tokenizer has to be updated. """ - pass - - @bos_token.setter - def bos_token(self, value): - self._bos_token = value - self._maybe_update_backend([value]) - - @eos_token.setter - def eos_token(self, value): - self._eos_token = value - self._maybe_update_backend([value]) - - @unk_token.setter - def unk_token(self, value): - self._unk_token = value - self._maybe_update_backend([value]) - - @sep_token.setter - def sep_token(self, value): - self._sep_token = value - self._maybe_update_backend([value]) - - @pad_token.setter - def pad_token(self, value): - self._pad_token = value - self._maybe_update_backend([value]) - - @cls_token.setter - def cls_token(self, value): - self._cls_token = value - self._maybe_update_backend([value]) - - @mask_token.setter - def mask_token(self, value): - self._mask_token = value - self._maybe_update_backend([value]) - - @additional_special_tokens.setter - def additional_special_tokens(self, value): - self._additional_special_tokens = value - self._maybe_update_backend(value) - - @property - def bos_token_id(self): - """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ - return self.convert_tokens_to_ids(self.bos_token) - - @property - def eos_token_id(self): - """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ - return self.convert_tokens_to_ids(self.eos_token) - - @property - def unk_token_id(self): - """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ - return self.convert_tokens_to_ids(self.unk_token) - - @property - def sep_token_id(self): - """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ - return self.convert_tokens_to_ids(self.sep_token) - - @property - def pad_token_id(self): - """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ - return self.convert_tokens_to_ids(self.pad_token) - - @property - def pad_token_type_id(self): - """ Id of the padding token type in the vocabulary.""" - return self._pad_token_type_id - - @property - def cls_token_id(self): - """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ - return self.convert_tokens_to_ids(self.cls_token) - - @property - def mask_token_id(self): - """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ - return self.convert_tokens_to_ids(self.mask_token) - - @property - def additional_special_tokens_ids(self): - """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. 
""" - return self.convert_tokens_to_ids(self.additional_special_tokens) - - @property - def special_tokens_map(self): - """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their - values ('', ''...) - """ - set_attr = {} - for attr in self.SPECIAL_TOKENS_ATTRIBUTES: - attr_value = getattr(self, "_" + attr) - if attr_value: - set_attr[attr] = attr_value - return set_attr - - @property - def all_special_tokens(self): - """ List all the special tokens ('', ''...) mapped to class attributes - (cls_token, unk_token...). - """ - all_toks = [] - set_attr = self.special_tokens_map - print(set_attr) - for attr_value in set_attr.values(): - all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) - all_toks = list(set(all_toks)) - return all_toks - - @property - def all_special_ids(self): - """ List the vocabulary indices of the special tokens ('', ''...) mapped to - class attributes (cls_token, unk_token...). - """ - all_toks = self.all_special_tokens - all_ids = self.convert_tokens_to_ids(all_toks) - return all_ids - -class PretrainedTokenizer(SpecialTokenMixin): - - vocab_files_names: Dict[str, str] = {} - pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {} - pretrained_init_configuration: Dict[str, Dict[str, Any]] = {} - max_model_input_sizes: Dict[str, int] = {} - model_input_names: List[str] = ["token_type_ids", "attention_mask"] - - padding_side: str = "right" - - NO_PAD_TOKEN_FOR_BATCH_MSG = ( - "No padding token is set for this model, therefore no batch can be made with uneven " - "sequences. Set a padding token or adjust the lengths of the sequences building the " - "batch so that every sequence is of the same length." - ) - - UNEVEN_SEQUENCES_FOR_BATCH_MSG = ( - "The sequences building the batch are not of the same size, no tensor " - "can be built. Set `pad_to_max_length=True` to pad the smaller sequences" - "up to the larger sequence's length." - ) - - def __init__(self, model_max_length=None, **kwargs): - super(PretrainedTokenizer, self).__init__(**kwargs) - - # For backward compatibility we fallback to set model_max_length from max_len if provided - model_max_length = model_max_length if model_max_length is not None else kwargs.pop("max_len", None) - self.model_max_length = model_max_length if model_max_length is not None else int(1e30) - - # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed. 
- self.padding_side = kwargs.pop("padding_side", self.padding_side) - assert self.padding_side in [ - "right", - "left", - ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}" - self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) - - # Added tokens - self.added_tokens_encoder = {} - self.unique_added_tokens_encoder = set() - self.added_tokens_decoder = {} - - # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) - self.init_inputs = () - self.init_kwargs = {} - - def __len__(self): - return self.vocab_size + len(self.added_tokens_encoder) - - @property - def vocab_size(self): - raise NotImplementedError - - @property - def max_len(self): - return self.model_max_length - - @property - def max_len_single_sentence(self): - return self.model_max_length - self.num_special_tokens_to_add(pair=False) - - @property - def max_len_sentences_pair(self): - return self.model_max_length - self.num_special_tokens_to_add(pair=True) - - @classmethod - def from_pretrained(cls, *input, **kwargs): - return cls._from_pretrained(*input, **kwargs) - - def _convert_token_to_id(self, token): - return self.vocab.get(token, self.vocab.get(self.unk_token)) - - def _convert_id_to_token(self, index): - return self.ids_to_tokens.get(index, self.unk_token) - - def convert_tokens_to_string(self, tokens): - out_string = " ".join(tokens).replace(" ##", "").strip() - return out_string - - def convert_tokens_to_ids(self, tokens): - if isinstance(tokens, str): - return self._convert_token_to_id(tokens) - ids = [self._convert_token_to_id(token) for token in tokens] - if len(ids) > self.max_len: - raise ValueError( - "Token indices sequence length is longer than the specified maximum " - " sequence length for this BERT model ({} > {}). Running this" - " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) - ) - return ids - - def convert_ids_to_tokens(self, ids): - tokens = [self._convert_id_to_token(i) for i in ids] - return tokens - - # ETRI 코드 - @classmethod - def _from_pretrained(cls, pretrained_model_name, cache_dir=None, *init_inputs, **kwargs): - """ - Instantiate a PreTrainedBertModel from a pre-trained model file. - Download and cache the pre-trained model file if needed. - """ - if pretrained_model_name in cls.pretrained_vocab_files_map: - vocab_file = cls.pretrained_vocab_files_map[pretrained_model_name] - else: - vocab_file = pretrained_model_name - if os.path.isdir(vocab_file): - vocab_file = os.path.join(vocab_file, self.vocab_file_names) - # redirect to the cache, if necessary - try: - resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) - except FileNotFoundError: - logger.error( - "Model name '{}' was not found in model name list ({}). 
" - "We assumed '{}' was a path or url but couldn't find any file " - "associated to this path or url.".format( - pretrained_model_name, - ', '.join(cls.pretrained_vocab_files_map.keys()), - vocab_file)) - return None - if resolved_vocab_file == vocab_file: - logger.info("loading vocabulary file {}".format(vocab_file)) - else: - logger.info("loading vocabulary file {} from cache at {}".format( - vocab_file, resolved_vocab_file)) - if pretrained_model_name in self.max_model_input_sizes: - # if we're using a pretrained model, ensure the tokenizer wont index sequences longer - # than the number of positional embeddings - max_len = self.max_model_input_sizes[pretrained_model_name] - kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) - # Instantiate tokenizer. - tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) - return tokenizer - -if __name__ == '__main__': - t = SpecialTokenMixin(bos_token='[UNK]') - print(t.bos_token) diff --git a/tutorials/README.md b/tutorials/README.md new file mode 100644 index 0000000..e69de29 diff --git a/understand_wordpiece_tokenizing.ipynb b/understand_wordpiece_tokenizing.ipynb deleted file mode 100644 index 54f71ab..0000000 --- a/understand_wordpiece_tokenizing.ipynb +++ /dev/null @@ -1,579 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "BERT_MODEL_HUB\t https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1\n", - "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n" - ] - } - ], - "source": [ - "import tensorflow as tf\n", - "import tensorflow_hub as hub\n", - "\n", - "BERT_MODEL = 'uncased_L-12_H-768_A-12' #@param {type:\"string\"}\n", - "BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_' + BERT_MODEL + '/1'\n", - "\n", - "print('BERT_MODEL_HUB\\t', BERT_MODEL_HUB)\n", - "\n", - "# Vocab_file을 저장하고 directory 주소를 binary 형태로 얻는다.\n", - "with tf.Graph().as_default():\n", - " bert_module = hub.Module(BERT_MODEL_HUB)\n", - " tokenization_info = bert_module(signature='tokenization_info',\n", - " as_dict=True)\n", - " with tf.Session() as sess:\n", - " vocab_file, do_lower_case = sess.run(\n", - " [tokenization_info['vocab_file'],\n", - " tokenization_info['do_lower_case']])" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "b'C:\\\\Users\\\\jinma\\\\AppData\\\\Local\\\\Temp\\\\tfhub_modules\\\\5a395eafef2a37bd9fc55d7f6ae676d2a134a838\\\\assets\\\\vocab.txt'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vocab_file" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import collections\n", - "\n", - "# 단어 사전을 저장할 Ordereddict 객체 생성\n", - "vocab = collections.OrderedDict()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Binary text를 unicode(utf-8)로 decode하는 함수 작성\n", - "def convert_to_unicode(text):\n", - " if isinstance(text, str):\n", - " return text\n", - " elif isinstance(text, bytes):\n", - " return text.decode('utf-8', 'ignore')\n", - " else:\n", - " raise ValueError('Unsupported string type: %s' % type(text))" - ] - }, - { - "cell_type": "code", - 
"execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# vocab_file의 각 text를 unicode로 변환, vocab에 기록\n", - "index = 0\n", - "with tf.gfile.GFile(vocab_file, 'r') as reader:\n", - " while True:\n", - " token = convert_to_unicode(reader.readline())\n", - " if not token:\n", - " break\n", - " token = token.strip()\n", - " vocab[token] = index\n", - " index += 1" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2023, 19204)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "vocab.get('this'), vocab.get('token')" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "30522" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(vocab)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['[PAD]',\n", - " '\"',\n", - " 'to',\n", - " 'paris',\n", - " 'tears',\n", - " 'knight',\n", - " 'peninsula',\n", - " 'licensed',\n", - " 'mouse',\n", - " 'screenplay',\n", - " 'raven',\n", - " 'tonnes',\n", - " 'princes',\n", - " 'osaka',\n", - " 'liability',\n", - " '##lip',\n", - " 'kappa',\n", - " 'hasan',\n", - " 'belts',\n", - " '##leader',\n", - " 'chunk',\n", - " 'colton',\n", - " 'artworks',\n", - " 'radiated',\n", - " 'plank',\n", - " 'fielder',\n", - " 'fide',\n", - " 'selector',\n", - " 'statehood',\n", - " 'gunners',\n", - " '##ᄌ']" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(vocab.keys())[::1000]" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# vocab의 key와 value를 바꾼 dict 객체 생성\n", - "inv_vocab = {v:k for k, v in vocab.items()}" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "do_lower_case = True" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Char 단위 함수 작성\n", - "import unicodedata\n", - "\n", - "def _is_whitespace(char):\n", - " if char == \" \" or char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n", - " # 공백 혹은 개행문자일 경우 True 반환\n", - " return True\n", - " cat = unicodedata.category(char)\n", - " if cat == 'Zs':\n", - " # unicode category가 \"Space Separator\"일 경우 True 반환\n", - " return True\n", - " return False\n", - " \n", - "def _is_control(char):\n", - " if char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n", - " # 개행문자일 경우 False 반환\n", - " return False\n", - " cat = unicodedata.category(char)\n", - " if cat in ('Cc', 'Cf'):\n", - " # unicode category가 \"Control\", 혹은 \"Format\"일 경우 True 반환\n", - " return True\n", - " return False\n", - "\n", - "def _is_punctuation(char):\n", - " cp = ord(char)\n", - " if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or\n", - " (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):\n", - " return True\n", - " cat = unicodedata.category(char)\n", - " if cat.startswith(\"P\"):\n", - " # unicode category가 P로 시작할 경우 True 반환\n", - " # Pc (Connector Punctuatoin)\n", - " # Pd (Dash Punctuation)\n", - " # Pe (Close Punctuation)\n", - " # Pf (Final Punctuatoin)\n", - " # Pi (Initial Punctuation)\n", - " # Po (Other Punctuation)\n", - " # Ps (Open Punctuation)\n", - " return True\n", - " return False" - ] - }, - { - 
"cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('\\x00', '�')" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chr(0), chr(0xfffd)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"\\n This \\t here's \\t an example of using the BERT tokenizer\"" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 예제 text를 할당한다.\n", - "text = \"\\n This \\t here's \\t an example of using the BERT tokenizer\"\n", - "text" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1. BasicTokenizer로 tokenize\n", - "\n", - "Origin Text : This here's an example of using the BERT tokenizer\n", - "Cleaned Text : This here's an example of using the BERT tokenizer\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : this\n", - "\tnormalize Token : this\n", - "\t output Token : this\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['t', 'h', 'i', 's']\n", - "\tEnd output : [['t', 'h', 'i', 's']]\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : here's\n", - "\tnormalize Token : here's\n", - "\t output Token : here's\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['h', 'e', 'r', 'e', \"'\", 's']\n", - "\tEnd output : [['h', 'e', 'r', 'e'], [\"'\"], ['s']]\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : an\n", - "\tnormalize Token : an\n", - "\t output Token : an\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['a', 'n']\n", - "\tEnd output : [['a', 'n']]\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : example\n", - "\tnormalize Token : example\n", - "\t output Token : example\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['e', 'x', 'a', 'm', 'p', 'l', 'e']\n", - "\tEnd output : [['e', 'x', 'a', 'm', 'p', 'l', 'e']]\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : of\n", - "\tnormalize Token : of\n", - "\t output Token : of\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['o', 'f']\n", - "\tEnd output : [['o', 'f']]\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : using\n", - "\tnormalize Token : using\n", - "\t output Token : using\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['u', 's', 'i', 'n', 'g']\n", - "\tEnd output : [['u', 's', 'i', 'n', 'g']]\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : the\n", - "\tnormalize Token : the\n", - "\t output Token : the\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['t', 'h', 'e']\n", - "\tEnd output : [['t', 'h', 'e']]\n", - " Do Lower Case... run strip accents.\n", - "\t origin Token : bert\n", - "\tnormalize Token : bert\n", - "\t output Token : bert\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['b', 'e', 'r', 't']\n", - "\tEnd output : [['b', 'e', 'r', 't']]\n", - " Do Lower Case... 
run strip accents.\n", - "\t origin Token : tokenizer\n", - "\tnormalize Token : tokenizer\n", - "\t output Token : tokenizer\n", - " \t\trun split on Punctuation.\n", - "\tStart on list(token) : ['t', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r']\n", - "\tEnd output : [['t', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r']]\n", - "split_tokens : ['this', 'here', \"'\", 's', 'an', 'example', 'of', 'using', 'the', 'bert', 'tokenizer']\n", - "----------------------------------------------------------------------------------------------------\n", - "Final Result : ['this', 'here', \"'\", 's', 'an', 'example', 'of', 'using', 'the', 'bert', 'tokenizer']\n" - ] - } - ], - "source": [ - "print('1. BasicTokenizer로 tokenize\\n')\n", - "text = convert_to_unicode(text)\n", - "## _clean_text(self, text):\n", - "output = []\n", - "for char in text:\n", - " cp = ord(char)\n", - " if cp == 0 or cp == 0xfffd or _is_control(char):\n", - " continue\n", - " if _is_whitespace(char): # 공백 혹은 개행문자면 \n", - " output.append(\" \")\n", - " else:\n", - " output.append(char)\n", - "print('Origin Text :', text)\n", - "text = \"\".join(output)\n", - "print('Cleaned Text :', text)\n", - "\n", - "## whitespace_tokenize(text)\n", - "text = text.strip()\n", - "orig_tokens = text.split()\n", - "split_tokens = []\n", - "for token in orig_tokens:\n", - " if do_lower_case:\n", - " print(' Do Lower Case... run strip accents.')\n", - " token = token.lower()\n", - " ## _run_strip_accents(self, text)\n", - " print('\\t origin Token :', token)\n", - " token = unicodedata.normalize(\"NFD\", token)\n", - " print('\\tnormalize Token :', token)\n", - " output = []\n", - " for char in token:\n", - " cat = unicodedata.category(char)\n", - " if cat == 'Mn':\n", - " # unicode category가 \"Nonspacing Mark\"일 경우 pass\n", - " continue\n", - " output.append(char)\n", - " token = \"\".join(output)\n", - " print('\\t output Token :', token)\n", - " ## _run_split_on_punc(self, text)\n", - " print(' \\t\\trun split on Punctuation.')\n", - " chars = list(token)\n", - " i, start_new_word, output = 0, True, []\n", - " print('\\tStart on list(token) :', chars)\n", - " while i < len(chars):\n", - " char = chars[i]\n", - " if _is_punctuation(char):\n", - " output.append([char])\n", - " start_new_word = True\n", - " else:\n", - " if start_new_word:\n", - " output.append([])\n", - " start_new_word = False\n", - " output[-1].append(char)\n", - " i += 1\n", - " print('\\tEnd output :', output)\n", - " split_tokens.extend([\"\".join(x) for x in output])\n", - "print('split_tokens :', split_tokens)\n", - "t = \" \".join(split_tokens)\n", - "t = t.strip()\n", - "output_tokens = t.split()\n", - "print('-' * 100 + '\\n' + 'Final Result :', output_tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2. 
WordpieceTokenizer로 tokenize\n", - "\n", - "token : this\n", - "0 4\t this\n", - "token : here\n", - "0 4\t here\n", - "token : '\n", - "0 1\t '\n", - "token : s\n", - "0 1\t s\n", - "token : an\n", - "0 2\t an\n", - "token : example\n", - "0 7\t example\n", - "token : of\n", - "0 2\t of\n", - "token : using\n", - "0 5\t using\n", - "token : the\n", - "0 3\t the\n", - "token : bert\n", - "0 4\t bert\n", - "token : tokenizer\n", - "0 9\t tokenizer\n", - "\t tokenize\n", - "\t tokeniz\n", - "\t tokeni\n", - "\t token\n", - "5 9\t izer\n", - "----------------------------------------------------------------------------------------------------\n", - "Final Result : ['this', 'here', \"'\", 's', 'an', 'example', 'of', 'using', 'the', 'bert', 'token', '##izer']\n" - ] - } - ], - "source": [ - "print('2. WordpieceTokenizer로 tokenize\\n')\n", - "\n", - "split_tokens = []\n", - "for token in output_tokens:\n", - " print('token :', token)\n", - " ## wordpiece tokenizing (greedy longest-match-first algorithm)\n", - " unk_token = \"[UNK]\"\n", - " max_input_chars_per_word = 200\n", - " # Start\n", - " token = convert_to_unicode(token)\n", - " output_tokens_ = []\n", - " ## whitspacing\n", - " if not token.strip():\n", - " tokens = []\n", - " else:\n", - " tokens = token.strip().split()\n", - " for token in tokens:\n", - " chars = list(token)\n", - " if len(chars) > max_input_chars_per_word:\n", - " # 200글자를 넘을 경우 UNK 처리\n", - " output_tokens_.append(unk_token)\n", - " continue\n", - " \n", - " is_bad = False\n", - " start = 0\n", - " sub_tokens = []\n", - " while start < len(chars):\n", - " end = len(chars)\n", - " print(start, end, end='')\n", - " cur_substr = None\n", - " # 첫번째 글짜부터 천천히 vocab에 있는 단어인지 체크\n", - " while start < end:\n", - " substr = \"\".join(chars[start:end])\n", - " print('\\t', substr)\n", - " if start > 0:\n", - " ## start에 end가 할당됐을 경우,\n", - " ## 이는 어미이므로 ##을 붙여서 vocab에 있는지 체크\n", - " substr = \"##\" + substr\n", - " if substr in vocab:\n", - " cur_substr = substr\n", - " break\n", - " end -= 1\n", - " # 만일 못찾았을 경우, [UNK]으로 처리\n", - " if cur_substr is None:\n", - " is_bad = True\n", - " break\n", - " sub_tokens.append(cur_substr)\n", - " # 어미를 추가하기 위해 start에 end값을 할당\n", - " start = end\n", - " if is_bad:\n", - " output_tokens_.append(unk_token)\n", - " else:\n", - " output_tokens_.extend(sub_tokens)\n", - " for sub_token in sub_tokens:\n", - " split_tokens.append(sub_token)\n", - " \n", - "print('-' * 100 + '\\n' + 'Final Result :', split_tokens)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "basic", - "language": "python", - "name": "basic" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git "a/\355\230\225\355\203\234\354\206\214 \353\266\204\354\204\235+\355\206\240\355\201\260\355\231\224+PositionalEmbedding.ipynb" "b/\355\230\225\355\203\234\354\206\214 \353\266\204\354\204\235+\355\206\240\355\201\260\355\231\224+PositionalEmbedding.ipynb" deleted file mode 100644 index cc24b41..0000000 --- "a/\355\230\225\355\203\234\354\206\214 \353\266\204\354\204\235+\355\206\240\355\201\260\355\231\224+PositionalEmbedding.ipynb" +++ /dev/null @@ -1,1597 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# BERT를 위한 형태소 분석\n", - "- google research의 tensorflow 버전(원본) 
bert\n", - "- ETRI에서 제공한 pre-trained 모델과 한국어 단어 사전 사용\n", - "- ETRI에서 제공하는 버전은 총 4개\n", - " - Pytorch + Morphology\n", - " - Tensorflow + Morphology\n", - " - Pytorch + Eojeol\n", - " - Tensorflow + Eojeol\n", - "- 이 중 `Pytorch`제외, 형태소 분석이 된 text를 input으로 받는 2번 선택\n", - "- `Morphology`는 input text에 **TTA 표준 형태소 태그셋(TTAK.KO-11.0010/R1)**에 맞는 **형태소 분석기**를 사용해야 함.\n", - "- TTA 표준 형태소 태그셋에 맞게 분석하는 형태소 분석기는 다음과 같음\n", - " - `Mecab`\n", - " - `심사숙고/NNG + 했/XSV+EP + 겠/EP + 지만/EC`와 같이 분석하는 경우가 있음\n", - " - Input이 `XSV+EP`으로 나오면 안됨. + 제거 후 `[하/동사파생접미사(XSV), 였/선어말어미(EP)]`로 분석해야함\n", - " - `ETRI 형태소 분석기`\n", - " - Web API에 접속하여 사용\n", - " - 일일 한도 제한있음\n", - " - `Khaiii`\n", - " - Kakao Hangul Analyzer III\n", - " - 속도도 빠르고 만족스러운 성능을 보임\n", - " - 윈도우 지원 안함\n", - " - `심사숙고/NNG + 하/XSV + 였/EP + 겠/EP + 지만/EC`와 같이 아주 잘 분석함\n", - "- 아래 표는 `TTAK.KO-11.0010/R1`\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
| 대분류 | 중분류 | 세분류 |
| --- | --- | --- |
| (1) 체언 | 명사 | 일반명사(NNG) |
| | | 고유명사(NNP) |
| | | 의존명사(NNB) |
| | 대명사(NP) | 대명사(NP) |
| | 수사(NR) | 수사(NR) |
| (2) 용언 | 동사(VV) | 동사(VV) |
| | 형용사(VA) | 형용사(VA) |
| | 보조용언(VX) | 보조용언(VX) |
| | 지정사(VC) | 긍정지정사(VCP) |
| | | 부정지정사(VCN) |
| (3) 수식언 | 관형사(MM) | 성상 관형사(MMA) |
| | | 지시 관형사(MMD) |
| | | 수 관형사(MMN) |
| | 부사(MA) | 일반부사(MAG) |
| | | 접속부사(MAJ) |
| (4) 독립언 | 감탄사(IC) | 감탄사(IC) |
| (5) 관계언 | 격조사(JK) | 주격조사(JKS) |
| | | 보격조사(JKC) |
| | | 관형격조사(JKG) |
| | | 목적격조사(JKO) |
| | | 부사격조사(JKB) |
| | | 호격조사(JKV) |
| | | 인용격조사(JKQ) |
| | 보조사(JX) | 보조사(JX) |
| | 접속조사(JC) | 접속조사(JC) |
| (6) 의존형태 | 어미(EM) | 선어말어미(EP) |
| | | 종결어미(EF) |
| | | 연결어미(EC) |
| | | 명사형전성어미(ETN) |
| | | 관형형전성어미(ETM) |
| | 접두사(XP) | 체언접두사(XPN) |
| | 접미사(XS) | 명사파생접미사(XSN) |
| | | 동사파생접미사(XSV) |
| | | 형용사파생접미사(XSA) |
| | 어근(XR) | 어근(XR) |
| (7) 기호 | 일반기호(ST) | 마침표, 물음표, 느낌표(SF) |
| | | 쉼표, 가운뎃점, 콜론, 빗금(SP) |
| | | 따옴표, 괄호표, 줄표(SS) |
| | | 줄임표(SE) |
| | | 붙임표(물결)(SO) |
| | | 기타 기호(SW) |
| | 외국어(SL) | 외국어(SL) |
| | 한자(SH) | 한자(SH) |
| | 숫자(SN) | 숫자(SN) |
| | 분석불능범주(NA) | 분석불능범주(NA) |
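This tag set is what the morphology-based KorBERT vocabulary is keyed on: entries in `vocab.korean_morp.list` are `형태소/태그` pieces such as `하/XSV_`, where the trailing `_` is appended by the tokenizer to each analyzed token before the vocabulary lookup. As a minimal, illustrative sketch (the helper name and the three-entry toy vocabulary are assumptions of mine, not ETRI code; the ids happen to match the vocab dump shown further down), the snippet below shows the two preprocessing steps this implies: joining the analyzer output into the space-separated `형태소/태그` string the tokenizer expects, then appending `_` before looking each token up.

```python
# Minimal sketch, not ETRI code: to_korbert_input and toy_vocab are illustrative;
# real entries come from vocab.korean_morp.list.

def to_korbert_input(analyzed: str) -> str:
    """'심사숙고/NNG + 하/XSV + 였/EP' -> '심사숙고/NNG 하/XSV 였/EP'"""
    return " ".join(part.strip() for part in analyzed.split(" + "))

toy_vocab = {"하/XSV_": 9, "었/EP_": 12, "지만/EC_": 74}  # ids as in the dump below

text = to_korbert_input("하/XSV + 었/EP + 지만/EC")
for token in text.split():
    token += "_"  # the ETRI tokenizer appends '_' to every token before lookup
    print(token, toy_vocab.get(token, "[UNK]"))
# 하/XSV_ 9
# 었/EP_ 12
# 지만/EC_ 74
```

Tokens (or word pieces) that are not found this way fall through to the greedy longest-match WordPiece loop walked through later in this notebook.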
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BERTTokenizer\n", - "- End 2 End Tokenizer\n", - "- `BasicTokenizer`와 `WordpieceTokenizer`를 연결하여 한번에 Tokenizing시킨다\n", - "- input은 위에서 언급했듯이 TTAK 기준으로 형태소 분석된 텍스트를 넣어줘야하며\n", - "- 이를 Token화 시키는 것이 해당 Tokenizer의 역할\n", - "- 사실상 하는 역할이 없다. 아래 어떻게 작동되는지 보면 안다." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## convert_single_example\n", - "- BERT 논문에 나오는 positional embedding을 실시\n", - "- 내부적으로 `BERTTokenizer`의 input으로 들어갈 수 있게 형태소 분석을 해주고\n", - "- Token화된 input을 positional embedding시켜 feature화시키고 이를 반환한다" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## file_based_convert_examples_to_features\n", - "- 들어오는 input을 `convert_single_example`함수에 넣어 positional embedding된 feature로 받고\n", - "- 이를 `tf_record`파일로 기록한다." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Gocha!\n", - "- 어떻게 Tokenizing하는지 들여다보자." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 사전 정의\n", - "- `BERTTokenizer`의 생성자를 보면 아래 항목을 argument로 받고\n", - " - vocab_file: 단어 사전이 저장된 file path\n", - " - do_lower_case: 원래는 소문자 변환을 할 것인지, 한국어에서는 정준분해를 할 것인지\n", - " - max_len: 최대 길이\n", - "- 아래 속성을 정의한다.\n", - " - 사전\n", - " - 역방향 사전\n", - " - `BasicTokenizer`\n", - " - `WordpieceTokenizer`\n", - " - 최대 길이\n", - "- 하나씩 보자." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# LIBRARIES\n", - "\n", - "import collections # OrderedDict를 위해 호출\n", - "import re # 정규표현식\n", - "import unicodedata # 한국어 정준분해 및 문자열 확인\n", - "import six # Python version 체크\n", - "import tensorflow as tf # Tensorflow 파일 불러오기 및 logging" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# ARGUMENTS\n", - "\n", - "# ETRI에서 받은 file path를 저장\n", - "path2 = '../KorBERT/2_bert_download_002_bert_morp_tensorflow/002_bert_morp_tensorflow/'\n", - "path4 = '../KorBERT/4_bert_download_004_bert_eojeol_tensorflow/004_bert_eojeol_tensorflow/'\n", - "# 한국어 vocab 사전을 등록\n", - "morph_vocab_file = path2 + 'vocab.korean_morp.list'\n", - "rawtext_vocab_file = path4 + 'vocab.korean.rawtext.list'\n", - "\n", - "do_lower_case = True # default=False, 정준분해 예시를 위해 True로 설정\n", - "max_len = None # 없으면 1e12" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# 단어 사전 호출\n", - "\n", - "def convert_to_unicode(text):\n", - " # Python version이 3.x일 때,\n", - " # type(text)이 `bytes`일 경우, utf-8로 변환\n", - " if six.PY3:\n", - " if isinstance(text, str):\n", - " return text\n", - " elif isinstance(text, bytes):\n", - " return text.decode(\"utf-8\", \"ignore\")\n", - " else:\n", - " raise ValueError(\"Unsupported string type: %s\" % (type(text)))\n", - " # Python version이 2.x일 때,\n", - " # type(text)이 `str`일 경우, utf-8로 변환\n", - " elif six.PY2:\n", - " if isinstance(text, str):\n", - " return text.decode(\"utf-8\", \"ignore\")\n", - " elif isinstance(text, unicode):\n", - " return text\n", - " else:\n", - " raise ValueError(\"Unsupported string type: %s\" % (type(text)))\n", - " # Python 3.x, 2.x만 허용!\n", - " else:\n", - " raise ValueError(\"Not running on Python2 or Python 3?\")\n", - " \n", - " \n", - "def _load_vocab(vocab_file):\n", - " # 단어 사전을 저장할 OrderedDict 객체 생성\n", - " vocab = collections.OrderedDict()\n", - " index = 0\n", - " with 
tf.io.gfile.GFile(vocab_file, 'r') as reader:\n", - " while True:\n", - " # Binary Text를 unicode(utf-8)로 decode.\n", - " token = convert_to_unicode(reader.readline())\n", - " if not token: break\n", - " if ((token.find('n_iters=') == 0) or\n", - " (token.find('max_length=') == 0)):\n", - " continue\n", - " token = token.split('\\t')[0]\n", - " token = token.strip()\n", - " # 토큰과 해당 index를 기록\n", - " vocab[token] = index\n", - " index += 1\n", - " return vocab\n", - "\n", - "# 단어 사전 호출\n", - "morph_vocab = _load_vocab(morph_vocab_file)\n", - "rawtext_vocab = _load_vocab(rawtext_vocab_file)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'[PAD]': 0,\n", - " '[UNK]': 1,\n", - " '[CLS]': 2,\n", - " '[SEP]': 3,\n", - " '[MASK]': 4,\n", - " '': 5,\n", - " '': 6,\n", - " './SF_': 7,\n", - " '다/EF_': 8,\n", - " '하/XSV_': 9,\n", - " 'ㄴ/ETM_': 10,\n", - " '을/JKO_': 11,\n", - " '었/EP_': 12,\n", - " '의/JKG_': 13,\n", - " '에/JKB_': 14,\n", - " '이/VCP_': 15,\n", - " '이/JKS_': 16,\n", - " ',/SP_': 17,\n", - " '는/JX_': 18,\n", - " '를/JKO_': 19,\n", - " '어/EC_': 20,\n", - " '은/JX_': 21,\n", - " '는/ETM_': 22,\n", - " '고/EC_': 23,\n", - " '가/JKS_': 24,\n", - " '\"/SS_': 25,\n", - " \"'/SS_\": 26,\n", - " '에서/JKB_': 27,\n", - " '으로/JKB_': 28,\n", - " '(/SS_': 29,\n", - " ')/SS_': 30,\n", - " '로/JKB_': 31,\n", - " '되/XSV_': 32,\n", - " '것/NNB_': 33,\n", - " '도/JX_': 34,\n", - " 'ㄹ/ETM_': 35,\n", - " '들/XSN_': 36,\n", - " '있/VX_': 37,\n", - " '있/VA_': 38,\n", - " '년/NNB_': 39,\n", - " '하/VV_': 40,\n", - " 'ㄴ다/EF_': 41,\n", - " '하/XSA_': 42,\n", - " '았/EP_': 43,\n", - " '일/NNB_': 44,\n", - " '은/ETM_': 45,\n", - " '과/JC_': 46,\n", - " '게/EC_': 47,\n", - " '지/EC_': 48,\n", - " '기/ETN_': 49,\n", - " '1/SN_': 50,\n", - " '등/NNB_': 51,\n", - " '자/XSN_': 52,\n", - " '며/EC_': 53,\n", - " '2/SN_': 54,\n", - " '수/NNB_': 55,\n", - " '와/JC_': 56,\n", - " '되/VV_': 57,\n", - " '적/XSN_': 58,\n", - " '않/VX_': 59,\n", - " '월/NNB_': 60,\n", - " '하/VX_': 61,\n", - " '아/EC_': 62,\n", - " '3/SN_': 63,\n", - " '고/JKQ_': 64,\n", - " '‘/SS_': 65,\n", - " '’/SS_': 66,\n", - " '“/SS_': 67,\n", - " '던/ETM_': 68,\n", - " '”/SS_': 69,\n", - " '없/VA_': 70,\n", - " '면/EC_': 71,\n", - " '말/NNG_': 72,\n", - " '대하/VV_': 73,\n", - " '지만/EC_': 74,\n", - " '·/SP_': 75,\n", - " '에게/JKB_': 76,\n", - " '이/NP_': 77,\n", - " '받/VV_': 78,\n", - " '까지/JX_': 79,\n", - " '이/MM_': 80,\n", - " '%/SW_': 81,\n", - " '4/SN_': 82,\n", - " '/NNG_': 83,\n", - " '과/JKB_': 84,\n", - " '만/NR_': 85,\n", - " '원/NNB_': 86,\n", - " '명/NNB_': 87,\n", - " '면서/EC_': 88,\n", - " '다는/ETM_': 89,\n", - " '그/NP_': 90,\n", - " '5/SN_': 91,\n", - " '한/MM_': 92,\n", - " '을/ETM_': 93,\n", - " '어서/EC_': 94,\n", - " '-/SS_': 95,\n", - " '다고/EC_': 96,\n", - " '위하/VV_': 97,\n", - " '만/JX_': 98,\n", - " '중/NNB_': 99}" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 100개만 출력해보기\n", - "# 뒤의 품사에 `_`가 붙은 것을 잘 기억해두기\n", - "{key:value for i, (key, value) in enumerate(morph_vocab.items()) if i < 100}" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'[PAD]': 0,\n", - " '[UNK]': 1,\n", - " '[CLS]': 2,\n", - " '[SEP]': 3,\n", - " '[MASK]': 4,\n", - " '': 5,\n", - " '': 6,\n", - " '._': 7,\n", - " ',_': 8,\n", - " '_': 9,\n", - " '이_': 10,\n", - " '의_': 11,\n", - " '을_': 12,\n", - " '에_': 13,\n", - " '\"': 14,\n", - " '(': 15,\n", - " '한_': 16,\n", - " \"'\": 17,\n", - " 
'은_': 18,\n", - " ')': 19,\n", - " '이': 20,\n", - " '는_': 21,\n", - " '에서_': 22,\n", - " '고_': 23,\n", - " '했다': 24,\n", - " '가_': 25,\n", - " '로_': 26,\n", - " '지': 27,\n", - " '있다': 28,\n", - " '도_': 29,\n", - " '과_': 30,\n", - " '으로_': 31,\n", - " '를_': 32,\n", - " '다': 33,\n", - " '하는_': 34,\n", - " '사': 35,\n", - " '시': 36,\n", - " '기': 37,\n", - " '대': 38,\n", - " '고': 39,\n", - " '수': 40,\n", - " '가': 41,\n", - " '.': 42,\n", - " '수_': 43,\n", - " '전': 44,\n", - " '주': 45,\n", - " '일': 46,\n", - " '리': 47,\n", - " '자': 48,\n", - " '정': 49,\n", - " '할_': 50,\n", - " '인': 51,\n", - " '1': 52,\n", - " '아': 53,\n", - " '와_': 54,\n", - " '부': 55,\n", - " '스': 56,\n", - " '인_': 57,\n", - " '하고_': 58,\n", - " '해': 59,\n", - " '보': 60,\n", - " '유': 61,\n", - " '어': 62,\n", - " '이다': 63,\n", - " '상': 64,\n", - " '2': 65,\n", - " ')_': 66,\n", - " '신': 67,\n", - " '원': 68,\n", - " '무': 69,\n", - " '장': 70,\n", - " '3': 71,\n", - " '마': 72,\n", - " '비': 73,\n", - " '조': 74,\n", - " '동': 75,\n", - " '제': 76,\n", - " '로': 77,\n", - " '해_': 78,\n", - " '소': 79,\n", - " '성': 80,\n", - " '도': 81,\n", - " '지_': 82,\n", - " '세': 83,\n", - " '‘': 84,\n", - " '나': 85,\n", - " '오': 86,\n", - " '미': 87,\n", - " '“': 88,\n", - " '공': 89,\n", - " '하': 90,\n", - " '연': 91,\n", - " '있는_': 92,\n", - " '구': 93,\n", - " '라': 94,\n", - " '재': 95,\n", - " '한': 96,\n", - " '여': 97,\n", - " '5': 98,\n", - " '4': 99}" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 100개만 출력해보기\n", - "# 뒤의 품사에 `_`가 붙은 것을 잘 기억해두기\n", - "{key:value for i, (key, value) in enumerate(rawtext_vocab.items()) if i < 100}" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(30797, 30349)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(rawtext_vocab), len(morph_vocab)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'vocab' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# 역방향 사전 정의\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;31m# 근데 결국 사용안하드라\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0minv_vocab\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[0mv\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mk\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mk\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mv\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mvocab\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;31mNameError\u001b[0m: name 'vocab' is not defined" - ] - } - ], - "source": [ - "# 역방향 사전 정의\n", - "# 근데 결국 사용안하드라\n", - "inv_vocab = {v: k for k, v in vocab.items()}" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1000000000000" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } 
- ], - "source": [ - "# max_len을 사용자가 넣지 않았을 경우\n", - "# 1000000000000을 상한으로 함\n", - "# 사실상 무한대지 뭐.\n", - "max_len = max_len if max_len is not None else int(1e12)\n", - "max_len" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'심사숙고/NNG 하/XSV 였/EP 겠/EP 지만/EC 참으로/MAG 유감/NNG 이/JX 야/EC'" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Input Text\n", - "text = '심사숙고했겠지만 참으로 유감이야' # 예시 text 생성\n", - "# 형태소 분석을 아래와 같이 실시했다고 가정하자.\n", - "# 형태소 분석 API부분은 `convert_single_example`함수에서 다시 다룰게요.\n", - "text = '심사숙고/NNG + 하/XSV + 였/EP + 겠/EP + 지만/EC + 참으로/MAG + 유감/NNG + 이/JX + 야/EC'\n", - "# ETRI에서 정의한 대로 input을 만들어줍시다.\n", - "text = text.replace(' + ', ' ')\n", - "text" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `BERTTokenizer.tokenize()`\n", - "- 아래와 같이 동작한다.\n", - " ```python\n", - " def tokenize(self, text):\n", - " split_tokens = []\n", - " # End to End Tokenizing.\n", - " for token in self.basic_tokenizer.tokenize(text):\n", - " # ETRI Vocab 양식에 맞게 token 끝에 '_'를 붙여준다.\n", - " token += '_'\n", - " for sub_token in self.wordpiece_tokenizer.tokenize(token):\n", - " split_tokens.append(sub_token)\n", - " return split_tokens\n", - " ```\n", - "- 여기서 `BasicTokenizer`와 `WordpieceTokenizer`를 정의하지 않고 어떻게 동작하는지 그 흐름대로 살펴보겠다." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "SPLIT_TOKENS = [] # 최종적으로 return할 list 생성" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### `BasicTokenizer.tokenize()`\n", - "- 아래와 같이 동작한다.\n", - " ```python\n", - " def tokenize(self, text):\n", - " text = convert_to_unicode(text) #1\n", - " text = self._clean_text(text) #2\n", - "\n", - " orig_tokens = whitespace_tokenize(text) #3\n", - " split_tokens = []\n", - " for token in orig_tokens:\n", - " if self.do_lower_case:\n", - " # 현재 input으로 '고객/NNG'와 같이 Part-of-speech가 이미\n", - " # tagging되어있고 vocab은 '고객/NNG_'로 단어를 기록하고 있음.\n", - " # 여기서 `lower` 메서드를 사용하면 뒤의 tagging이 소문자로\n", - " # 변환되어 값의 비교를 못하게 되므로 이를 주석처리.\n", - "\n", - " # token.lower()\n", - "\n", - " # 모든 음절을 정준 분해시키는 함수\n", - " token = self._run_strip_accents(token) #4\n", - " split_tokens.extend(self._run_split_on_punc(token)) #5\n", - " output_tokens = whitespace_tokenize(\" \".join(split_tokens)) #6\n", - " return output_tokens\n", - " ```\n", - "- 순서대로 보겠다." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'심사숙고/NNG 하/XSV 였/EP 겠/EP 지만/EC 참으로/MAG 유감/NNG 이/JX 야/EC'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#1. 
unicode 변환\n", - "text = convert_to_unicode(text)\n", - "text" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "## Tokenize하면서 계속 사용된 character 단위 함수 정의\n", - "def _is_control(char):\n", - " if char == \"\\t\" or char == \"\\n\" or char == \"\\r\":\n", - " # 개행문자이면 False 반환\n", - " return False\n", - " cat = unicodedata.category(char)\n", - " if cat.startswith(\"C\"):\n", - " # unicode category가\n", - " # Cc(Control)\n", - " # Cf(format)\n", - " # Co(Private Use, is 0)\n", - " # Cs(Surrrogate, is 0)일 경우, True 반환\n", - " # https://en.wikipedia.org/wiki/Control_character\n", - " return True\n", - " # 이 외의 경우 전부 False 반환\n", - " return False\n", - " \n", - "def _is_whitespace(char):\n", - " if char == \" \" or char == '\\t' or char == '\\n' or char == '\\r':\n", - " # 개행문자이거나 띄어쓰기면 True 반환\n", - " return True\n", - " cat = unicodedata.category(char)\n", - " if cat == 'Zs':\n", - " # unicode category가 Space Seperator면 True 반환\n", - " # https://www.compart.com/en/unicode/category/Zs\n", - " return True\n", - " # 이 외의 경우 전부 False 반환\n", - " return False\n", - "\n", - "def _is_punctuation(char):\n", - " # 한국어 형태소 분석기이기 때문에 공백과 같은지 여부만 반환\n", - " return char == ' '" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'심사숙고/NNG 하/XSV 였/EP 겠/EP 지만/EC 참으로/MAG 유감/NNG 이/JX 야/EC'" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#2. text cleaning\n", - "def _clean_text(text):\n", - " output = [] # char을 저장할 list 생성\n", - " for char in text:\n", - " # 텍스트에서 Char 단위로 출력\n", - " cp = ord(char)\n", - " if cp == 0 or cp == 0xfffd or _is_control(char):\n", - " # \\x00이거나 �이거나 unicode cat.이 C로 시작할 경우\n", - " # (개행문자 제외) output에 추가하지 않는다.\n", - " continue\n", - " if _is_whitespace(char):\n", - " # 공백일 경우 \" \"으로 output에 추가\n", - " output.append(\" \")\n", - " else:\n", - " # 이 외의 경우 전부 output에 추가\n", - " output.append(char)\n", - " # cleaning 작업을 거친 Text를 후처리하여 반환\n", - " return \"\".join(output)\n", - "\n", - "_clean_text(text) # 뭐가 변했을까? 안변한거 같지>?" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'심사숙고했겠지만 참으로 유감이야'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 더러운 text가 있다고 생각해봐.\n", - "dirty_text = '심사\\x00숙고했겠�지만 참으로 유감이야'\n", - "_clean_text(dirty_text) # 이렇게 바꿔준다.\n", - " # 영어 BERT에는 중국어 변환, 기타 unicode도 신경쓰는데\n", - " # ETRI에서 이렇게 수정해서 코드를 배포했으니 잘 사용하도록 하자!\n", - " # 아니 근데 생각해보니까 이거 형태소 분석 전에 실시해야하는거 아니야?\n", - " # 코드 다시 짜는거 생각해보자" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['심사숙고/NNG',\n", - " '하/XSV',\n", - " '였/EP',\n", - " '겠/EP',\n", - " '지만/EC',\n", - " '참으로/MAG',\n", - " '유감/NNG',\n", - " '이/JX',\n", - " '야/EC']" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#3. 
whitespacing(띄어쓰기로 token화)\n", - "# text 단위 공백 처리\n", - "def whitespace_tokenize(text):\n", - " \"\"\"Runs basic whitespace cleaning and splitting on a piece of text.\"\"\"\n", - " text = text.strip() # 양 사이드의 공백을 제거\n", - " if not text: # 어떠한 값도 없을 시, 빈 list를 반환\n", - " return []\n", - " tokens = text.split() # 공백 단위로 쪼갠 list를 반환\n", - " return tokens\n", - "\n", - "orig_tokens = whitespace_tokenize(text)\n", - "orig_tokens" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'심사숙고/NNG'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "split_tokens = []\n", - "# >>> 첫 번째 for loop\n", - "token = orig_tokens[0]\n", - "token # for loop 돌리기 전에 어떻게 돌아가는지 체크\n", - "# token = '심사 숙고'" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "심사숙고/NNG >> ['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N', 'G']\n", - "print(token) == 심사숙고/NNG (사실 출력시에는 변화 X)\n" - ] - }, - { - "data": { - "text/plain": [ - "'심사숙고/NNG'" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#4. 음절을 정준분해\n", - "print(token, end=' >> ')\n", - "token = unicodedata.normalize(\"NFD\", token)\n", - "print(list(token))\n", - "print('print(token) ==', token, '(사실 출력시에는 변화 X)')\n", - "# https://gist.github.com/Pusnow/aa865fa21f9557fa58d691a8b79f8a6d\n", - "# 모든 음절을 정준 분해(Canonical Decomposition)시킴\n", - "# '각'을 'ㄱ+ㅏ+ㄱ'으로 저장(출력되는 값은 동일)\n", - "output = []\n", - "for char in token:\n", - " cat = unicodedata.category(char)\n", - " if cat == \"Mn\":\n", - " # unicode category가 \"Mark, Nonspacing\"일 경우 pass\n", - " continue\n", - " output.append(char)\n", - "token = ''.join(output)\n", - "token # if문에 해당하는 char가 없었기에 원본 text를 출력\n", - " # 정준분해된 상태임을 기억해라" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N', 'G']\n" - ] - } - ], - "source": [ - "#5. 
punctuation 구분(사실상 의미가 없다)\n", - "chars = list(token)\n", - "i, start_new_word = 0, True\n", - "output = []\n", - "print(chars)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄉ\tFalse\t\t\tFalse\t\t[['ᄉ']]\n", - "ᅵ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ']]\n", - "ᆷ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ']]\n", - "ᄉ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ']]\n", - "ᅡ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ']]\n", - "ᄉ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ']]\n", - "ᅮ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ']]\n", - "ᆨ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ']]\n", - "ᄀ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ']]\n", - "ᅩ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/']]\n", - "N\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N']]\n", - "N\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N']]\n", - "G\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N', 'G']]\n" - ] - }, - { - "data": { - "text/plain": [ - "['심사숙고/NNG']" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print('char\\t_is_puntuation(char)\\tstart_new_word\\tOutput')\n", - "while i < len(chars):\n", - " char = chars[i]\n", - " print(char, end='\\t')\n", - " print(_is_punctuation(char), end='\\t\\t\\t')\n", - " if _is_punctuation(char):\n", - " print('In Here!! ')\n", - " output.append([char])\n", - " start_new_word = True\n", - " else:\n", - " if start_new_word:\n", - " output.append([])\n", - " start_new_word = False\n", - " output[-1].append(char)\n", - " print(start_new_word, end='\\t\\t')\n", - " print(output)\n", - " i += 1\n", - "[\"\".join(x) for x in output]" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "심사숙고/NNG >> ['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N', 'G']\n", - "print(token) == 심사숙고/NNG (사실 출력시에는 변화 X)\n", - "['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N', 'G']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄉ\tFalse\t\t\tFalse\t\t[['ᄉ']]\n", - "ᅵ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ']]\n", - "ᆷ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ']]\n", - "ᄉ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ']]\n", - "ᅡ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ']]\n", - "ᄉ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ']]\n", - "ᅮ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ']]\n", - "ᆨ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ']]\n", - "ᄀ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ']]\n", - "ᅩ\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/']]\n", - "N\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N']]\n", - "N\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N']]\n", - "G\tFalse\t\t\tFalse\t\t[['ᄉ', 'ᅵ', 'ᆷ', 'ᄉ', 'ᅡ', 'ᄉ', 'ᅮ', 'ᆨ', 'ᄀ', 'ᅩ', '/', 'N', 'N', 'G']]\n", - "하/XSV >> ['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']\n", - 
"print(token) == 하/XSV (사실 출력시에는 변화 X)\n", - "['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄒ\tFalse\t\t\tFalse\t\t[['ᄒ']]\n", - "ᅡ\tFalse\t\t\tFalse\t\t[['ᄒ', 'ᅡ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄒ', 'ᅡ', '/']]\n", - "X\tFalse\t\t\tFalse\t\t[['ᄒ', 'ᅡ', '/', 'X']]\n", - "S\tFalse\t\t\tFalse\t\t[['ᄒ', 'ᅡ', '/', 'X', 'S']]\n", - "V\tFalse\t\t\tFalse\t\t[['ᄒ', 'ᅡ', '/', 'X', 'S', 'V']]\n", - "였/EP >> ['ᄋ', 'ᅧ', 'ᆻ', '/', 'E', 'P']\n", - "print(token) == 였/EP (사실 출력시에는 변화 X)\n", - "['ᄋ', 'ᅧ', 'ᆻ', '/', 'E', 'P']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄋ\tFalse\t\t\tFalse\t\t[['ᄋ']]\n", - "ᅧ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅧ']]\n", - "ᆻ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅧ', 'ᆻ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅧ', 'ᆻ', '/']]\n", - "E\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅧ', 'ᆻ', '/', 'E']]\n", - "P\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅧ', 'ᆻ', '/', 'E', 'P']]\n", - "겠/EP >> ['ᄀ', 'ᅦ', 'ᆻ', '/', 'E', 'P']\n", - "print(token) == 겠/EP (사실 출력시에는 변화 X)\n", - "['ᄀ', 'ᅦ', 'ᆻ', '/', 'E', 'P']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄀ\tFalse\t\t\tFalse\t\t[['ᄀ']]\n", - "ᅦ\tFalse\t\t\tFalse\t\t[['ᄀ', 'ᅦ']]\n", - "ᆻ\tFalse\t\t\tFalse\t\t[['ᄀ', 'ᅦ', 'ᆻ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄀ', 'ᅦ', 'ᆻ', '/']]\n", - "E\tFalse\t\t\tFalse\t\t[['ᄀ', 'ᅦ', 'ᆻ', '/', 'E']]\n", - "P\tFalse\t\t\tFalse\t\t[['ᄀ', 'ᅦ', 'ᆻ', '/', 'E', 'P']]\n", - "지만/EC >> ['ᄌ', 'ᅵ', 'ᄆ', 'ᅡ', 'ᆫ', '/', 'E', 'C']\n", - "print(token) == 지만/EC (사실 출력시에는 변화 X)\n", - "['ᄌ', 'ᅵ', 'ᄆ', 'ᅡ', 'ᆫ', '/', 'E', 'C']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄌ\tFalse\t\t\tFalse\t\t[['ᄌ']]\n", - "ᅵ\tFalse\t\t\tFalse\t\t[['ᄌ', 'ᅵ']]\n", - "ᄆ\tFalse\t\t\tFalse\t\t[['ᄌ', 'ᅵ', 'ᄆ']]\n", - "ᅡ\tFalse\t\t\tFalse\t\t[['ᄌ', 'ᅵ', 'ᄆ', 'ᅡ']]\n", - "ᆫ\tFalse\t\t\tFalse\t\t[['ᄌ', 'ᅵ', 'ᄆ', 'ᅡ', 'ᆫ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄌ', 'ᅵ', 'ᄆ', 'ᅡ', 'ᆫ', '/']]\n", - "E\tFalse\t\t\tFalse\t\t[['ᄌ', 'ᅵ', 'ᄆ', 'ᅡ', 'ᆫ', '/', 'E']]\n", - "C\tFalse\t\t\tFalse\t\t[['ᄌ', 'ᅵ', 'ᄆ', 'ᅡ', 'ᆫ', '/', 'E', 'C']]\n", - "참으로/MAG >> ['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ', 'ᅩ', '/', 'M', 'A', 'G']\n", - "print(token) == 참으로/MAG (사실 출력시에는 변화 X)\n", - "['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ', 'ᅩ', '/', 'M', 'A', 'G']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄎ\tFalse\t\t\tFalse\t\t[['ᄎ']]\n", - "ᅡ\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ']]\n", - "ᆷ\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ']]\n", - "ᄋ\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ']]\n", - "ᅳ\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ']]\n", - "ᄅ\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ']]\n", - "ᅩ\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ', 'ᅩ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ', 'ᅩ', '/']]\n", - "M\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ', 'ᅩ', '/', 'M']]\n", - "A\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ', 'ᅩ', '/', 'M', 'A']]\n", - "G\tFalse\t\t\tFalse\t\t[['ᄎ', 'ᅡ', 'ᆷ', 'ᄋ', 'ᅳ', 'ᄅ', 'ᅩ', '/', 'M', 'A', 'G']]\n", - "유감/NNG >> ['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "print(token) == 유감/NNG (사실 출력시에는 변화 X)\n", - "['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ', 'ᆷ', '/', 'N', 'N', 'G']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄋ\tFalse\t\t\tFalse\t\t[['ᄋ']]\n", - "ᅲ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ']]\n", - "ᄀ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ', 'ᄀ']]\n", - "ᅡ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ']]\n", - "ᆷ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ', 'ᆷ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ', 'ᆷ', '/']]\n", - 
"N\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ', 'ᆷ', '/', 'N']]\n", - "N\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ', 'ᆷ', '/', 'N', 'N']]\n", - "G\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅲ', 'ᄀ', 'ᅡ', 'ᆷ', '/', 'N', 'N', 'G']]\n", - "이/JX >> ['ᄋ', 'ᅵ', '/', 'J', 'X']\n", - "print(token) == 이/JX (사실 출력시에는 변화 X)\n", - "['ᄋ', 'ᅵ', '/', 'J', 'X']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄋ\tFalse\t\t\tFalse\t\t[['ᄋ']]\n", - "ᅵ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅵ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅵ', '/']]\n", - "J\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅵ', '/', 'J']]\n", - "X\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅵ', '/', 'J', 'X']]\n", - "야/EC >> ['ᄋ', 'ᅣ', '/', 'E', 'C']\n", - "print(token) == 야/EC (사실 출력시에는 변화 X)\n", - "['ᄋ', 'ᅣ', '/', 'E', 'C']\n", - "char\t_is_puntuation(char)\tstart_new_word\tOutput\n", - "ᄋ\tFalse\t\t\tFalse\t\t[['ᄋ']]\n", - "ᅣ\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅣ']]\n", - "/\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅣ', '/']]\n", - "E\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅣ', '/', 'E']]\n", - "C\tFalse\t\t\tFalse\t\t[['ᄋ', 'ᅣ', '/', 'E', 'C']]\n", - "split_tokens: ['심사숙고/NNG', '하/XSV', '였/EP', '겠/EP', '지만/EC', '참으로/MAG', '유감/NNG', '이/JX', '야/EC']\n", - "output_tokens: ['심사숙고/NNG', '하/XSV', '였/EP', '겠/EP', '지만/EC', '참으로/MAG', '유감/NNG', '이/JX', '야/EC']\n" - ] - } - ], - "source": [ - "# for loop을 적용하면 아래와 같이 된다.\n", - "split_tokens = []\n", - "for token in orig_tokens:\n", - " #4. 음절을 정준분해\n", - " print(token, end=' >> ')\n", - " token = unicodedata.normalize(\"NFD\", token)\n", - " print(list(token))\n", - " print('print(token) ==', token, '(사실 출력시에는 변화 X)')\n", - " # https://gist.github.com/Pusnow/aa865fa21f9557fa58d691a8b79f8a6d\n", - " # 모든 음절을 정준 분해(Canonical Decomposition)시킴\n", - " # '각'을 'ㄱ+ㅏ+ㄱ'으로 저장(출력되는 값은 동일)\n", - " output = []\n", - " for char in token:\n", - " cat = unicodedata.category(char)\n", - " if cat == \"Mn\":\n", - " # unicode category가 \"Mark, Nonspacing\"일 경우 pass\n", - " continue\n", - " output.append(char)\n", - " token = ''.join(output) # if문에 해당하는 char가 없었기에 원본 text를 출력\n", - " # 정준분해된 상태임을 기억해라\n", - " #5. punctuation 구분(사실상 의미가 없다)\n", - " chars = list(token)\n", - " i, start_new_word = 0, True\n", - " output = []\n", - " print(chars)\n", - " print('char\\t_is_puntuation(char)\\tstart_new_word\\tOutput')\n", - " while i < len(chars):\n", - " char = chars[i]\n", - " print(char, end='\\t')\n", - " print(_is_punctuation(char), end='\\t\\t\\t')\n", - " if _is_punctuation(char):\n", - " print('In Here!! 
')\n", - " output.append([char])\n", - " start_new_word = True\n", - " else:\n", - " if start_new_word:\n", - " output.append([])\n", - " start_new_word = False\n", - " output[-1].append(char)\n", - " print(start_new_word, end='\\t\\t')\n", - " print(output)\n", - " i += 1\n", - " split_tokens.extend([\"\".join(x) for x in output])\n", - "print('split_tokens:', split_tokens)\n", - "output_tokens = whitespace_tokenize(\" \".join(split_tokens))\n", - "print('output_tokens:', output_tokens)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - ">### `BasicTokenizer` 파트 종료" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['심사숙고/NNG_',\n", - " '하/XSV_',\n", - " '였/EP_',\n", - " '겠/EP_',\n", - " '지만/EC_',\n", - " '참으로/MAG_',\n", - " '유감/NNG_',\n", - " '이/JX_',\n", - " '야/EC_']" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "output_tokens = [token + '_' for token in output_tokens]\n", - "output_tokens # ETRI 단어 사전에 맞게 form을 변경" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "unk_token = '[UNK]'\n", - "max_input_chars_per_word = 200" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "metadata": {}, - "outputs": [], - "source": [ - "vocab = morph_vocab" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "심사숙고/NNG_\n", - "심사숙고/NNG\n", - "심사숙고/NN\n", - "심사숙고/N\n", - "심사숙고/\n", - "심사숙고\n", - "심사숙ᄀ\n", - "심사숙\n", - "심사수\n", - "심사ᄉ\n", - "심사\n", - "심ᄉ\n", - "사숙고/NNG_\n", - "사숙고/NNG\n", - "사숙고/NN\n", - "사숙고/N\n", - "사숙고/\n", - "사숙고\n", - "사숙ᄀ\n", - "사숙\n", - "사수\n", - "사ᄉ\n", - "숙고/NNG_\n", - "숙고/NNG\n", - "숙고/NN\n", - "숙고/N\n", - "숙고/\n", - "숙고\n", - "숙ᄀ\n", - "참으로/MAG_\n", - "참으로/MAG\n", - "참으로/MA\n", - "참으로/M\n", - "참으로/\n", - "참으로\n", - "참으ᄅ\n", - "참으\n", - "참ᄋ\n", - "으로/MAG_\n", - "으로/MAG\n", - "으로/MA\n", - "으로/M\n", - "으로/\n", - "으로\n", - "으ᄅ\n", - "이/JX_\n", - "이/JX\n", - "이/J\n", - "이/\n" - ] - } - ], - "source": [ - "SPLIT_TOKENS = []\n", - "for text in output_tokens:\n", - " text = convert_to_unicode(text)\n", - " _output_tokens = []\n", - " # whitespacing 생략\n", - " chars = list(text)\n", - " if len(chars) > max_input_chars_per_word:\n", - " _output_tokens.append(unk_token)\n", - " is_bad = False\n", - " start = 0\n", - " sub_tokens = []\n", - " while start < len(chars):\n", - " end = len(chars)\n", - " cur_substr = None\n", - " while start < end:\n", - " substr = \"\".join(chars[start:end])\n", - " substr = unicodedata.normalize(\"NFC\", substr)\n", - " if substr in vocab:\n", - " cur_substr = substr\n", - " break\n", - " end -= 1\n", - " print(substr)\n", - " if cur_substr is None:\n", - " is_bad = True\n", - " break\n", - " sub_tokens.append(cur_substr)\n", - " start = end\n", - " if is_bad:\n", - " _output_tokens.append(unk_token)\n", - " else:\n", - " _output_tokens.extend(sub_tokens)\n", - " \n", - " for sub_token in _output_tokens:\n", - " SPLIT_TOKENS.append(sub_token)" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[['[CLS]' '심' '사' '숙' '고/NNG_' '하/XSV_' '였/EP_' '겠/EP_' '지만/EC_' '참' '으'\n", - " '로/MAG_' '유감/NNG_' '이' '/JX_' '야/EC_' '[SEP]']\n", - " ['2' '855' '174' '2341' '576' '9' '840' '124' '74' 
'1855' '2392' '2337'\n", - " '6770' '134' '3087' '4741' '3']]\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "print(np.vstack((tokens, [morph_vocab[token] for token in tokens])))" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [], - "source": [ - "max_seq_length = 200" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 114, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(SPLIT_TOKENS) > max_seq_length - 2" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "metadata": {}, - "outputs": [], - "source": [ - "# The convention in BERT is:\n", - "# (a) For sequence pairs:\n", - "# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]\n", - "# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1\n", - "# (b) For single sequences:\n", - "# tokens: [CLS] the dog is hairy . [SEP]\n", - "# type_ids: 0 0 0 0 0 0 0\n", - "#\n", - "# Where \"type_ids\" are used to indicate whether this is the first\n", - "# sequence or the second sequence. The embedding vectors for 'type=0' and\n", - "# 'type=1' were learned during pre-training and are added to the wordpiece\n", - "# embedding vector (and position vector). This is not \"strictly\" necessary\n", - "# since the [SEP] token unambigiously separates the sequences, but it makes\n", - "# if easier for the model to learn the concept of sequences.\n", - "#\n", - "# For classification tasks, the first vector (corresponding to [CLS]) is\n", - "# used as the \"sentence vector\". Note that this only makes sense because\n", - "# the entire model is fine-tuned.\n", - "\n", - "tokens = [\"[CLS]\"] + SPLIT_TOKENS + [\"[SEP]\"]\n", - "segment_ids = [0] * len(tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 118, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['[CLS]', '심', '사', '숙', '고/NNG_', '하/XSV_', '였/EP_', '겠/EP_', '지만/EC_', '참', '으', '로/MAG_', '유감/NNG_', '이', '/JX_', '야/EC_', '[SEP]']\n" - ] - } - ], - "source": [ - "print(tokens)" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]" - ] - }, - "execution_count": 117, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "segment_ids" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2, 855, 174, 2341, 576, 9, 840, 124, 74, 1855, 2392, 2337, 6770, 134, 3087, 4741, 3]\n" - ] - } - ], - "source": [ - "input_ids = [morph_vocab[token] for token in tokens]\n", - "print(input_ids)" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]" - ] - }, - "execution_count": 122, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# The mask has 1 for real tokens and 0 for padding tokens. 
Only real\n", - "# tokens are attended to.\n", - "input_mask = [1] * len(input_ids)\n", - "input_mask" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "metadata": {}, - "outputs": [], - "source": [ - "# Zero-pad up to the sequence length.\n", - "padding = [0] * (max_seq_length - len(input_ids))\n", - "input_ids += padding\n", - "input_mask += padding\n", - "segment_ids += padding" - ] - }, - { - "cell_type": "code", - "execution_count": 124, - "metadata": {}, - "outputs": [], - "source": [ - "assert len(input_ids) == max_seq_length\n", - "assert len(input_mask) == max_seq_length\n", - "assert len(segment_ids) == max_seq_length" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "basic", - "language": "python", - "name": "basic" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}
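The deleted notebook above traces KorBERT/ETRI-style morpheme tokenization one cell at a time: whitespace splitting, NFD canonical decomposition (dropping "Mn" combining marks), punctuation splitting, appending the ETRI '_' suffix, greedy longest-match WordPiece lookup (each candidate substring NFC-recomposed before the vocab check), and finally BERT-style [CLS]/[SEP] wrapping with input_ids, input_mask, segment_ids and zero padding. Below is a minimal consolidated sketch of that pipeline, not the notebook's actual code: the tiny morph_vocab is a hypothetical stand-in for the real ETRI vocabulary file, max_seq_length is shortened from 200 to 16 for readability, and _is_punctuation mirrors the behaviour observed in the notebook's printed output, where '/' is not treated as punctuation so POS-tagged morphemes such as '심사숙고/NNG' stay intact.

import unicodedata

# Hypothetical toy vocabulary standing in for the ETRI morpheme vocab
# (the real vocab file is not part of this diff); the ids were chosen to
# match the handful printed in the notebook.
morph_vocab = {
    "[PAD]": 0, "[UNK]": 1, "[CLS]": 2, "[SEP]": 3,
    "하/XSV_": 9, "심": 855, "사": 174, "숙": 2341, "고/NNG_": 576,
}

def whitespace_tokenize(text):
    """Strip the text and split it on whitespace."""
    text = text.strip()
    return text.split() if text else []

def _is_punctuation(char):
    """Rough stand-in for the tokenizer's _is_punctuation; the notebook's
    version returns False for '/', so POS tags like '/NNG' stay attached."""
    if char == "/":
        return False
    cp = ord(char)
    if 33 <= cp <= 47 or 58 <= cp <= 64 or 91 <= cp <= 96 or 123 <= cp <= 126:
        return True
    return unicodedata.category(char).startswith("P")

def basic_tokenize(text):
    """Whitespace split -> NFD decomposition (dropping 'Mn' marks) -> punctuation split."""
    split_tokens = []
    for token in whitespace_tokenize(text):
        token = "".join(
            ch for ch in unicodedata.normalize("NFD", token)
            if unicodedata.category(ch) != "Mn"
        )
        word = []
        for ch in token:
            if _is_punctuation(ch):
                if word:
                    split_tokens.append("".join(word))
                    word = []
                split_tokens.append(ch)
            else:
                word.append(ch)
        if word:
            split_tokens.append("".join(word))
    return split_tokens

def wordpiece_tokenize(tokens, vocab, unk_token="[UNK]", max_chars_per_word=200):
    """Greedy longest-match-first lookup; substrings are NFC-recomposed before lookup."""
    out = []
    for token in tokens:
        chars = list(token)
        if len(chars) > max_chars_per_word:
            out.append(unk_token)
            continue
        start, sub_tokens, is_bad = 0, [], False
        while start < len(chars):
            end, cur_substr = len(chars), None
            while start < end:
                substr = unicodedata.normalize("NFC", "".join(chars[start:end]))
                if substr in vocab:
                    cur_substr = substr
                    break
                end -= 1
            if cur_substr is None:
                is_bad = True
                break
            sub_tokens.append(cur_substr)
            start = end
        out.extend([unk_token] if is_bad else sub_tokens)
    return out

def build_inputs(text, vocab, max_seq_length=16):
    """[CLS]/[SEP] wrapping, id lookup, attention mask, segment ids, zero padding."""
    # ETRI convention: every morpheme token carries a trailing '_'.
    pieces = wordpiece_tokenize([t + "_" for t in basic_tokenize(text)], vocab)
    tokens = ["[CLS]"] + pieces + ["[SEP]"]
    input_ids = [vocab.get(t, vocab["[UNK]"]) for t in tokens]
    input_mask = [1] * len(input_ids)      # 1 = real token, 0 = padding
    segment_ids = [0] * len(input_ids)     # single sentence -> all zeros
    padding = [0] * (max_seq_length - len(input_ids))
    return input_ids + padding, input_mask + padding, segment_ids + padding

input_ids, input_mask, segment_ids = build_inputs("심사숙고/NNG 하/XSV", morph_vocab)
print(input_ids)   # [2, 855, 174, 2341, 576, 9, 3, 0, ...]
print(input_mask)
print(segment_ids)

With this toy vocabulary the example reproduces the '심' / '사' / '숙' / '고/NNG_' split that the notebook prints for '심사숙고/NNG_': decomposition to jamo (NFD) is what makes partial syllable matches possible during the greedy search, while the NFC recomposition before each lookup keeps the vocabulary keys in their composed form.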