From 0eb99f9041754bb7d4f78d025b76b20b87cc5adb Mon Sep 17 00:00:00 2001
From: Tab Zhang <1071629548@qq.com>
Date: Thu, 1 Aug 2019 23:54:09 +0800
Subject: [PATCH 1/2] Update utils.py

---
 utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/utils.py b/utils.py
index 2648c500..e0827a91 100644
--- a/utils.py
+++ b/utils.py
@@ -6,6 +6,7 @@
 from tqdm import tqdm
 import time
 from datetime import timedelta
+import jieba


 MAX_VOCAB_SIZE = 10000  # vocab size limit
@@ -30,7 +31,7 @@ def build_vocab(file_path, tokenizer, max_size, min_freq):

 def build_dataset(config, ues_word):
     if ues_word:
-        tokenizer = lambda x: x.split(' ')  # split on spaces, word-level
+        tokenizer = lambda x: list(jieba.cut(x))
     else:
         tokenizer = lambda x: [y for y in x]  # char-level
     if os.path.exists(config.vocab_path):

From 4413809cabb71083efe2b2832ff96dfa5466a597 Mon Sep 17 00:00:00 2001
From: Tab Zhang <1071629548@qq.com>
Date: Thu, 1 Aug 2019 23:54:46 +0800
Subject: [PATCH 2/2] Update utils_fasttext.py

---
 utils_fasttext.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/utils_fasttext.py b/utils_fasttext.py
index fe349252..4eea99da 100644
--- a/utils_fasttext.py
+++ b/utils_fasttext.py
@@ -6,6 +6,7 @@
 from tqdm import tqdm
 import time
 from datetime import timedelta
+import jieba


 MAX_VOCAB_SIZE = 10000
@@ -30,7 +31,7 @@ def build_vocab(file_path, tokenizer, max_size, min_freq):

 def build_dataset(config, ues_word):
     if ues_word:
-        tokenizer = lambda x: x.split(' ')  # split on spaces, word-level
+        tokenizer = lambda x: list(jieba.cut(x))
     else:
         tokenizer = lambda x: [y for y in x]  # char-level
     if os.path.exists(config.vocab_path):
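
Both patches make the same change: the old word-level tokenizer assumed the input
was already segmented and space-separated, which raw Chinese text never is, so
word-level mode effectively returned each whole sentence as a single token. The
patches swap it for jieba word segmentation. A minimal sketch of the before/after
behavior follows; the sample sentence is invented for illustration, and the exact
segmentation depends on jieba's dictionary and version:

import jieba

text = "我爱北京天安门"  # illustrative sentence, not from the repo's dataset

# Old word-level tokenizer: raw Chinese has no spaces, so the whole
# sentence comes back as one "word".
old_tokenizer = lambda x: x.split(' ')
print(old_tokenizer(text))   # ['我爱北京天安门']

# New word-level tokenizer from the patch: jieba.cut returns a generator
# of segmented words, materialized here into a list.
new_tokenizer = lambda x: list(jieba.cut(x))
print(new_tokenizer(text))   # e.g. ['我', '爱', '北京', '天安门']

# Unchanged char-level fallback: one token per character.
char_tokenizer = lambda x: [y for y in x]
print(char_tokenizer(text))  # ['我', '爱', '北', '京', '天', '安', '门']

Note that jieba is a third-party package, so it must be installed (pip install
jieba) before build_dataset is called with ues_word set.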