diff --git a/utils.py b/utils.py index 2648c500..e0827a91 100644 --- a/utils.py +++ b/utils.py @@ -6,6 +6,7 @@ from tqdm import tqdm import time from datetime import timedelta +import jieba MAX_VOCAB_SIZE = 10000 # 词表长度限制 @@ -30,7 +31,7 @@ def build_vocab(file_path, tokenizer, max_size, min_freq): def build_dataset(config, ues_word): if ues_word: - tokenizer = lambda x: x.split(' ') # 以空格隔开,word-level + tokenizer = lambda x: list(jieba.cut(x)) else: tokenizer = lambda x: [y for y in x] # char-level if os.path.exists(config.vocab_path): diff --git a/utils_fasttext.py b/utils_fasttext.py index fe349252..4eea99da 100644 --- a/utils_fasttext.py +++ b/utils_fasttext.py @@ -6,6 +6,7 @@ from tqdm import tqdm import time from datetime import timedelta +import jieba MAX_VOCAB_SIZE = 10000 @@ -30,7 +31,7 @@ def build_vocab(file_path, tokenizer, max_size, min_freq): def build_dataset(config, ues_word): if ues_word: - tokenizer = lambda x: x.split(' ') # 以空格隔开,word-level + tokenizer = lambda x: list(jieba.cut(x)) else: tokenizer = lambda x: [y for y in x] # char-level if os.path.exists(config.vocab_path):