From 0eb99f9041754bb7d4f78d025b76b20b87cc5adb Mon Sep 17 00:00:00 2001
From: Tab Zhang <1071629548@qq.com>
Date: Thu, 1 Aug 2019 23:54:09 +0800
Subject: [PATCH 1/2] Update utils.py

---
 utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/utils.py b/utils.py
index 2648c500..e0827a91 100644
--- a/utils.py
+++ b/utils.py
@@ -6,6 +6,7 @@
 from tqdm import tqdm
 import time
 from datetime import timedelta
+import jieba


 MAX_VOCAB_SIZE = 10000  # vocab size limit
@@ -30,7 +31,7 @@ def build_vocab(file_path, tokenizer, max_size, min_freq):

 def build_dataset(config, ues_word):
     if ues_word:
-        tokenizer = lambda x: x.split(' ')  # split on spaces, word-level
+        tokenizer = lambda x: list(jieba.cut(x))
     else:
         tokenizer = lambda x: [y for y in x]  # char-level
     if os.path.exists(config.vocab_path):

From 4413809cabb71083efe2b2832ff96dfa5466a597 Mon Sep 17 00:00:00 2001
From: Tab Zhang <1071629548@qq.com>
Date: Thu, 1 Aug 2019 23:54:46 +0800
Subject: [PATCH 2/2] Update utils_fasttext.py

---
 utils_fasttext.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/utils_fasttext.py b/utils_fasttext.py
index fe349252..4eea99da 100644
--- a/utils_fasttext.py
+++ b/utils_fasttext.py
@@ -6,6 +6,7 @@
 from tqdm import tqdm
 import time
 from datetime import timedelta
+import jieba


 MAX_VOCAB_SIZE = 10000
@@ -30,7 +31,7 @@ def build_vocab(file_path, tokenizer, max_size, min_freq):

 def build_dataset(config, ues_word):
     if ues_word:
-        tokenizer = lambda x: x.split(' ')  # split on spaces, word-level
+        tokenizer = lambda x: list(jieba.cut(x))
     else:
         tokenizer = lambda x: [y for y in x]  # char-level
     if os.path.exists(config.vocab_path):
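
Both patches make the same change: the old word-level tokenizer assumed the input
was already segmented and space-separated, which raw Chinese text never is, so
word-level mode effectively returned each whole sentence as a single token. The
patches swap it for jieba word segmentation. A minimal sketch of the before/after
behavior follows; the sample sentence is invented for illustration, and the exact
segmentation depends on jieba's dictionary and version:

import jieba

text = "我爱北京天安门"  # illustrative sentence, not from the repo's dataset

# Old word-level tokenizer: raw Chinese has no spaces, so the whole
# sentence comes back as one "word".
old_tokenizer = lambda x: x.split(' ')
print(old_tokenizer(text))   # ['我爱北京天安门']

# New word-level tokenizer from the patch: jieba.cut returns a generator
# of segmented words, materialized here into a list.
new_tokenizer = lambda x: list(jieba.cut(x))
print(new_tokenizer(text))   # e.g. ['我', '爱', '北京', '天安门']

# Unchanged char-level fallback: one token per character.
char_tokenizer = lambda x: [y for y in x]
print(char_tokenizer(text))  # ['我', '爱', '北', '京', '天', '安', '门']

Note that jieba is a third-party package, so it must be installed (pip install
jieba) before build_dataset is called with ues_word set.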