From 98d8998af19ad8e51ad111d7f6be1ddb03c675d3 Mon Sep 17 00:00:00 2001
From: trgho <trghoer@gmail.com>
Date: Fri, 21 Jul 2017 00:49:40 +0800
Subject: [PATCH 1/3] =?UTF-8?q?add=20A15564-0x04.py=20#=20=E4=BC=A4?=
 =?UTF-8?q?=E5=BF=83=E7=9A=84=E7=9A=AE=E8=9B=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 A15564-0x04.py | 145 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 A15564-0x04.py

diff --git a/A15564-0x04.py b/A15564-0x04.py
new file mode 100644
index 0000000..9b3a0bc
--- /dev/null
+++ b/A15564-0x04.py
@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+
+import codecs
+import os
+
+#1. Read files
+def word_split(words):
+    """Split every hyphenated entry of ``words`` into its parts.
+
+    Entries without a hyphen are kept as they are, for example
+    ['aa', 'aaa-bbb-sds'] => ['aa', 'aaa', 'bbb', 'sds'].
+    """
+    pieces = []
+    for entry in words:
+        pieces.extend(entry.split('-'))  # no '-' gives a one-element list
+    return pieces
+
+
+def read_file(file_path):
+    """Return the words of a UTF-8 text file, split on spaces and hyphens."""
+    word_list = []
+    # The with-block closes the file, which the old code never did.
+    with codecs.open(file_path, 'r', "utf-8") as f:
+        for line in f.readlines():
+            # Split on single spaces first, then break up hyphenated words.
+            words = line.strip().split(" ")
+            word_list.extend(word_split(words))
+    return word_list
+
+def get_file_from_folder(folder_path):
+    """Return the path of every file found under ``folder_path``, recursively."""
+    file_paths = []
+    # os.walk yields (directory, sub-directories, file names) per directory.
+    for root, _dirs, names in os.walk(folder_path):
+        file_paths.extend(os.path.join(root, name) for name in names)
+    return file_paths
+
+def read_files(file_paths):
+    """Return the words of all ``file_paths`` as one flat list, in order."""
+    collected = []
+    for each_path in file_paths:
+        collected += read_file(each_path)
+    return collected
+
+
+#2. Normalise words
+def format_word(word):
+    """Lowercase ``word`` and drop every character except a-z and '-'."""
+    fmt = 'abcdefghijklmnopqrstuvwxyz-'
+    # Lowercase BEFORE filtering: filtering first deleted the capitals,
+    # turning e.g. 'The' into 'he'.
+    return ''.join(char for char in word.lower() if char in fmt)
+
+def format_words(words):
+    """Normalise every word with format_word and drop the empty results."""
+    normalised = [format_word(raw) for raw in words]
+    # Words made up only of digits or punctuation normalise to '' and
+    # are left out of the returned list.
+    kept = [wd for wd in normalised if wd]
+    return kept
+
+#3. Count word occurrences
+def statistics_words(words):
+    """Count the words and return (word, count) pairs, most frequent first.
+
+    Example: ['aa', 'bb', 'aa'] => [('aa', 2), ('bb', 1)].
+    """
+    s_word_dict = {}
+    for word in words:
+        # dict.get replaces the deprecated has_key() lookup.
+        s_word_dict[word] = s_word_dict.get(word, 0) + 1
+    # Sort by count, descending; items() works on Python 2 and 3 alike.
+    return sorted(s_word_dict.items(), key=lambda d: d[1], reverse=True)
+
+
+#4. Compute the cumulative percentage of each word
+def word_rating(vocabulary_list, total_count):
+    """Return (word, count, cumulative percent of total_count) tuples."""
+    rate_list = []
+    running_total = 0
+    for entry in vocabulary_list:
+        running_total += entry[1]  # entry is a (word, count) pair
+        # Share of total_count covered by the entries seen so far.
+        percent = (float(running_total) / total_count) * 100
+        rate_list.append((entry[0], entry[1], percent))
+    return rate_list
+
+
+#5. Keep only the words inside the given cumulative-percentage range
+def in_section_words(rate_list, rate_section):  # get words within given rate section
+    final_list = [] # new list for words within given rate section
+    for val in rate_list:
+        if val[2] >= 100 * rate_section[0] and val[2] <= 100 * rate_section[1]:  # bounds are scaled by 100 to compare with val[2]; both ends inclusive
+            rate_tuple = (val[0], val[1], val[2])
+            final_list.append(rate_tuple)
+    return final_list
+
+
+#6. 输出成csv
+def print_to_csv(final_list, to_file_path):
+    nfile = open(to_file_path,'w+')
+    for val in final_list:
+        nfile.write("%s, %s, %0.2f%%\n" % (val[0], str(val[1]), val[2]))
+    nfile.close()
+
+#4'. 输出成csv
+def print_to_csv2(word_list, to_file_path):
+    nfile = open(to_file_path,'w+')
+    nfile.write("".join("%s, %s\n" % (val[0], str(val[1])) for val in word_list))
+    nfile.close()  # was missing: the handle leaked and output could stay unflushed
+
+def main():
+    #1. 读取文本
+    words = read_files(get_file_from_folder('data1'))
+    print '获取了未格式化的单词 %d 个' % (len(words))
+
+    #2. 清洗文本
+    f_words = format_words(words)
+    print '获取了已经格式化的单词 %d 个 ' %(len(f_words))
+    total_word_count = len(f_words)
+
+    #3. 统计单词和排序
+    word_list = statistics_words(f_words)
+    print '最终总单词数 %d 个 ' %(len(word_list))
+
+    # 是否进行百分比统计
+    rating = True # True for rating, False for not rating
+    if rating:
+        #4. 计算单词累计百分比
+        rate_list = word_rating(word_list, total_word_count) # inherit rate_list from word_rating()
+
+        #5. 截取百分比内的单词
+        start_and_end = [0.5, 0.7] #截取这一部分的单词
+        final_list = in_section_words(rate_list, start_and_end)
+
+        #6. 输出文件
+        print_to_csv(final_list, 'output/test2.csv')
+    else: # not rating
+        #4'. 输出文件
+        print_to_csv2(word_list, 'output/test3.csv')
+
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 9d849dd3b5c8bad10f13e858cd76051208451c17 Mon Sep 17 00:00:00 2001
From: trgho <trghoer@gmail.com>
Date: Fri, 21 Jul 2017 21:43:11 +0800
Subject: [PATCH 2/3] update A15564-0x04 to 0x05

---
 A15564-0x04.py => A15564-0x05.py | 48 ++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 11 deletions(-)
 rename A15564-0x04.py => A15564-0x05.py (71%)

diff --git a/A15564-0x04.py b/A15564-0x05.py
similarity index 71%
rename from A15564-0x04.py
rename to A15564-0x05.py
index 9b3a0bc..9c5c4a7 100644
--- a/A15564-0x04.py
+++ b/A15564-0x05.py
@@ -95,12 +95,31 @@ def in_section_words(rate_list, rate_section):  # get words within given rate se
             final_list.append(rate_tuple)
     return final_list
 
+#6. Look up the meaning of each word
+def get_explanation(file_path, current_list):
+    with codecs.open(file_path, 'r', "utf-8") as f:  # closes the file after reading
+        lines = f.readlines()
+    word_dic = {}
+    for line in lines:
+        line = line.strip()
+        line = line.replace(']', ' ') ### BUG KILLER ### very important!!!!!!
+        words = line.split("   ") #用空格分割
+        # words = word_split(words) #用-分割  # not necessary
+        word_dic[words[0]] = words[1]
+    fi_final_list = []
+    for val in current_list:
+        if word_dic.has_key(val[0]):
+            word_tuple = (val[0], val[1], val[2], word_dic[val[0]])
+        else:
+            word_tuple = (val[0], val[1], val[2], '#暂无释义#')
+        fi_final_list.append(word_tuple)
+    return fi_final_list
 
-#6. 输出成csv
+#7. 输出成csv
 def print_to_csv(final_list, to_file_path):
     nfile = open(to_file_path,'w+')
     for val in final_list:
-        nfile.write("%s, %s, %0.2f%%\n" % (val[0], str(val[1]), val[2]))
+        nfile.write("%s, %s, %0.2f%%, %s\n" % (val[0], str(val[1]), val[2], val[3]))
     nfile.close()
 
 #4'. 输出成csv
@@ -112,16 +131,16 @@ def print_to_csv2(word_list, to_file_path):
 def main():
     #1. 读取文本
     words = read_files(get_file_from_folder('data1'))
-    print '获取了未格式化的单词 %d 个' % (len(words))
+    print '获取了未格式化的单词 %d 个' % len(words)
 
     #2. 清洗文本
     f_words = format_words(words)
-    print '获取了已经格式化的单词 %d 个 ' %(len(f_words))
+    print '获取了已经格式化的单词 %d 个 ' % len(f_words)
     total_word_count = len(f_words)
 
     #3. 统计单词和排序
     word_list = statistics_words(f_words)
-    print '最终总单词数 %d 个 ' %(len(word_list))
+    print '最终总单词数 %d 个 ' % len(word_list)
 
     # 是否进行百分比统计
     rating = True # True for rating, False for not rating
@@ -133,13 +152,20 @@ def main():
         start_and_end = [0.5, 0.7] #截取这一部分的单词
         final_list = in_section_words(rate_list, start_and_end)
 
-        #6. 输出文件
-        print_to_csv(final_list, 'output/test2.csv')
-    else: # not rating
-        #4'. 输出文件
-        print_to_csv2(word_list, 'output/test3.csv')
+        import sys  ### to solve UnicodeEncodeError
+        reload(sys)
+        sys.setdefaultencoding('utf-8') ###
+
+        #6. 获取释义
+        fi_final_list = get_explanation('8000-words.txt', final_list)
+        print '生成单词表，应背单词 %d 个' % len(fi_final_list)
 
+        #7. 输出文件
+        print_to_csv(fi_final_list, 'output/with_meaning.csv')
 
+    else: # not rating
+        #4'. 输出文件
+        print_to_csv2(word_list, 'output/all_words.csv')
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

From 888245beea8f1976af734d89ee1e3f05cd5e2561 Mon Sep 17 00:00:00 2001
From: trgho <trghoer@gmail.com>
Date: Sun, 23 Jul 2017 10:46:06 +0800
Subject: [PATCH 3/3] update A15564-0x05 to 0x06

---
 A15564-0x05.py => A15564-0x06.py | 58 +++++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 4 deletions(-)
 rename A15564-0x05.py => A15564-0x06.py (68%)

diff --git a/A15564-0x05.py b/A15564-0x06.py
similarity index 68%
rename from A15564-0x05.py
rename to A15564-0x06.py
index 9c5c4a7..5951648 100644
--- a/A15564-0x05.py
+++ b/A15564-0x06.py
@@ -102,10 +102,21 @@ def get_explanation(file_path, current_list):
     word_dic = {}
     for line in lines:
         line = line.strip()
-        line = line.replace(']', ' ') ### BUG KILLER ### very important!!!!!!
-        words = line.split("   ") #用空格分割
-        # words = word_split(words) #用-分割  # not necessary
+        # Each dictionary line is "<word> <meaning>".  Split at the first space
+        # only, so that spaces inside the meaning are preserved.
+        # (The parser this replaces turned ']' into ' ' and split on three
+        # spaces; the author recorded 7982 entries for it versus 7977 here.)
+        words = line.split(' ', 1)
+        # Trim padding around both parts, then drop parts that are empty.
+        words = [part.strip() for part in words]
+        words = [part for part in words if part]
+        if len(words) < 2:
+            # Blank line, or a headword without a meaning: words[1] below
+            # would raise IndexError, so skip the line instead of crashing.
+            continue
+
         word_dic[words[0]] = words[1]
+    # word_dic now maps each headword to its meaning text.
     fi_final_list = []
     for val in current_list:
         if word_dic.has_key(val[0]):
@@ -122,6 +133,24 @@ def print_to_csv(final_list, to_file_path):
         nfile.write("%s, %s, %0.2f%%, %s\n" % (val[0], str(val[1]), val[2], val[3]))
     nfile.close()
 
+#8. Write the daily word lists
+def create_daily_lists(fi_final_list, daily_amount, list_number, to_folder_path):
+    """Write the words to numbered CSV files, ``daily_amount`` words per file.
+
+    Files are named ``<to_folder_path><day>.csv`` for day = 1..list_number;
+    ``to_folder_path`` is used as a plain prefix, so it must end with a path
+    separator.  The last file receives all words that are still left over.
+    At least one (possibly empty) file is always written.
+    """
+    last_day = max(list_number, 1)
+    for day_index in range(1, last_day + 1):
+        start = (day_index - 1) * daily_amount
+        # The final list takes everything that remains.
+        end = None if day_index == last_day else start + daily_amount
+        with open('%s%s.csv' % (to_folder_path, day_index), 'w+') as nfile:
+            for val in fi_final_list[start:end]:
+                nfile.write("%s, %s, %0.2f%%, %s\n" % (val[0], str(val[1]), val[2], val[3]))
+
 #4'. 输出成csv
 def print_to_csv2(word_list, to_file_path):
     nfile = open(to_file_path,'w+')
@@ -158,11 +187,32 @@ def main():
 
         #6. 获取释义
         fi_final_list = get_explanation('8000-words.txt', final_list)
-        print '生成单词表，应背单词 %d 个' % len(fi_final_list)
+        total_amount = len(fi_final_list)
+        print '获得累计百分比范围内单词 %d 个' % total_amount
 
         #7. 输出文件
         print_to_csv(fi_final_list, 'output/with_meaning.csv')
 
+        #8. Generate the daily word lists
+        daily_amount = 50
+        list_number, last_list_number = divmod(total_amount, daily_amount)
+        if last_list_number:
+            list_number += 1  # one extra, shorter list holds the remainder
+        elif total_amount:
+            # Exact multiple: the last list is full (was unset -> NameError below).
+            last_list_number = daily_amount
+
+        # Whether the word lists are shuffled
+        out_of_order = True # True for disordered lists, False for sequenced lists
+        if out_of_order:
+            import random
+            random.shuffle(fi_final_list)
+
+        if not os.path.exists('output/daily_lists/'):  # create the folder on first run
+            os.mkdir('output/daily_lists/')
+        create_daily_lists(fi_final_list, daily_amount, list_number, 'output/daily_lists/')
+        print '生成单词表 %d 个，除最终表外每表含单词 %d 个，最终表含单词 %d 个' % (list_number, daily_amount, last_list_number)
+
     else: # not rating
         #4'. 输出文件
         print_to_csv2(word_list, 'output/all_words.csv')