Angelzhao · ghost · Jul 20, 2017 · Jul 21, 2017 · Jul 23, 2017
diff --git a/A15564-0x06.py b/A15564-0x06.py
@@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+
+import codecs
+import os
+
+#1. 读取文件
+#['aa', 'aaa-bbb-sds'] => ['aa', 'aaa', 'bbb', 'sds']
+def word_split(words):
+    new_list = []
+    for word in words:
+        if '-' not in word:
+            new_list.append(word)
+        else:
+            lst = word.split('-')
+            new_list.extend(lst)
+    return new_list
+
+
+def read_file(file_path):
+    f = codecs.open(file_path, 'r', "utf-8") #打开文件
+    lines = f.readlines()
+    word_list = []
+    for line in lines:
+        line = line.strip()
+        words = line.split(" ") #用空格分割
+        words = word_split(words) #用-分割
+        word_list.extend(words)
+    return word_list
+
+def get_file_from_folder(folder_path):
+    file_paths = []
+    for root, dirs, files in os.walk(folder_path):
+        for file in files:
+            file_path = os.path.join(root, file)
+            file_paths.append(file_path)
+    return file_paths
+
+#读取多文件里的单词
+def read_files(file_paths):
+    final_words = []
+    for path in file_paths:
+        final_words.extend(read_file(path))
+    return final_words
+
+
+#2. 获取格式化之后的单词
+def format_word(word):
+    fmt = 'abcdefghijklmnopqrstuvwxyz-'
+    for char in word:
+        if char not in fmt:
+            word = word.replace(char, '')
+    return word.lower()
+
+def format_words(words):
+    word_list = []
+    for word in words:
+        wd = format_word(word)
+        if wd:
+            word_list.append(wd)
+    return word_list
+
+#3. 统计单词数目
+# {'aa':4, 'bb':1}
+def statistics_words(words):
+    s_word_dict = {}
+    for word in words:
+        if s_word_dict.has_key(word):
+            s_word_dict[word] = s_word_dict[word] + 1
+        else:
+            s_word_dict[word] = 1
+    #排序
+    sorted_dict = sorted(s_word_dict.iteritems(), key=lambda d: d[1], reverse=True)
+    return sorted_dict
+
+
+#4. 计算单词累计百分比
+def word_rating(vocabulary_list, total_count):
+    current_count = 0
+    rate_list = [] # new list for word frequency + rate
+    for val in vocabulary_list:
+        num = val[1] # word frequency
+        current_count += num
+        word_rate = (float(current_count) / total_count) * 100 # accumulated percentage
+        rate_tuple = (val[0], val[1], word_rate)
+        rate_list.append(rate_tuple)
+    return rate_list
+
+
+#5. 截取百分比内的单词
+def in_section_words(rate_list, rate_section):  # get words within given rate section
+    final_list = [] # new list for words within given rate section
+    for val in rate_list:
+        if val[2] >= 100 * rate_section[0] and val[2] <= 100 * rate_section[1]: 
+            rate_tuple = (val[0], val[1], val[2])
+            final_list.append(rate_tuple)
+    return final_list
+
+#6. 获取释义
+def get_explanation(file_path, current_list):
+    f = codecs.open(file_path, 'r', "utf-8") #打开文件
+    lines = f.readlines()
+    word_dic = {}
+    for line in lines:
+        line = line.strip()
+        ###1 len(word_dic) == 7982
+        # line = line.replace(']', ' ') ### BUG KILLER ### very important!!!!!!
+        # words = line.split("   ") #用空格分割
+
+        ###2 len(word_dic) == 7977
+        words = line.split(' ', 1) # split the first space
+        words2 = []
+        for word in words:
+            word = word.strip() # strip spaces before and behind meaning
+            if word:
+                words2.append(word) # remove ''
+        words = words2
+
+        word_dic[words[0]] = words[1]
+    # print len(word_dic)
+    fi_final_list = []
+    for val in current_list:
+        if word_dic.has_key(val[0]):
+            word_tuple = (val[0], val[1], val[2], word_dic[val[0]])
+        else:
+            word_tuple = (val[0], val[1], val[2], '#暂无释义#')
+        fi_final_list.append(word_tuple)
+    return fi_final_list
+
+#7. 输出成csv
+def print_to_csv(final_list, to_file_path):
+    nfile = open(to_file_path,'w+')
+    for val in final_list:
+        nfile.write("%s, %s, %0.2f%%, %s\n" % (val[0], str(val[1]), val[2], val[3]))
+    nfile.close()
+
+#8. 生成每日单词表
+def create_daily_lists(fi_final_list, daily_amount, list_number, to_folder_path):
+    item_index = 0
+    day_index = 1
+
+    while day_index < list_number: # till day before the last day
+        nfile = open('%s%s.csv' % (to_folder_path, day_index), 'w+')
+        for val in fi_final_list[item_index : item_index + daily_amount]:
+            nfile.write("%s, %s, %0.2f%%, %s\n" % (val[0], str(val[1]), val[2], val[3]))
+        nfile.close()
+        item_index += daily_amount
+        day_index += 1
+    nfile = open('%s%s.csv' % (to_folder_path, day_index), 'w+')
+
+    for val in fi_final_list[item_index : ]: # the last day
+            nfile.write("%s, %s, %0.2f%%, %s\n" % (val[0], str(val[1]), val[2], val[3]))
+    nfile.close()
+
+#4'. 输出成csv
+def print_to_csv2(word_list, to_file_path):
+    nfile = open(to_file_path,'w+')
+    for val in word_list:
+        nfile.write("%s, %s\n" % (val[0], str(val[1])))
+
+def main():
+    #1. 读取文本
+    words = read_files(get_file_from_folder('data1'))
+    print '获取了未格式化的单词 %d 个' % len(words)
+
+    #2. 清洗文本
+    f_words = format_words(words)
+    print '获取了已经格式化的单词 %d 个 ' % len(f_words)
+    total_word_count = len(f_words)
+
+    #3. 统计单词和排序
+    word_list = statistics_words(f_words)
+    print '最终总单词数 %d 个 ' % len(word_list)
+
+    # 是否进行百分比统计
+    rating = True # True for rating, False for not rating
+    if rating:
+        #4. 计算单词累计百分比
+        rate_list = word_rating(word_list, total_word_count) # inherit rate_list from word_rating()
+
+        #5. 截取百分比内的单词
+        start_and_end = [0.5, 0.7] #截取这一部分的单词
+        final_list = in_section_words(rate_list, start_and_end)
+
+        import sys  ### to solve UnicodeEncodeError
+        reload(sys)
+        sys.setdefaultencoding('utf-8') ###
+
+        #6. 获取释义
+        fi_final_list = get_explanation('8000-words.txt', final_list)
+        total_amount = len(fi_final_list)
+        print '获得累计百分比范围内单词 %d 个' % total_amount
+
+        #7. 输出文件
+        print_to_csv(fi_final_list, 'output/with_meaning.csv')
+
+        #8. 生成每日单词表
+        daily_amount = 50
+        if total_amount % daily_amount == 0:
+            list_number = total_amount / daily_amount
+        else:
+            list_number = total_amount / daily_amount + 1
+            last_list_number = total_amount % daily_amount
+
+
+        # 单词表是否乱序
+        out_of_order = True # True for disordered lists, False for sequenced lists
+        if out_of_order: 
+            import random
+            random.shuffle(fi_final_list)
+
+        if not os.path.exists('output/daily_lists/'): ### 强迫症 
+            os.mkdir('output/daily_lists/')                      ###
+        create_daily_lists(fi_final_list, daily_amount, list_number, 'output/daily_lists/')
+        print '生成单词表 %d 个，除最终表外每表含单词 %d 个，最终表含单词 %d 个' % (list_number, daily_amount, last_list_number)
+
+    else: # not rating
+        #4'. 输出文件
+        print_to_csv2(word_list, 'output/all_words.csv')
+
+# Run the pipeline only when this file is executed as a script, not when it
+# is imported as a module.
+if __name__ == "__main__":
+    main()