# -*- coding: utf-8 -*-
"""Read text files and normalize their words for frequency counting.

Reconstructed from the first commit of the patch series
("add A15564-0x04.py"); the later commits in the series do not modify
these helpers.
"""

import codecs
import os


# Characters that survive normalization: lowercase ASCII letters and '-'.
_WORD_CHARS = frozenset('abcdefghijklmnopqrstuvwxyz-')


# 1. Read files
def word_split(words):
    """Split hyphenated entries into their parts.

    Example: ['aa', 'aaa-bbb-sds'] => ['aa', 'aaa', 'bbb', 'sds']

    :param words: iterable of strings.
    :return: new list in which every entry containing '-' is replaced by
        its pieces (empty pieces, e.g. from 'a--b', are kept).
    """
    parts = []
    for word in words:
        # split() returns [word] unchanged when there is no hyphen, so no
        # separate branch is needed for plain words.
        parts.extend(word.split('-'))
    return parts


def read_file(file_path):
    """Return every token of a UTF-8 text file.

    Lines are split on any run of whitespace (spaces *and* tabs), then on
    hyphens.  Splitting on whitespace runs instead of single spaces keeps
    tab-separated words from being fused and avoids empty tokens for
    repeated spaces.

    :param file_path: path of the text file to read.
    :return: list of raw (not yet normalized) tokens.
    """
    word_list = []
    # 'with' guarantees the handle is closed even if decoding fails midway.
    with codecs.open(file_path, 'r', 'utf-8') as text_file:
        for line in text_file:
            word_list.extend(word_split(line.split()))
    return word_list


def get_file_from_folder(folder_path):
    """Return the paths of all files below folder_path, recursively.

    :param folder_path: directory to walk.
    :return: list of paths built as os.path.join(directory, file name),
        in os.walk order.
    """
    file_paths = []
    for root, _dirs, names in os.walk(folder_path):
        for name in names:
            file_paths.append(os.path.join(root, name))
    return file_paths


# Read the words of several files
def read_files(file_paths):
    """Concatenate the tokens of every file in file_paths.

    :param file_paths: iterable of file paths accepted by read_file.
    :return: one flat list with the tokens of all files, in order.
    """
    final_words = []
    for path in file_paths:
        final_words.extend(read_file(path))
    return final_words


# 2. Normalize words
def format_word(word):
    """Lowercase word and drop everything except a-z and '-'.

    The word is lowercased *before* filtering; filtering first would
    delete capital letters instead of converting them
    ("Hello" -> "ello").

    :param word: raw token.
    :return: normalized token, possibly empty.
    """
    return ''.join(char for char in word.lower() if char in _WORD_CHARS)


def format_words(words):
    """Normalize every token and discard those that end up empty.

    :param words: iterable of raw tokens.
    :return: list of non-empty normalized tokens, in input order.
    """
    cleaned = (format_word(word) for word in words)
    return [word for word in cleaned if word]
# Row layout shared by every "word, count, cumulative %, meaning" CSV.
_ROW_FORMAT = u"%s, %s, %0.2f%%, %s\n"

# Placeholder stored when the dictionary file has no entry for a word.
_NO_MEANING = u'#暂无释义#'


# 3. Count word occurrences
# e.g. {'aa': 4, 'bb': 1}
def statistics_words(words):
    """Count occurrences and return (word, count) pairs, most frequent first.

    Words with the same count are ordered alphabetically so the result does
    not depend on dict iteration order.

    :param words: iterable of normalized words.
    :return: list of (word, count) tuples sorted by descending count.
    """
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return sorted(counts.items(), key=lambda item: (-item[1], item[0]))


# 4. Compute cumulative percentages
def word_rating(vocabulary_list, total_count):
    """Attach the cumulative share of all occurrences to every word.

    :param vocabulary_list: sequence whose items start with (word, count).
    :param total_count: number of occurrences the percentages refer to.
    :return: list of (word, count, cumulative_percent) tuples in input
        order; cumulative_percent is a float in percent (0-100).
    """
    rate_list = []
    running_total = 0
    for val in vocabulary_list:
        running_total += val[1]
        cumulative_percent = (float(running_total) / total_count) * 100
        rate_list.append((val[0], val[1], cumulative_percent))
    return rate_list


# 5. Keep the words inside a cumulative-percentage window
def in_section_words(rate_list, rate_section):
    """Select the entries whose cumulative percentage lies in a window.

    :param rate_list: items of the form (word, count, cumulative_percent).
    :param rate_section: pair (start, end) given as fractions, e.g.
        [0.5, 0.7] for 50%-70%; both bounds are inclusive.
    :return: list of (word, count, cumulative_percent) tuples.
    """
    lower = 100 * rate_section[0]
    upper = 100 * rate_section[1]
    return [
        (val[0], val[1], val[2])
        for val in rate_list
        if lower <= val[2] <= upper
    ]


# 6. Look up meanings
def get_explanation(file_path, current_list):
    """Append the dictionary meaning to every entry of current_list.

    Each dictionary line is read as "<headword><whitespace><meaning>".
    Lines that are blank or carry no meaning are skipped instead of raising
    IndexError.  When a headword occurs more than once, the last line wins.

    :param file_path: path of the UTF-8 dictionary file.
    :param current_list: items of the form (word, count, cumulative_percent).
    :return: list of (word, count, cumulative_percent, meaning) tuples;
        words missing from the dictionary get the placeholder meaning.
    """
    word_dic = {}
    with codecs.open(file_path, 'r', 'utf-8') as dict_file:
        for line in dict_file:
            # Split once, at the first run of whitespace.
            parts = line.split(None, 1)
            if len(parts) < 2:
                continue
            word_dic[parts[0]] = parts[1].strip()
    return [
        (val[0], val[1], val[2], word_dic.get(val[0], _NO_MEANING))
        for val in current_list
    ]


def _write_rows(rows, to_file_path):
    """Write (word, count, percent, meaning) rows to a UTF-8 encoded file."""
    # Encoding explicitly removes the need for sys.setdefaultencoding().
    with codecs.open(to_file_path, 'w', 'utf-8') as nfile:
        for val in rows:
            nfile.write(_ROW_FORMAT % (val[0], val[1], val[2], val[3]))


# 7. Write the csv
def print_to_csv(final_list, to_file_path):
    """Write the annotated word list to to_file_path.

    :param final_list: items of the form (word, count, percent, meaning).
    :param to_file_path: output file; overwritten if it exists.
    """
    _write_rows(final_list, to_file_path)


# 8. Create the daily word lists
def create_daily_lists(fi_final_list, daily_amount, list_number, to_folder_path):
    """Split the word list into numbered files "1.csv", "2.csv", ...

    Lists 1 .. list_number - 1 receive daily_amount entries each; the last
    list receives everything that is left.  At least one file ("1.csv") is
    always written, even for an empty word list.

    :param fi_final_list: items of the form (word, count, percent, meaning).
    :param daily_amount: number of entries per list.
    :param list_number: number of lists to create.
    :param to_folder_path: existing folder that receives the files.
    """
    last_day = max(int(list_number), 1)
    for day in range(1, last_day):
        start = (day - 1) * daily_amount
        _write_rows(
            fi_final_list[start:start + daily_amount],
            os.path.join(to_folder_path, '%d.csv' % day),
        )
    # The last day takes the remainder.
    start = (last_day - 1) * daily_amount
    _write_rows(
        fi_final_list[start:],
        os.path.join(to_folder_path, '%d.csv' % last_day),
    )


# 4'. Write the csv (without percentages)
def print_to_csv2(word_list, to_file_path):
    """Write plain "word, count" rows to to_file_path (UTF-8).

    :param word_list: items starting with (word, count).
    :param to_file_path: output file; overwritten if it exists.
    """
    # 'with' closes (and therefore flushes) the file.
    with codecs.open(to_file_path, 'w', 'utf-8') as nfile:
        for val in word_list:
            nfile.write(u"%s, %s\n" % (val[0], val[1]))


def _ensure_folder(folder_path):
    """Create folder_path, including parents, unless it already exists."""
    if not os.path.isdir(folder_path):
        os.makedirs(folder_path)


def main():
    """Run the pipeline: read, clean, count, rate, annotate and export.

    Reads every file below 'data1', looks meanings up in '8000-words.txt'
    and writes the results below 'output/'.
    """
    # 1. Read the texts
    words = read_files(get_file_from_folder('data1'))
    print('获取了未格式化的单词 %d 个' % len(words))

    # 2. Clean the texts
    f_words = format_words(words)
    print('获取了已经格式化的单词 %d 个 ' % len(f_words))
    total_word_count = len(f_words)

    # 3. Count and sort the words
    word_list = statistics_words(f_words)
    print('最终总单词数 %d 个 ' % len(word_list))

    # Every branch below writes into 'output', so make sure it exists.
    _ensure_folder('output')

    # Whether to compute cumulative percentages
    rating = True  # True for rating, False for not rating
    if rating:
        # 4. Compute cumulative percentages
        rate_list = word_rating(word_list, total_word_count)

        # 5. Keep the words inside the percentage window
        start_and_end = [0.5, 0.7]  # the slice of words to keep
        final_list = in_section_words(rate_list, start_and_end)

        # 6. Look up meanings
        fi_final_list = get_explanation('8000-words.txt', final_list)
        total_amount = len(fi_final_list)
        print('获得累计百分比范围内单词 %d 个' % total_amount)

        # 7. Write the full list
        print_to_csv(fi_final_list, 'output/with_meaning.csv')

        # 8. Create the daily word lists
        daily_amount = 50
        # Ceiling division; '//' keeps the result an int on Python 3 too.
        list_number = (total_amount + daily_amount - 1) // daily_amount
        # Size of the last list: a full list when the total divides evenly.
        if list_number:
            last_list_number = total_amount - (list_number - 1) * daily_amount
        else:
            last_list_number = 0

        # Whether to shuffle the word list
        out_of_order = True  # True for disordered lists, False for sequenced lists
        if out_of_order:
            import random
            random.shuffle(fi_final_list)

        _ensure_folder('output/daily_lists/')
        create_daily_lists(fi_final_list, daily_amount, list_number, 'output/daily_lists/')
        print('生成单词表 %d 个,除最终表外每表含单词 %d 个,最终表含单词 %d 个' % (list_number, daily_amount, last_list_number))

    else:  # not rating
        # 4'. Write the plain frequency list
        print_to_csv2(word_list, 'output/all_words.csv')


if __name__ == "__main__":
    main()