Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
221 changes: 221 additions & 0 deletions A15564-0x06.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
# -*- coding: utf-8 -*-

import codecs
import os

#1. 读取文件
#['aa', 'aaa-bbb-sds'] => ['aa', 'aaa', 'bbb', 'sds']
def word_split(words):
new_list = []
for word in words:
if '-' not in word:
new_list.append(word)
else:
lst = word.split('-')
new_list.extend(lst)
return new_list


def read_file(file_path):
f = codecs.open(file_path, 'r', "utf-8") #打开文件
lines = f.readlines()
word_list = []
for line in lines:
line = line.strip()
words = line.split(" ") #用空格分割
words = word_split(words) #用-分割
word_list.extend(words)
return word_list

def get_file_from_folder(folder_path):
file_paths = []
for root, dirs, files in os.walk(folder_path):
for file in files:
file_path = os.path.join(root, file)
file_paths.append(file_path)
return file_paths

#读取多文件里的单词
def read_files(file_paths):
final_words = []
for path in file_paths:
final_words.extend(read_file(path))
return final_words


#2. 获取格式化之后的单词
def format_word(word):
fmt = 'abcdefghijklmnopqrstuvwxyz-'
for char in word:
if char not in fmt:
word = word.replace(char, '')
return word.lower()

def format_words(words):
word_list = []
for word in words:
wd = format_word(word)
if wd:
word_list.append(wd)
return word_list

#3. 统计单词数目
# {'aa':4, 'bb':1}
def statistics_words(words):
s_word_dict = {}
for word in words:
if s_word_dict.has_key(word):
s_word_dict[word] = s_word_dict[word] + 1
else:
s_word_dict[word] = 1
#排序
sorted_dict = sorted(s_word_dict.iteritems(), key=lambda d: d[1], reverse=True)
return sorted_dict


#4. 计算单词累计百分比
def word_rating(vocabulary_list, total_count):
current_count = 0
rate_list = [] # new list for word frequency + rate
for val in vocabulary_list:
num = val[1] # word frequency
current_count += num
word_rate = (float(current_count) / total_count) * 100 # accumulated percentage
rate_tuple = (val[0], val[1], word_rate)
rate_list.append(rate_tuple)
return rate_list


#5. 截取百分比内的单词
def in_section_words(rate_list, rate_section): # get words within given rate section
final_list = [] # new list for words within given rate section
for val in rate_list:
if val[2] >= 100 * rate_section[0] and val[2] <= 100 * rate_section[1]:
rate_tuple = (val[0], val[1], val[2])
final_list.append(rate_tuple)
return final_list

#6. 获取释义
def get_explanation(file_path, current_list):
f = codecs.open(file_path, 'r', "utf-8") #打开文件
lines = f.readlines()
word_dic = {}
for line in lines:
line = line.strip()
###1 len(word_dic) == 7982
# line = line.replace(']', ' ') ### BUG KILLER ### very important!!!!!!
# words = line.split(" ") #用空格分割

###2 len(word_dic) == 7977
words = line.split(' ', 1) # split the first space
words2 = []
for word in words:
word = word.strip() # strip spaces before and behind meaning
if word:
words2.append(word) # remove ''
words = words2

word_dic[words[0]] = words[1]
# print len(word_dic)
fi_final_list = []
for val in current_list:
if word_dic.has_key(val[0]):
word_tuple = (val[0], val[1], val[2], word_dic[val[0]])
else:
word_tuple = (val[0], val[1], val[2], '#暂无释义#')
fi_final_list.append(word_tuple)
return fi_final_list

#7. 输出成csv
def print_to_csv(final_list, to_file_path):
nfile = open(to_file_path,'w+')
for val in final_list:
nfile.write("%s, %s, %0.2f%%, %s\n" % (val[0], str(val[1]), val[2], val[3]))
nfile.close()

#8. 生成每日单词表
def create_daily_lists(fi_final_list, daily_amount, list_number, to_folder_path):
item_index = 0
day_index = 1

while day_index < list_number: # till day before the last day
nfile = open('%s%s.csv' % (to_folder_path, day_index), 'w+')
for val in fi_final_list[item_index : item_index + daily_amount]:
nfile.write("%s, %s, %0.2f%%, %s\n" % (val[0], str(val[1]), val[2], val[3]))
nfile.close()
item_index += daily_amount
day_index += 1
nfile = open('%s%s.csv' % (to_folder_path, day_index), 'w+')

for val in fi_final_list[item_index : ]: # the last day
nfile.write("%s, %s, %0.2f%%, %s\n" % (val[0], str(val[1]), val[2], val[3]))
nfile.close()

#4'. 输出成csv
def print_to_csv2(word_list, to_file_path):
nfile = open(to_file_path,'w+')
for val in word_list:
nfile.write("%s, %s\n" % (val[0], str(val[1])))

def main():
#1. 读取文本
words = read_files(get_file_from_folder('data1'))
print '获取了未格式化的单词 %d 个' % len(words)

#2. 清洗文本
f_words = format_words(words)
print '获取了已经格式化的单词 %d 个 ' % len(f_words)
total_word_count = len(f_words)

#3. 统计单词和排序
word_list = statistics_words(f_words)
print '最终总单词数 %d 个 ' % len(word_list)

# 是否进行百分比统计
rating = True # True for rating, False for not rating
if rating:
#4. 计算单词累计百分比
rate_list = word_rating(word_list, total_word_count) # inherit rate_list from word_rating()

#5. 截取百分比内的单词
start_and_end = [0.5, 0.7] #截取这一部分的单词
final_list = in_section_words(rate_list, start_and_end)

import sys ### to solve UnicodeEncodeError
reload(sys)
sys.setdefaultencoding('utf-8') ###

#6. 获取释义
fi_final_list = get_explanation('8000-words.txt', final_list)
total_amount = len(fi_final_list)
print '获得累计百分比范围内单词 %d 个' % total_amount

#7. 输出文件
print_to_csv(fi_final_list, 'output/with_meaning.csv')

#8. 生成每日单词表
daily_amount = 50
if total_amount % daily_amount == 0:
list_number = total_amount / daily_amount
else:
list_number = total_amount / daily_amount + 1
last_list_number = total_amount % daily_amount


# 单词表是否乱序
out_of_order = True # True for disordered lists, False for sequenced lists
if out_of_order:
import random
random.shuffle(fi_final_list)

if not os.path.exists('output/daily_lists/'): ### 强迫症
os.mkdir('output/daily_lists/') ###
create_daily_lists(fi_final_list, daily_amount, list_number, 'output/daily_lists/')
print '生成单词表 %d 个,除最终表外每表含单词 %d 个,最终表含单词 %d 个' % (list_number, daily_amount, last_list_number)

else: # not rating
#4'. 输出文件
print_to_csv2(word_list, 'output/all_words.csv')

if __name__ == "__main__":
main()