import os
import codecs
import chardet
# Module-level state shared by the reader functions and the counting step:
# word_lst collects every character read from the input document, and
# word_dict maps a character to the number of times it was seen.
word_lst, word_dict = [], {}

# Interactive configuration; each prompt blocks until the user answers.
infile = input("请输入统计文件名:")  # path of the document to analyse
outfile = input("请输入输出结果文件名:")  # path the frequency table is written to
exclude_str = input("请输入过滤字符:")  # characters that must not be counted
n = input("统计前多少位:")  # number of top entries to report; kept as text here
def convertUTF8ToANSI(infile):
    """Re-encode a UTF-8 text file in place using the system "ANSI" encoding.

    The whole file is read and decoded as UTF-8, encoded with the Windows
    ``mbcs`` codec (or, where that codec does not exist, with the locale's
    preferred encoding), and written back over the same path in binary mode.

    Args:
        infile: Path of the UTF-8 text file. It is overwritten with the
            re-encoded bytes.

    Raises:
        OSError: If the file cannot be opened for reading or writing.
        UnicodeDecodeError: If the file content is not valid UTF-8.
        UnicodeEncodeError: If a character cannot be represented in the
            target encoding. Encoding happens before the file is reopened
            for writing, so the file is left untouched in that case.
    """
    import locale

    # ``mbcs`` is only registered on Windows; on other platforms fall back to
    # the locale's preferred encoding instead of failing with LookupError.
    try:
        codecs.lookup('mbcs')
        ansi_encoding = 'mbcs'
    except LookupError:
        ansi_encoding = locale.getpreferredencoding(False)

    # newline='' turns off newline translation so the original line endings
    # are carried through unchanged when the bytes are written back.
    with open(infile, 'r', encoding='utf8', newline='') as src:
        utfstr = src.read()

    # Convert the decoded text to the ANSI byte representation.
    outansestr = utfstr.encode(ansi_encoding)

    # Binary mode: the bytes are stored exactly as encoded.
    with open(infile, 'wb') as dst:
        dst.write(outansestr)
def ReadWord():
    """Collect every character of the .docx document named by ``infile``.

    Opens the document with ``docx.Document``, adds each character of every
    paragraph's text to the module-level ``word_lst``, and finally prints the
    accumulated list.
    """
    # NOTE(review): ``docx`` is not among the imports visible at the top of
    # this file -- confirm it is imported before this function is called.
    document = docx.Document(infile)  # the argument is the file path
    for paragraph in document.paragraphs:
        # paragraph.text is the paragraph's content; extending a list with a
        # string adds its characters one at a time.
        word_lst.extend(paragraph.text)
    print(word_lst)  # show everything read so far
def ReadTxt():
    """Collect every character of the input text file into ``word_lst``.

    First tries to re-encode the file named by ``infile`` with
    ``convertUTF8ToANSI``. A failure there is reported on stdout and reading
    continues regardless. The characters are then taken from the module-level
    ``fileIn`` handle, which the caller must already have opened; every
    character of every line, line breaks included, is added to ``word_lst``.
    """
    try:
        convertUTF8ToANSI(infile)
    except Exception:
        # Best effort: keep going with the file as it is. Catching Exception
        # rather than using a bare ``except`` lets KeyboardInterrupt and
        # SystemExit propagate.
        print("编码格式错误")
    # Add every character to the list; extending a list with a string adds
    # its characters one at a time.
    for line in fileIn:
        word_lst.extend(line)
# Both files are opened up front: ReadTxt() iterates the module-level
# ``fileIn`` handle, and the result rows are written to ``fileOut``.
with open(infile, "r") as fileIn, open(outfile, 'w') as fileOut:
    # Dispatch on the text after the last dot of the input name. Any other
    # extension reads nothing, so only the table header is printed.
    fileName = infile.split('.')
    if fileName[-1] == "docx":
        ReadWord()
    elif fileName[-1] == "txt":
        ReadTxt()

    # Drop entries that are pure whitespace (spaces, line breaks, ...),
    # stripping each entry only once.
    word_lst = [s for s in (x.strip() for x in word_lst) if s != '']

    # Count occurrences per character, skipping the excluded characters.
    for char in word_lst:
        if char not in exclude_str:
            word_dict[char] = word_dict.get(char, 0) + 1

    # Sort by frequency, highest first (x[1] is the count, x[0] the character).
    lstWords = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)

    # A blank answer means "report everything"; otherwise keep the first n
    # entries. int() raises ValueError when the answer is not an integer.
    if n.strip() == '':
        selected = lstWords
    else:
        n = int(n)
        selected = lstWords[:n]

    # Print the ranked table and write "character, count" rows to the output.
    print('字符\t字频')
    print('=============')
    for rank, (char, count) in enumerate(selected, start=1):
        print('%d\t%s\t%d' % (rank, char, count))
        fileOut.write('%s, %d\n' % (char, count))