# 词频统计 (character frequency statistics)

import os
import codecs
import chardet


# Accumulators shared by the reader functions and the counting code below:
# the per-character tallies and every character read from the input.
word_dict = dict()
word_lst = list()

# Ask for the file to analyse and the file that receives the results.
infile = input("请输入统计文件名:")
outfile = input("请输入输出结果文件名:")

# Characters to leave out of the count, and how many top entries to report.
# The limit is kept as the raw string; a blank answer means "report all".
exclude_str = input("请输入过滤字符:")
n = input("统计前多少位:")

# infile: path of a UTF-8 text file; it is re-encoded to ANSI and overwritten
# in place (there is no separate output path).
def convertUTF8ToANSI(infile):
    """Re-encode the UTF-8 text file at ``infile`` to the ANSI code page, in place.

    The whole file is read and encoded before it is reopened for writing, so
    a decoding or encoding failure leaves the file untouched.

    Args:
        infile: Path of the file to convert; it is overwritten.

    Raises:
        UnicodeDecodeError: The file content is not valid UTF-8.
        UnicodeEncodeError: A character has no representation in the target
            encoding.
        OSError: The file cannot be opened, read or written.
    """
    import locale

    # newline='' disables newline translation so line endings survive the
    # round trip unchanged, matching the binary write below.
    with open(infile, 'r', encoding='utf8', newline='') as src:
        text = src.read()

    # 'mbcs' (the Windows ANSI code page) is only registered on Windows.
    # Elsewhere fall back to the locale's preferred encoding, which is also
    # what a plain open(path, "r") uses by default.
    try:
        codecs.lookup('mbcs')
        ansi_encoding = 'mbcs'
    except LookupError:
        ansi_encoding = locale.getpreferredencoding(False)

    ansi_bytes = text.encode(ansi_encoding)

    # Write the converted bytes verbatim.
    with open(infile, 'wb') as dst:
        dst.write(ansi_bytes)

# Read a .docx document.
def ReadWord():
    """Append every character of the Word document to ``word_lst``.

    Reads the module-level path ``infile`` and extends the module-level list
    ``word_lst`` with the characters of each paragraph's text, in document
    order. Nothing is returned; the accumulated list is printed.
    """
    # python-docx provides the ``docx`` module. It is imported here because
    # the module-level imports do not include it and only .docx input needs it.
    import docx

    document = docx.Document(infile)  # the argument is the file path
    for para in document.paragraphs:  # one entry per paragraph
        # para.text is the paragraph's content; extending a list with a
        # string appends its characters one by one.
        word_lst.extend(para.text)
    print(word_lst)  # show everything collected so far

def ReadTxt():
    """Append every character of the text file to ``word_lst``.

    First tries to convert the file at the module-level path ``infile`` from
    UTF-8 to ANSI in place. If that fails, a message is printed and the file
    is read as it is. The characters are then read from the module-level
    file object ``fileIn``, line by line.
    """
    try:
        convertUTF8ToANSI(infile)
    except (OSError, LookupError, ValueError):
        # ValueError covers UnicodeDecodeError/UnicodeEncodeError; LookupError
        # covers a missing codec. Conversion is best-effort, but interrupts
        # such as KeyboardInterrupt must not be swallowed.
        print("编码格式错误")

    # Add every character (line breaks included) to the list.
    for line in fileIn:
        word_lst.extend(line)
           

    
# Parse the requested limit before any file is opened, so that an invalid
# number raises ValueError before the output file has been truncated.
# A blank answer gives None, which makes the slice below keep every entry.
top_n = int(n) if n != '' else None

with open(infile, "r") as fileIn, open(outfile, 'w') as fileOut:
    # Choose the reader from the text after the last dot of the file name.
    fileName = infile.split('.')
    if fileName[-1] == "docx":
        ReadWord()
    elif fileName[-1] == "txt":
        ReadTxt()

    # Drop whitespace-only entries, then tally each remaining character
    # unless it appears in the exclusion string.
    word_lst = [x.strip() for x in word_lst if x.strip() != '']
    for char in word_lst:
        if char not in exclude_str:
            word_dict[char] = word_dict.get(char, 0) + 1

    # Sort by frequency, highest first (x[1] is the count, x[0] the
    # character). The sort is stable, so ties keep first-seen order.
    lstWords = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)

    # Report the top entries (all of them when no limit was given):
    # rank, character and count on screen; "character, count" in the file.
    print('字符\t字频')
    print('=============')
    for rank, (char, count) in enumerate(lstWords[:top_n], start=1):
        print('%d\t%s\t%d' % (rank, char, count))
        fileOut.write('%s, %d\n' % (char, count))

 

# posted @ 2020-05-18 16:37  王者2  阅读(247)  评论(0)  编辑  收藏  举报