python实现简单的英文词频统计

  1 __author__ = 'Oscar_Yang'
  2 # -*- coding= utf-8 -*-
  3 #copyRight by OSCAR
  4 """
  5 本脚本实现,合并几个英文文本,并且统计词频。
  6 脚本定义了几个函数:
  7 1、文件读取函数readFile(读取文件,输出每个文件的词频);
  8 2、元素为词频的字典的合并函数,并且实现相同词的词频相加,返回全部词频;
  9 3、调试部分,利用了高阶函数:map,reduce;
 10 4、最后实现格式化输出,输出结果如图片所示。
 11 """
 12 import functools
 13 #定义文件读取函数,并且输出元素为词频的字典
 14 def readFile(file_name):
 15     y = []
 16     with open(file_name, 'r',encoding="utf-8") as f:
 17         x=f.readlines()
 18     for line in x:
 19         y.extend(line.split())
 20     word_list2 = []
 21 
 22     # 单词格式化:去掉分词之后部分英文前后附带的标点符号
 23     for word in y:
 24         # last character of each word
 25         word1 = word
 26 
 27         # use a list of punctuation marks
 28         while True:
 29             lastchar = word1[-1:]
 30             if lastchar in [",", ".", "!", "?", ";", '"']:
 31                 word2 = word1.rstrip(lastchar)
 32                 word1 = word2
 33             else:
 34                 word2 = word1
 35                 break
 36 
 37         while True:
 38             firstchar = word2[0]
 39             if firstchar in [",", ".", "!", "?", ";", '"']:
 40                 word3 = word2.lstrip(firstchar)
 41                 word2 = word3
 42             else:
 43                 word3 = word2
 44                 break
 45                 # build a wordList of lower case modified words
 46         word_list2.append(word3)
 47     #统计词频
 48     tf = {}
 49     for word in word_list2:
 50         word = word.lower()
 51             # print(word)
 52         word = ''.join(word.split())
 53         if word in tf:
 54             tf[word] += 1
 55         else:
 56             tf[word] = 1
 57     return tf
 58 
 59 def get_counts(words):
 60     tf = {}
 61     for word in words:
 62         word = word.lower()
 63         # print(word)
 64         word = ''.join(word.split())
 65         if word in tf:
 66             tf[word] += 1
 67         else:
 68             tf[word] = 1
 69 
 70 
 71 #合并两个字典的方法1
 72 def merge1(dic1, dic2):
 73     for k, v in dic1.items():
 74         if k in dic2.keys():
 75             dic2[k] += v
 76         else:
 77             dic2[k] = v
 78     # print(dic2)
 79     return dic2
 80 
 81 #合并两个字典的方法2
 82 def merge2(dic1, dic2):
 83     from collections import Counter
 84     counts = Counter(dic1) + Counter(dic2)
 85     return counts
 86 
 87 #获得前n个最热词和词频
 88 def top_counts(word_list,n=10):
 89     value_key_pairs = sorted([(count, tz) for tz, count in word_list.items()],reverse=True)
 90     return value_key_pairs[:n]
 91     # print(value_key_pairs[:n])
 92 
 93 #测试部分
 94 if __name__ == '__main__':
 95     file_list = [r'E:\graduate\Python\python那些事\articles\article_000.txt',
 96                  r'E:\graduate\Python\python那些事\articles\article_001.txt',
 97                  r'E:\graduate\Python\python那些事\articles\article_002.txt',
 98                  r'E:\graduate\Python\python那些事\articles\article_003.txt',
 99                  r'E:\graduate\Python\python那些事\articles\article_004.txt',
100                  r'E:\graduate\Python\python那些事\articles\article_005.txt']
101 
102     cc=map(readFile,file_list)
103     word_list = functools.reduce(merge2,cc)
104     top_counts=top_counts(word_list)
105     # print(top_counts)
106     print ("最常用的单词排行榜:")
107     for word in top_counts[0:10]:
108         print("{0:10}{1}".format(word[1], word[0]))

2016-10-15

运行结果:

posted @ 2016-10-15 18:56  coskaka  阅读(8814)  评论(0)  编辑  收藏  举报