文件方式实现完整的英文词频统计实例

1.读入待分析的字符串

2.分解提取单词 

3.计数字典

4.排除语法型词汇

5.排序

6.输出TOP(20)

from collections import Counter

# Grammatical (function) words that are left out of the ranking.
EXCLUDED_WORDS = frozenset({'the', 'of', 'and', 'on', 'a', 'in', 'by', 'since'})
# Punctuation characters treated as word separators, in addition to whitespace.
SEPARATORS = ',.?-_!'
# Number of entries printed by the script.
TOP_N = 20


def split_words(text):
    """Return the lower-cased words of *text* as a list.

    Every character in SEPARATORS is replaced by a space before splitting
    on whitespace. str.lower() returns a new string, so its result has to
    be used; calling it and discarding the result leaves the text unchanged.
    """
    table = str.maketrans(SEPARATORS, ' ' * len(SEPARATORS))
    return text.lower().translate(table).split()


def top_words(text, limit=TOP_N, excluded=EXCLUDED_WORDS):
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    Words contained in *excluded* are not counted. Fewer than *limit* pairs
    are returned when the text has fewer distinct words, so callers never
    index past the end of the result.
    """
    # Counter tallies in a single pass instead of calling list.count()
    # once per distinct word.
    counts = Counter(word for word in split_words(text) if word not in excluded)
    return counts.most_common(limit)


def main(path='test.py'):
    """Read the text file at *path* and print its most frequent words."""
    # The with-statement closes the file even if reading raises.
    with open(path, 'r') as source:
        text = source.read()
    for pair in top_words(text):
        print(pair)


if __name__ == '__main__':
    main()

  结果

#单词列表
["Myanmar's", 'Aung', 'San', 'Suu', 'Kyi', 'is', 'facing', 'mounting', 'international', 'pressure', 'for', 'her', 'handling', 'of', 'violence', 'in', 'Rakhine', 'state', 'and', 'the', 'Rohingya', 'refugee', 'crisis', 'In', 'a', 'speech', 'on', 'Tuesday', 'the', 'de', 'facto', 'leader', 'condemned', 'rights', 'abuses', 'but', 'did', 'not', 'blame', 'the', 'army', 'or', 'address', 'allegations', 'of', 'ethnic', 'cleansing', 'Leaders', 'and', 'diplomats', 'from', 'several', 'countries', 'have', 'since', 'expressed', 'strong', 'disappointment', 'with', 'her', 'stance', 'More', 'than', '400', '000', 'Rohingya', 'have', 'fled', 'to', 'Bangladesh', 'since', 'late', 'August', 'The', 'latest', 'unrest', 'in', 'troubled', 'Rakhine', 'was', 'sparked', 'by', 'deadly', 'attacks', 'on', 'police', 'stations', 'across', 'the', 'state', 'last', 'month', 'blamed', 'on', 'a', 'newly', 'emerged', 'militant', 'group', 'the', 'Arakan', 'Rohingya', 'Salvation', 'Army', '(Arsa)', 'Scores', 'of', 'people', 'were', 'killed', 'in', 'an', 'ensuing', 'military', 'crackdown', 'and', 'there', 'are', 'widespread', 'allegations', 'of', 'villages', 'being', 'burned', 'and', 'Rohingya', 'being', 'driven', 'out']
#提取单词
{'Aung', 'facto', 'condemned', 'Rohingya', 'killed', 'is', 'with', 'More', '000', 'stance', 'latest', 'unrest', 'several', 'month', 'international', 'sparked', 'ethnic', 'group', 'her', 'blame', 'last', 'speech', 'to', 'Rakhine', 'out', 'facing', 'widespread', 'de', 'Leaders', 'violence', 'late', 'Tuesday', "Myanmar's", 'army', 'stations', 'or', 'San', 'but', 'did', 'Suu', 'handling', 'Army', 'an', 'burned', 'blamed', 'allegations', 'address', 'disappointment', 'there', 'In', 'not', 'mounting', 'police', 'are', '(Arsa)', 'Arakan', 'deadly', 'military', 'pressure', 'have', 'crisis', 'being', 'across', 'August', 'leader', 'from', 'for', 'The', 'Kyi', 'than', 'abuses', 'people', 'Scores', 'Bangladesh', 'cleansing', 'fled', 'rights', 'militant', 'emerged', '400', 'driven', 'refugee', 'troubled', 'expressed', 'were', 'crackdown', 'ensuing', 'strong', 'state', 'countries', 'was', 'diplomats', 'Salvation', 'villages', 'newly', 'attacks'}
#计数
{'Aung': 1, 'facto': 1, 'condemned': 1, 'Rohingya': 4, 'killed': 1, 'is': 1, 'with': 1, 'More': 1, '000': 1, 'stance': 1, 'latest': 1, 'unrest': 1, 'several': 1, 'month': 1, 'international': 1, 'sparked': 1, 'ethnic': 1, 'group': 1, 'her': 2, 'blame': 1, 'last': 1, 'speech': 1, 'to': 1, 'Rakhine': 2, 'out': 1, 'facing': 1, 'widespread': 1, 'de': 1, 'Leaders': 1, 'violence': 1, 'late': 1, 'Tuesday': 1, "Myanmar's": 1, 'army': 1, 'stations': 1, 'or': 1, 'San': 1, 'but': 1, 'did': 1, 'Suu': 1, 'handling': 1, 'Army': 1, 'an': 1, 'burned': 1, 'blamed': 1, 'allegations': 2, 'address': 1, 'disappointment': 1, 'there': 1, 'In': 1, 'not': 1, 'mounting': 1, 'police': 1, 'are': 1, '(Arsa)': 1, 'Arakan': 1, 'deadly': 1, 'military': 1, 'pressure': 1, 'have': 2, 'crisis': 1, 'being': 2, 'across': 1, 'August': 1, 'leader': 1, 'from': 1, 'for': 1, 'The': 1, 'Kyi': 1, 'than': 1, 'abuses': 1, 'people': 1, 'Scores': 1, 'Bangladesh': 1, 'cleansing': 1, 'fled': 1, 'rights': 1, 'militant': 1, 'emerged': 1, '400': 1, 'driven': 1, 'refugee': 1, 'troubled': 1, 'expressed': 1, 'were': 1, 'crackdown': 1, 'ensuing': 1, 'strong': 1, 'state': 2, 'countries': 1, 'was': 1, 'diplomats': 1, 'Salvation': 1, 'villages': 1, 'newly': 1, 'attacks': 1}
#排序
[('Rohingya', 4), ('her', 2), ('Rakhine', 2), ('allegations', 2), ('have', 2), ('being', 2), ('state', 2), ('Aung', 1), ('facto', 1), ('condemned', 1), ('killed', 1), ('is', 1), ('with', 1), ('More', 1), ('000', 1), ('stance', 1), ('latest', 1), ('unrest', 1), ('several', 1), ('month', 1), ('international', 1), ('sparked', 1), ('ethnic', 1), ('group', 1), ('blame', 1), ('last', 1), ('speech', 1), ('to', 1), ('out', 1), ('facing', 1), ('widespread', 1), ('de', 1), ('Leaders', 1), ('violence', 1), ('late', 1), ('Tuesday', 1), ("Myanmar's", 1), ('army', 1), ('stations', 1), ('or', 1), ('San', 1), ('but', 1), ('did', 1), ('Suu', 1), ('handling', 1), ('Army', 1), ('an', 1), ('burned', 1), ('blamed', 1), ('address', 1), ('disappointment', 1), ('there', 1), ('In', 1), ('not', 1), ('mounting', 1), ('police', 1), ('are', 1), ('(Arsa)', 1), ('Arakan', 1), ('deadly', 1), ('military', 1), ('pressure', 1), ('crisis', 1), ('across', 1), ('August', 1), ('leader', 1), ('from', 1), ('for', 1), ('The', 1), ('Kyi', 1), ('than', 1), ('abuses', 1), ('people', 1), ('Scores', 1), ('Bangladesh', 1), ('cleansing', 1), ('fled', 1), ('rights', 1), ('militant', 1), ('emerged', 1), ('400', 1), ('driven', 1), ('refugee', 1), ('troubled', 1), ('expressed', 1), ('were', 1), ('crackdown', 1), ('ensuing', 1), ('strong', 1), ('countries', 1), ('was', 1), ('diplomats', 1), ('Salvation', 1), ('villages', 1), ('newly', 1), ('attacks', 1)]
#输出前20
('Rohingya', 4)
('her', 2)
('Rakhine', 2)
('allegations', 2)
('have', 2)
('being', 2)
('state', 2)
('Aung', 1)
('facto', 1)
('condemned', 1)
('killed', 1)
('is', 1)
('with', 1)
('More', 1)
('000', 1)
('stance', 1)
('latest', 1)
('unrest', 1)
('several', 1)
('month', 1)

  

posted on 2017-09-26 09:43  004陈楠芸  阅读(146)  评论(0)  编辑  收藏  举报