阶段作业1:完整的中英文词频统计

from collections import Counter

# Characters that are replaced by a space before the text is split into words.
SEPARATORS = ' .,:;?!-_'

# Function words that are left out of the frequency table.
# NOTE(review): 's' and 'm' would only show up as tokens if apostrophes were
# split, but "'" is not in SEPARATORS -- confirm whether it should be added.
STOP_WORDS = {'a','the','and','i','you','in','de','but','that','not','if','will','s','m','it'}


def split_words(text, separators=SEPARATORS):
    """Return the whitespace-delimited tokens of *text*.

    Every character listed in *separators* is first turned into a space, so
    punctuation never sticks to a word.  The text is not case-folded here;
    callers lower-case it beforehand if they want case-insensitive counts.
    """
    # One C-level pass instead of one str.replace() call per separator.
    table = str.maketrans(separators, ' ' * len(separators))
    return text.translate(table).split()


def count_words(tokens, stop_words=STOP_WORDS):
    """Return a Counter mapping each token not in *stop_words* to its count.

    Counting happens in a single pass over *tokens* rather than calling
    list.count() once per distinct word.
    """
    return Counter(token for token in tokens if token not in stop_words)


if __name__ == '__main__':
    # The context manager closes the file even if read() raises.
    with open('music.txt', 'r', encoding='utf-8') as fo:
        strmusic = fo.read().lower()   # lower-case so counting ignores case
    print(strmusic)

    strList = split_words(strmusic)    # every token, stop words included
    print(len(strList), strList)

    strDict = count_words(strList)     # word -> occurrences, stop words removed
    strSet = set(strDict)              # distinct words that were counted
    print(len(strSet), strSet)
    print(len(strDict), dict(strDict))

    # most_common() sorts by count, highest first; ties keep first-seen order.
    wordlist = strDict.most_common()

    # Slicing (rather than indexing 0..19) copes with texts that contain
    # fewer than 20 distinct words.
    for entry in wordlist[:20]:        # print the TOP 20
        print(entry)

  

  
中文
import jieba
from collections import Counter


def count_multichar_words(words):
    """Return a Counter of the tokens in *words*, skipping 1-character tokens.

    Tokens of length 1 are ignored, which drops single characters and
    single punctuation marks from the frequency table.
    """
    return Counter(word for word in words if len(word) != 1)


if __name__ == '__main__':
    # The context manager closes the file even if read() raises.
    with open('xiaoshuo.txt', 'r', encoding='utf-8') as f:
        txt = f.read()

    wordsls = jieba.lcut(txt)          # segment the text into a list of tokens
    wcdict = count_multichar_words(wordsls)

    # most_common() sorts by count, highest first; ties keep first-seen order.
    wcls = wcdict.most_common()

    # Slicing (rather than indexing 0..24) copes with texts that contain
    # fewer than 25 distinct multi-character words.
    for entry in wcls[:25]:
        print(entry)

  

 
posted @ 2018-10-15 09:55  李健朗  阅读(128)  评论(0)  编辑  收藏  举报