阶段作业1:完整的中英文词频统计
from collections import Counter

# Characters treated as word boundaries: each one is replaced by a space
# before the text is split on whitespace.
SEPARATORS = ''' .,:;?!-_'''

# Words considered to carry little meaning; they are left out of the statistics.
STOP_WORDS = frozenset({'a','the','and','i','you','in','de','but','that','not','if','will','s','m','it'})


def split_words(text: str, separators: str = SEPARATORS) -> list:
    """Split *text* into words, treating every character of *separators* as a space.

    The text is not case-folded here; lower-case it beforehand when the
    counting should be case-insensitive.

    Args:
        text: The text to tokenize.
        separators: Characters that delimit words in addition to whitespace.

    Returns:
        The words in their original order (an empty list for empty text).
    """
    # One translation pass instead of one str.replace() call per separator.
    blank_out = str.maketrans(separators, ' ' * len(separators))
    return text.translate(blank_out).split()


def count_words(words, stop_words=STOP_WORDS) -> Counter:
    """Count how often each word occurs in *words*, ignoring *stop_words*.

    Args:
        words: Iterable of word strings.
        stop_words: Container of words that must not be counted.

    Returns:
        A Counter mapping each remaining word to its number of occurrences.
    """
    # A single pass over the words; calling list.count() once per distinct
    # word would rescan the whole list every time.
    return Counter(word for word in words if word not in stop_words)


if __name__ == '__main__':
    # The context manager closes the file even if reading raises.
    with open('music.txt', 'r', encoding='utf-8') as music_file:
        lyrics = music_file.read().lower()  # lower-case so counting ignores case
    print(lyrics)

    words = split_words(lyrics)
    print(len(words), words)

    # Distinct words (a set) that remain once the stop words are removed.
    vocabulary = set(words) - STOP_WORDS
    print(len(vocabulary), vocabulary)

    counts = count_words(words)
    print(len(counts), dict(counts))

    # most_common() orders by count, highest first, and returns fewer entries
    # when the text has under 20 distinct words, so short inputs cannot raise
    # IndexError.
    for entry in counts.most_common(20):  # print the TOP 20
        print(entry)

中文词频统计(使用 jieba 分词)
import jieba
from collections import Counter


def count_long_words(words) -> Counter:
    """Count the tokens in *words*, skipping every single-character token.

    Args:
        words: Iterable of token strings (here, the output of ``jieba.lcut``).

    Returns:
        A Counter mapping each token whose length is not 1 to its number of
        occurrences.
    """
    # NOTE(review): single-character tokens are presumably punctuation,
    # whitespace and one-character words -- confirm that dropping all of
    # them is intended.
    return Counter(word for word in words if len(word) != 1)


if __name__ == '__main__':
    # The context manager guarantees the file handle is closed after reading.
    with open('xiaoshuo.txt', 'r', encoding='utf-8') as novel_file:
        txt = novel_file.read()

    wcdict = count_long_words(jieba.lcut(txt))

    # most_common() sorts by count, highest first (ties keep first-seen
    # order), and returns fewer entries when there are under 25 distinct
    # tokens, so short texts cannot raise IndexError.
    for entry in wcdict.most_common(25):
        print(entry)


浙公网安备 33010602011771号