阶段作业1:完整的中英文词频统计
from collections import Counter

# Characters that are replaced by a space before the text is split into words.
SEPARATORS = " .,:;?!-_"

# Words with little meaning of their own that are left out of the ranking.
STOP_WORDS = frozenset({
    'a', 'the', 'and', 'i', 'you', 'in', 'de', 'but', 'that', 'not',
    'if', 'will', 's', 'm', 'it',
})


def tokenize(text, separators=SEPARATORS):
    """Lower-case *text*, blank out *separators* and split it into words.

    Args:
        text: The raw text to split.
        separators: Characters that are each replaced by a single space.

    Returns:
        A list of lower-cased tokens in their original order.
    """
    # One translate() pass instead of one replace() call per separator.
    table = str.maketrans(separators, " " * len(separators))
    return text.lower().translate(table).split()


def count_english_words(tokens, exclude=STOP_WORDS):
    """Count every token that is not in *exclude*.

    Args:
        tokens: Iterable of word strings.
        exclude: Collection of words that must not be counted.

    Returns:
        A ``Counter`` mapping each remaining word to its number of occurrences.
    """
    # A single pass over the tokens; calling list.count() once per distinct
    # word would rescan the whole list every time.
    return Counter(token for token in tokens if token not in exclude)


def top_english_words(text, top_n=20, exclude=STOP_WORDS):
    """Return the *top_n* most frequent words of *text* as (word, count) pairs.

    Pairs are sorted by descending count; words with equal counts keep the
    order in which they first appear in the text. Fewer than *top_n* pairs
    are returned when the text has fewer distinct words.
    """
    return count_english_words(tokenize(text), exclude).most_common(top_n)


def english_word_frequency(path='music.txt', top_n=20):
    """Read *path*, print the intermediate results and the TOP *top_n* words.

    Raises:
        OSError: If *path* cannot be opened (e.g. the file does not exist).
    """
    # Read the file and convert it to lower case; `with` closes the handle
    # even if reading fails.
    with open(path, 'r', encoding='utf-8') as fo:
        text = fo.read().lower()
    print(text)

    # Remove the special characters and split the text into words.
    tokens = tokenize(text)
    print(len(tokens), tokens)

    # Distinct words (a set) without the meaningless stop words.
    vocabulary = set(tokens) - STOP_WORDS
    print(len(vocabulary), vocabulary)

    # Word-count dictionary.
    counts = count_english_words(tokens)
    print(len(counts), dict(counts))

    # Print the TOP words; most_common() never yields more entries than
    # exist, so a short text cannot cause an IndexError.
    for entry in counts.most_common(top_n):
        print(entry)


if __name__ == "__main__":
    english_word_frequency()
中文
from collections import Counter


def count_chinese_words(words):
    """Count the segmented *words*, skipping every single-character token.

    Args:
        words: Iterable of token strings (e.g. the result of ``jieba.lcut``).

    Returns:
        A ``Counter`` mapping each kept token to its number of occurrences.
    """
    return Counter(word for word in words if len(word) != 1)


def top_chinese_words(words, top_n=25):
    """Return the *top_n* most frequent tokens as (word, count) pairs.

    Pairs are sorted by descending count; tokens with equal counts keep the
    order in which they first appear. Fewer than *top_n* pairs are returned
    when there are fewer distinct tokens.
    """
    return count_chinese_words(words).most_common(top_n)


def chinese_word_frequency(path='xiaoshuo.txt', top_n=25):
    """Segment the text in *path* with jieba and print the TOP *top_n* words.

    Raises:
        OSError: If *path* cannot be opened (e.g. the file does not exist).
        ImportError: If the third-party ``jieba`` package is not installed.
    """
    # Imported here so the counting helpers above stay importable on systems
    # where jieba is not installed.
    import jieba

    # `with` closes the file handle once the text has been read.
    with open(path, 'r', encoding='utf-8') as fh:
        txt = fh.read()

    # most_common() never yields more entries than exist, so a short text
    # cannot cause an IndexError.
    for entry in top_chinese_words(jieba.lcut(txt), top_n):
        print(entry)


if __name__ == "__main__":
    chinese_word_frequency()