作业4
完整词频统计:
1.英文
代码如下:
# English word-frequency report: prints the 20 most frequent words of abc.txt.
from collections import Counter

# Punctuation marks that are turned into spaces before the text is split.
EN_PUNCTUATION = ".,?;:"
# Filler words that are excluded from the ranking.
EN_STOP_WORDS = frozenset(
    {"i", "my", "but", "and", "you", "the", "if", "on", "are"}
)


def top_words(text, limit=20, stop_words=EN_STOP_WORDS):
    """Return the most frequent words of *text* as ``(word, count)`` pairs.

    The text is stripped, lower-cased, the characters in ``EN_PUNCTUATION``
    are replaced by spaces, and the result is split on whitespace. Words in
    *stop_words* are ignored.

    Args:
        text: The raw text to analyse.
        limit: Maximum number of pairs to return.
        stop_words: Collection of lower-case words to leave out.

    Returns:
        A list of at most *limit* ``(word, count)`` tuples ordered by
        descending count; ties keep the order of first appearance in the
        text. Shorter than *limit* when the text has fewer distinct words.
    """
    # One translate() pass replaces every punctuation mark with a space.
    table = str.maketrans(EN_PUNCTUATION, " " * len(EN_PUNCTUATION))
    words = text.strip().lower().translate(table).split()
    # Counter counts in a single pass instead of list.count() per word.
    counts = Counter(word for word in words if word not in stop_words)
    return counts.most_common(limit)


def report_english(path="abc.txt"):
    """Read the UTF-8 file at *path* and print its 20 most frequent words."""
    # The context manager closes the file even if reading fails.
    with open(path, "r", encoding="utf-8") as source:
        text = source.read()
    # Printing whatever is available avoids an IndexError on short texts.
    for pair in top_words(text):
        print(pair)


if __name__ == "__main__":
    report_english()
结果如下:
('go', 11) ('heart', 9) ('in', 5) ('will', 5) ('that', 5) ('one', 3) ('love', 3) ('far', 3) ("you're", 3) ('here', 3) ('us', 2) ('door', 2) ('wherever', 2) ('once', 2) ('open', 2) ('is', 2) ("we'll", 2) ('does', 2) ('believe', 2) ('know', 2)
2.中文
代码如下:
# Chinese word-frequency report: prints the 15 most frequent words of def.txt.
from collections import Counter

# Full-width punctuation that is turned into spaces before segmentation.
ZH_PUNCTUATION = ",。?!;:"


def top_terms(text, limit=15, segment=None):
    """Return the most frequent multi-character words of *text*.

    Args:
        text: The raw text to analyse.
        limit: Maximum number of ``(word, count)`` pairs to return.
        segment: Callable that takes the cleaned text and returns an
            iterable of tokens. Defaults to ``jieba.cut``.

    Returns:
        A list of at most *limit* ``(word, count)`` tuples ordered by
        descending count; ties keep the order of first appearance. Shorter
        than *limit* when the text has fewer distinct words.
    """
    if segment is None:
        # Imported lazily so the counting logic also works with another
        # tokenizer when jieba is not installed.
        import jieba

        segment = jieba.cut

    # Punctuation becomes a space rather than being deleted: deleting it
    # would join the characters on both sides of a sentence boundary and
    # let the segmenter see them as one run of text.
    table = str.maketrans(ZH_PUNCTUATION, " " * len(ZH_PUNCTUATION))
    cleaned = text.translate(table)

    # Single characters (and whitespace tokens) are skipped.
    counts = Counter(
        token for token in segment(cleaned) if len(token.strip()) > 1
    )
    return counts.most_common(limit)


def report_chinese(path="def.txt"):
    """Read the UTF-8 file at *path* and print its 15 most frequent words."""
    # The context manager closes the file even if reading fails.
    with open(path, "r", encoding="utf-8") as source:
        text = source.read()
    # Printing whatever is available avoids an IndexError on short texts.
    for pair in top_terms(text):
        print(pair)


if __name__ == "__main__":
    report_chinese()
结果如下:
('胡萝卜', 3) ('山药', 3) ('维生素', 3) ('黄瓜', 2) ('南瓜', 2) ('丰富', 2) ('分解', 2) ('炖汤', 1) ('时候', 1) ('不要', 1) ('一起', 1) ('两者', 1) ('引起', 1) ('中毒', 1) ('例子', 1)