作业4
完整词频统计:
1.英文
代码如下:
# English word-frequency count: read a text file, normalise it, drop stop
# words, and print the most frequent remaining words.
from collections import Counter

# Punctuation marks that are replaced by a space before splitting into words.
PUNCTUATION = '.,?;:'
_PUNCT_TO_SPACE = str.maketrans(PUNCTUATION, ' ' * len(PUNCTUATION))

# Common words that are excluded from the statistics.
STOP_WORDS = {'i', 'my', 'but', 'and', 'you', 'the', 'if', 'on', 'are'}


def tokenize(text):
    """Return the lower-cased words of *text*, split on whitespace.

    Every character in PUNCTUATION is turned into a space first, so
    "go, go." yields ["go", "go"].
    """
    return text.strip().lower().translate(_PUNCT_TO_SPACE).split()


def top_words(text, top_n=20, stop_words=STOP_WORDS):
    """Return up to *top_n* ``(word, count)`` pairs, most frequent first.

    Words in *stop_words* are ignored. Words with equal counts keep the
    order in which they first appear in *text*. Fewer than *top_n* pairs
    are returned when the text does not contain that many distinct words.
    """
    counts = Counter(word for word in tokenize(text) if word not in stop_words)
    return counts.most_common(top_n)


def main(path="abc.txt", top_n=20):
    """Print the *top_n* most frequent words of the UTF-8 file at *path*."""
    # The context manager closes the file even if reading fails.
    with open(path, "r", encoding='utf-8') as source:
        text = source.read()
    for pair in top_words(text, top_n):
        print(pair)


if __name__ == "__main__":
    main()
结果如下:
('go', 11)
('heart', 9)
('in', 5)
('will', 5)
('that', 5)
('one', 3)
('love', 3)
('far', 3)
("you're", 3)
('here', 3)
('us', 2)
('door', 2)
('wherever', 2)
('once', 2)
('open', 2)
('is', 2)
("we'll", 2)
('does', 2)
('believe', 2)
('know', 2)
2.中文
代码如下:
import jieba
# Chinese word-frequency count: read a text file, strip punctuation, segment
# it with jieba, and print the most frequent multi-character tokens.
from collections import Counter

# Punctuation characters removed from the text before segmentation.
PUNCTUATION = ',。?!;:'
_DELETE_PUNCT = str.maketrans('', '', PUNCTUATION)


def remove_punctuation(text):
    """Return *text* with every character in PUNCTUATION deleted."""
    return text.translate(_DELETE_PUNCT)


def top_tokens(tokens, top_n=15):
    """Return up to *top_n* ``(token, count)`` pairs, most frequent first.

    Tokens that are exactly one character long are skipped. Tokens with
    equal counts keep the order in which they first appear. Fewer than
    *top_n* pairs are returned when there are not that many distinct tokens.
    """
    counts = Counter(token for token in tokens if len(token) != 1)
    return counts.most_common(top_n)


def main(path="def.txt", top_n=15):
    """Print the *top_n* most frequent tokens of the UTF-8 file at *path*."""
    # The context manager closes the file even if reading fails.
    with open(path, "r", encoding='utf-8') as source:
        text = source.read()
    tokens = jieba.cut(remove_punctuation(text))
    for pair in top_tokens(tokens, top_n):
        print(pair)


if __name__ == "__main__":
    main()
结果如下:
('胡萝卜', 3)
('山药', 3)
('维生素', 3)
('黄瓜', 2)
('南瓜', 2)
('丰富', 2)
('分解', 2)
('炖汤', 1)
('时候', 1)
('不要', 1)
('一起', 1)
('两者', 1)
('引起', 1)
('中毒', 1)
('例子', 1)

浙公网安备 33010602011771号