第四次作业
一、英文小说词频统计
# Count word frequencies in an English novel.
# Read the whole file into a single string. The with-block closes the file
# even if read() raises, which the manual open()/close() pair did not.
with open('like.txt', 'r', encoding='utf-8') as f:
    like = f.read()
print(like)
运行结果

# Preprocessing: convert the whole text to lower case.
# The result is written back to `like` because every later step reads
# `like`; storing it only in `a` would throw the case folding away and
# count e.g. "The" and "the" as different words.
like = like.lower()
a = like  # kept so the name `a` is still available, as before
print(a)
运行结果

# Strip punctuation: every character listed in `sep` is deleted from the
# text (deleted, not replaced by a space).
sep = ",.、!:?…"
like = like.translate(str.maketrans("", "", sep))
print(like)
运行结果

# Break the text into a list of words. Splitting with no separator cuts on
# any run of whitespace and never yields empty strings.
strList = str.split(like)
print(len(strList), strList)
运行结果

# Collect the distinct words (a set holds each word only once).
strSet = {*strList}
print(len(strSet), strSet)
运行结果

# Build the word -> number-of-occurrences dictionary.
# Counter tallies every word in one pass over strList; calling
# strList.count(word) for each distinct word rescans the whole list each time.
from collections import Counter

strDict = dict(Counter(strList))
print(len(strDict), strDict)
运行结果

# Rank words by frequency.
# Turn the dictionary into a list of (word, count) pairs and show it unsorted.
ccList = list(strDict.items())
print(ccList)
# Order the pairs by count, highest first; ties keep their existing order
# because the sort is stable.
ccList = sorted(ccList, key=lambda pair: pair[1], reverse=True)
print(ccList)
运行结果

# Print the 20 most frequent (word, count) pairs.
# Slicing stops at the end of the list, so a text with fewer than 20
# distinct words prints what is there instead of raising IndexError.
for b in ccList[:20]:
    print(b)
运行结果

# Drop grammatical filler words (pronouns, articles, conjunctions and the
# like) that carry no meaning of their own.
# NOTE(review): only strSet is filtered here; strDict and ccList are left
# untouched by this step.
exclude = {'a','the','and','oh'}
strSet = strSet.difference(exclude)
print(len(strSet), strSet)
运行结果

二、中文小说词频统计
# Count word frequencies in a Chinese novel.
import jieba
from collections import Counter

# Read the whole file into a single lower-cased string. The with-block
# closes the file even if read() raises.
with open('xiyouji.txt', 'r', encoding='utf-8') as t:
    xiyouji = t.read().lower()
print(xiyouji)

# Preprocessing: remove punctuation.
# The original marks are kept and the common full-width Chinese marks are
# added, since a Chinese text is punctuated with those.
sep = ",.、!:?…" + ",。!:?;“”‘’()《》"
# Each mark becomes a space rather than being deleted, so the characters on
# either side of a mark are not fused into one token during segmentation.
xiyouji = xiyouji.translate(str.maketrans(sep, " " * len(sep)))
print(xiyouji)

# Split the text into words. Chinese has no spaces between words, so
# str.split() would return whole lines; jieba performs the segmentation.
# Tokens that are only whitespace (spaces, newlines) are discarded.
strList = [token for token in jieba.lcut(xiyouji) if token.strip()]
print(len(strList), strList)

# Distinct words.
strSet = set(strList)
print(len(strSet), strSet)

# Word -> number-of-occurrences dictionary, tallied in one pass instead of
# calling strList.count() once per distinct word.
strDict = dict(Counter(strList))
print(len(strDict), strDict)

# Rank the (word, count) pairs by count, highest first.
ccList = list(strDict.items())
print(ccList)
ccList.sort(key=lambda x: x[1], reverse=True)
print(ccList)

# Print the top 20; slicing avoids IndexError when there are fewer than 20
# distinct words.
for b in ccList[:20]:
    print(b)

# Drop filler words (particles, pronouns, conjunctions) from the word set.
# NOTE(review): only strSet is filtered here; strDict and ccList are left
# untouched by this step.
exclude = {'啊','哦','了','呢','和','你'}
strSet = strSet - exclude
print(len(strSet), strSet)
运行结果



浙公网安备 33010602011771号