数据挖掘与算法 第四次作业
练习一:中文小说 词频统计
代码:
# Exercise 1: word-frequency statistics for a Chinese novel.
#
# The text is read from disk, punctuation is stripped, the remaining text is
# segmented into words with jieba, and the most frequent words are printed.
from collections import Counter

# Punctuation marks and spaces removed from the text before segmentation.
CN_SEP = ',。?!;:‘’“” "" '


def strip_chinese_punctuation(text, sep=CN_SEP):
    """Return *text* with every character that appears in *sep* removed."""
    # One translate pass replaces the chain of per-character replace() calls.
    return text.translate(str.maketrans('', '', sep))


def count_chinese_words(text, tokenize=None):
    """Count how often each word occurs in *text*.

    Args:
        text: The string to analyse.
        tokenize: Callable mapping a string to an iterable of tokens.
            Defaults to ``jieba.lcut``.

    Returns:
        A ``collections.Counter`` mapping each token to its number of
        occurrences. Tokens consisting only of whitespace (newlines,
        indentation) are skipped so they do not show up as "words".
    """
    if tokenize is None:
        # Imported here so the helpers stay usable with a custom tokenizer
        # even where jieba is not installed.
        import jieba
        tokenize = jieba.lcut
    return Counter(token for token in tokenize(text) if token.strip())


def rank_chinese_words(counts):
    """Return the ``(word, count)`` pairs of *counts*, most frequent first."""
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)


def run_chinese_word_count(path='红楼梦.txt', top_n=20):
    """Read the novel at *path* and print its *top_n* most frequent words.

    The intermediate results (raw text, cleaned text, vocabulary, counts,
    ranked list) are printed as well, one per processing step.
    """
    # Read the whole file into a string; `with` closes it even on error.
    with open(path, 'r', encoding='utf-8') as fo:
        strho = fo.read()
    print(strho)

    # Pre-process the text: drop punctuation and spaces.
    strho = strip_chinese_punctuation(strho)
    print(strho)

    # Word counts as a plain dict (single pass over the segmented text).
    strDict = dict(count_chinese_words(strho))

    # Set of distinct words.
    strset = set(strDict)
    print(len(strset), strset)
    print(len(strDict), strDict)

    # Convert the dict into a list of (word, count) pairs.
    print(list(strDict.items()))

    # Sort by frequency, highest first.
    hlouList = rank_chinese_words(strDict)
    print(hlouList)

    # Print the top entries; slicing is safe when fewer than top_n exist.
    for entry in hlouList[:top_n]:
        print(entry)


if __name__ == '__main__':
    run_chinese_word_count()
运行结果:

练习二:英文小说 词频统计
代码:
# Exercise 2: word-frequency statistics for an English novel.
#
# The text is read from disk, punctuation is stripped, the text is split on
# whitespace into words, and the most frequent non-stop-words are printed.
from collections import Counter

# Punctuation marks removed from the text before it is split into words.
EN_SEP = ',。?!;:‘’“”'

# Grammatical words (articles, pronouns, conjunctions, ...) that carry no
# meaning of their own and are left out of the final ranking.
# Matching is case-sensitive.
EN_STOP_WORDS = frozenset({'a', 'and', 'the', 'I', 'you', 'to'})


def strip_english_punctuation(text, sep=EN_SEP):
    """Return *text* with every character that appears in *sep* removed."""
    return text.translate(str.maketrans('', '', sep))


def count_english_words(words):
    """Count how often each item of the iterable *words* occurs.

    Whole words are counted, so 'the' is not counted inside 'other' and
    'a' is not counted once per letter "a" in the text.

    Returns:
        A ``collections.Counter`` mapping each word to its count.
    """
    return Counter(words)


def rank_english_words(counts, exclude=EN_STOP_WORDS):
    """Return ``(word, count)`` pairs of *counts*, most frequent first.

    Args:
        counts: Mapping of word to number of occurrences.
        exclude: Words to leave out of the result; pass an empty
            collection to keep every word.
    """
    kept = [(word, n) for word, n in counts.items() if word not in exclude]
    kept.sort(key=lambda item: item[1], reverse=True)
    return kept


def run_english_word_count(path='英文.txt', top_n=20):
    """Read the text at *path* and print its *top_n* most frequent words.

    The intermediate results of each processing step are printed as well.
    Stop words listed in ``EN_STOP_WORDS`` are excluded from the final top
    list.
    """
    # Read the whole file into a string; `with` closes it even on error.
    with open(path, 'r', encoding='utf-8') as fo:
        stryw = fo.read()
    print(stryw)

    # Remove special symbols.
    stryw = strip_english_punctuation(stryw)
    print(stryw)

    # Split the text into a list of words.
    strlist = stryw.split()
    print(len(strlist), strlist)

    # Set of distinct words.
    strSet = set(strlist)
    print(len(strSet), strSet)

    # Word counts as a plain dict.
    strDict = dict(count_english_words(strlist))
    print(len(strDict), strDict)

    # Convert the dict into a list of (word, count) pairs.
    print(list(strDict.items()))

    # Full ranking by frequency, stop words still included.
    print(rank_english_words(strDict, exclude=()))

    # Vocabulary without the grammatical stop words.
    exset = strSet - EN_STOP_WORDS
    print(len(exset), exset)

    # Print the top entries of the filtered ranking; slicing is safe when
    # fewer than top_n words remain.
    for entry in rank_english_words(strDict)[:top_n]:
        print(entry)


if __name__ == '__main__':
    run_english_word_count()
运行结果:

浙公网安备 33010602011771号