中英文分词生成词频
def cut(path='d:/style.txt'):
    """Read lyric text from *path*, split on spaces, and return a
    word -> frequency dict.

    path -- lyric file to read (default keeps the original hard-coded
            location, so existing callers are unaffected)
    """
    text = ''
    # Concatenate all lines of the lyric file into one string.
    with open(path, 'r') as f:
        for line in f:
            text += line.strip()
    # Replace punctuation with spaces. BUG FIX: str.replace returns a
    # new string; the original discarded the result, so punctuation was
    # never actually removed. (Also renamed the variable: it shadowed
    # the builtin `str`.)
    for old in ',,.。??!!':
        text = text.replace(old, ' ')
    text = text.lower()  # case-fold so "Love" and "love" count together
    words = text.split(' ')  # split on single spaces
    # Count occurrences of each distinct word.
    freq = {}
    for w in set(words):
        freq[w] = words.count(w)
    return freq


def sort_dic(dic):
    """Return (word, count) pairs sorted by count, descending."""
    # key selects the second element (the count) of each tuple
    return sorted(dic.items(), key=lambda d: d[1], reverse=True)


if __name__ == '__main__':
    dic = cut()
    print(sort_dic(dic))
英文通过空格分割,中文使用jieba分词并加载停用词表
二、中文分词
1.准备utf-8编码的文本文件file
2.通过文件读取字符串 str
3.对文本进行预处理
4.分解提取单词 list
5.单词计数字典 set , dict
6.按词频排序 list.sort(key=)
7.排除语法型词汇,代词、冠词、连词等无语义词
8.输出TOP(20)
def ch_cut(file, stop):
    """Segment a Chinese UTF-8 text file with jieba, drop stopwords,
    and return (word, count) pairs sorted by count, descending.

    file -- path of the UTF-8 text file to analyse
    stop -- path of the stopword list, one word per line
    """
    # The input file is UTF-8 (see step 1 in the notes above), so open
    # it explicitly with that encoding instead of the platform default
    # (which is GBK on the Windows setup these d:/ paths target).
    with open(file=file, encoding='utf-8') as f:
        text = f.read()
    tokens = jieba.cut(text)
    # Load the stopword list; a set gives O(1) membership tests instead
    # of the original list's O(n) scan per token.
    stoplist = set()
    with open(file=stop, encoding='utf-8') as stopword:
        for line in stopword:
            stoplist.add(line.strip())
    li = [w for w in tokens if w not in stoplist]
    # Count every word in one pass. BUG FIX (performance): the original
    # called li.count() once per distinct word, which is O(n^2) overall.
    dic = {}
    for key in li:
        dic[key] = dic.get(key, 0) + 1
    return sorted(dic.items(), key=lambda d: d[1], reverse=True)


if __name__ == '__main__':
    # Print the 20 most frequent words.
    print(ch_cut('d:/xiaoshuo.txt', 'd:/stopword.txt')[:20])
生成词云
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt


def cloud(file, stop):
    """Build and display a word cloud from a Chinese text file.

    file -- path of the text file to analyse
    stop -- path of the stopword list, one word per line
    """
    # Read and segment the text with jieba.
    with open(file=file) as f:
        text = f.read()
    tokens = jieba.cut(text)
    # Load the stopwords into stoplist.
    stoplist = []
    with open(file=stop) as stopword:
        for line in stopword:
            stoplist.append(line.strip())
    li = [w for w in tokens if w not in stoplist]
    # WordCloud.generate expects one space-separated string.
    # BUG FIX: the original did `word += " ".join(i)` per token, which
    # joins the *characters* of each token with spaces and so throws
    # away jieba's segmentation; join the token list instead. The
    # original's `word.encode('utf-8')` discarded its result (strings
    # are immutable), so that no-op is dropped.
    word = " ".join(li)
    # Point WordCloud at a Chinese font, otherwise CJK glyphs fail to
    # render.
    font = r'C:\Windows\Fonts\simfang.ttf'
    mywc = WordCloud(font_path=font).generate(word)
    plt.imshow(mywc)
    plt.axis('off')
    plt.show()

浙公网安备 33010602011771号