import jieba
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import jieba.analyse
from pyquery import PyQuery
# Read the whole corpus once; the context manager closes the handle as soon
# as the text is in memory instead of leaving it to the garbage collector.
with open('./santi.txt', 'r', encoding='utf-8') as corpus_file:
    santi_text = corpus_file.read()

# Load the user dictionary BEFORE starting parallel mode.
# NOTE(review): jieba creates its worker pool inside enable_parallel(), so
# entries loaded afterwards may not be visible to the workers -- confirm
# against the jieba version in use.
jieba.load_userdict('./userdict.txt')

try:
    jieba.enable_parallel(4)  # argument is the number of worker processes
except NotImplementedError:
    # jieba's parallel mode is POSIX-only; on other platforms keep going
    # with the default single-process tokenizer instead of crashing.
    pass
# 创建停用词list
def stopwordslist(filepath):
    """Read a stop-word file and return its entries as a list.

    Args:
        filepath: Path to a UTF-8 text file holding one stop word per line.

    Returns:
        A list with one string per line of the file, in file order, with
        leading and trailing whitespace stripped. Blank lines are kept as
        empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed even if decoding
    # fails part-way through the file.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]
# 对句子去除停用词
def movestopwords(sentence):
    """Filter stop words and single-character tokens out of a token stream.

    Args:
        sentence: Iterable of string tokens (``main`` passes the output of
            ``jieba.cut``).

    Returns:
        A list of the tokens that are longer than one character and are not
        listed in ``./stop_words.txt``, in their original order.
    """
    # Build a set once: membership tests are O(1), whereas the list returned
    # by stopwordslist() would be rescanned for every token.
    stopwords = set(stopwordslist('./stop_words.txt'))
    return [
        token for token in sentence
        if len(token) > 1 and token not in stopwords
    ]
def _print_pairs(pairs):
    """Print each ``(label, value)`` pair on its own line, space-separated."""
    for label, value in pairs:
        print(label, value)


def _build_wordcloud(text):
    """Generate a WordCloud from space-separated *text* and return it."""
    cloud = WordCloud(
        background_color="#000000",  # black background
        max_words=50,                # cap on words drawn (library default: 200)
        max_font_size=400,           # largest font size
        min_font_size=10,            # smallest font size (library default: 4)
        random_state=42,             # fixed seed so the rendering is reproducible
        font_path='./SimHei.ttf',    # a CJK-capable font is needed for Chinese text
        # Optional settings left disabled by the original author:
        #   width=800, height=600, colormap='bone',
        #   mask=np.array(Image.open("./zhihu.png"))
    )
    return cloud.generate(text)


def main():
    """Segment the corpus, print three keyword rankings, and render a word cloud.

    Steps:
        1. Strip HTML markup from ``santi_text``, segment it with jieba and
           drop stop words / single-character tokens.
        2. Print the top 100 keywords by TF-IDF, by TextRank, and by raw
           frequency.
        3. Build a word cloud from the filtered tokens, save it to
           ``zzz.png`` and display it with matplotlib.
    """
    top_k = 100
    # jieba part-of-speech filter: place names, nouns, verbal nouns, verbs.
    allowed_pos = ('ns', 'n', 'vn', 'v')

    # PyQuery(...).text() is used here to remove HTML tags before segmentation.
    words = jieba.cut(PyQuery(santi_text).text())
    word_list = movestopwords(words)
    # The jieba.analyse functions and WordCloud.generate take a single
    # string, so the filtered tokens are re-joined with spaces.
    words_split = " ".join(word_list)

    # TF-IDF ranking (banner text is kept verbatim).
    print('以下是tf-tdf算法-------------------------------------------------')
    _print_pairs(jieba.analyse.extract_tags(
        words_split, topK=top_k, withWeight=True, allowPOS=allowed_pos))

    # TextRank ranking.
    print('以下是textrank算法-------------------------------------------------')
    _print_pairs(jieba.analyse.textrank(
        words_split, topK=top_k, withWeight=True, allowPOS=allowed_pos))

    # Plain frequency count: the 100 most common tokens, most frequent first.
    print('以下是纯词频统计-------------------------------------------------')
    _print_pairs(Counter(word_list).most_common(top_k))

    my_wordcloud = _build_wordcloud(words_split)
    # Save before plt.show(): show() can block until the window is closed,
    # and the image should exist on disk even if the viewer is interrupted.
    my_wordcloud.to_file('zzz.png')

    plt.imshow(my_wordcloud)
    plt.axis("off")  # hide both axes
    plt.show()


# Run the analysis only when executed as a script, not when imported.
if __name__ == '__main__':
    main()