相关截图



练习代码
import wordcloud
import pandas as pd
import jieba
import matplotlib.pyplot as plt
from nltk.corpus import brown
# Use a raw string so the backslashes in the Windows path can never be
# reinterpreted as escape sequences (e.g. \t, \n) if the path is edited.
# The value is unchanged: none of \W, \F, \m are recognized escapes.
font_path = r'C:\Windows\Fonts\msyh.ttc'
# Mixed English/Chinese sample text for the first word-cloud demo.
text = 'this is shanghai, 李帅, 郭靖, 成龙, 哀牢山 三十六剑'
# NOTE: disabled demo kept as a module-level string (a no-op expression).
# If re-enabled it would render `text` as a small transparent word cloud,
# display it with matplotlib, and save it as '词云.png'.
'''
wc = wordcloud.WordCloud(
font_path=font_path,
max_font_size=300,
width=360,
height=180,
mode='RGBA',
background_color=None, #透明的词云
)
cloudobj = wc.generate(text)
# cloudobj.show()
print(cloudobj)
# 展示词云图
plt.imshow(cloudobj)
# 关闭坐标轴,否则很丑
plt.axis('off')
plt.show()
# 保存高清图片
cloudobj.to_file('词云.png')
'''
# Load the GBK-encoded novel text, one row per line, single column 'txt'.
raw = pd.read_table('./金庸-射雕英雄传txt精校版.txt',names=['txt'],encoding='GBK')
# print(raw)
# Add a chapter id to each row.
# Pre-compute helper variables used by the chapter-heading test below.
def m_head(tmpstr):
    """Return the first character of *tmpstr* ('' when *tmpstr* is empty)."""
    return tmpstr[0:1]
def m_mid(tmpstr):
    """Return the index of the marker "回 " in *tmpstr*, or -1 if absent."""
    marker_pos = tmpstr.find("回 ")
    return marker_pos
# Attach the helper columns needed for chapter-heading detection:
# first character, position of "回 ", and line length.
raw['head'] = raw['txt'].apply(m_head)
raw['mid'] = raw['txt'].apply(m_mid)
raw['len'] = raw['txt'].apply(len)
# Assign a chapter number to every line.  A line counts as a chapter heading
# when it starts with "第", contains "回 ", and is short (fewer than 30 chars).
chapnum = 0
for i in range(len(raw)):
    looks_like_heading = (
        raw['head'][i] == "第" and raw['mid'][i] > 0 and raw['len'][i] < 30
    )
    if looks_like_heading:
        chapnum += 1
    # Everything from the appendix onward is not a real chapter;
    # reset those rows to chapter 0 so they are dropped later.
    if chapnum >= 40 and raw['txt'][i] == "附录一:成吉思汗家族":
        chapnum = 0
    raw.loc[i, 'chap'] = chapnum
# Remove the temporary helper columns.
del raw['head']
del raw['mid']
del raw['len']
# Aggregate lines into chapters: concatenate all txt rows sharing a chap id.
rawgrp = raw.groupby('chap')
# NOTE(review): agg(sum) relies on pandas summing (i.e. concatenating) an
# object/string column; newer pandas versions deprecate or reject the builtin
# `sum` here — confirm against the installed pandas version.
chapter = rawgrp.agg(sum)
# Drop group 0 (front matter / appendix rows).
chapter = chapter[chapter.index != 0]
# Full text of chapter 1.
t = chapter.txt[1]
print("*"*100)
print(t)
print("*"*100)
'''
生成射雕英雄传第一章的词云
'''
# Read the stop-word file into a list.  sep='aaa' is a separator that never
# occurs in the file, so every whole line becomes one value in column 'w'.
stoplist = list(pd.read_csv('./停用词.txt',names=['w'],sep='aaa',encoding='utf-8',engine='python').w)
# print(stoplist)
# print(' '.join(stoplist))
def m_cut(intxt):
    """Tokenize *intxt* with jieba and drop every token found in `stoplist`."""
    tokens = jieba.cut(intxt)
    return [tok for tok in tokens if tok not in stoplist]
# Transparent word cloud for chapter 1, built from space-joined jieba tokens.
wc_chap1 = wordcloud.WordCloud(
    font_path=font_path,
    width=1200,
    height=800,
    mode='RGBA',
    background_color=None,
    stopwords=stoplist,
)
cloudobj = wc_chap1.generate(' '.join(jieba.lcut(chapter.txt[1])))
plt.imshow(cloudobj)
plt.axis('off')
plt.show()
'''
基于分词频数绘制词云
'''
# Draw a cloud directly from explicit word -> frequency pairs.
txt_freq = {'张三': 100, '李四': 90, '王二麻子': 50}
wc_freq = wordcloud.WordCloud(font_path=font_path)
cloudobj = wc_freq.fit_words(txt_freq)
plt.imshow(cloudobj)
plt.axis("off")
plt.show()
'''
基于分词频数绘制射雕英雄传的词云
'''
import nltk
from nltk import FreqDist
# Tokenize chapter 1 and drop stop words.
tokens = m_cut(chapter.txt[1])
# Build the complete token -> count table.
fdist = FreqDist(tokens)
print(type(fdist))
# <class 'nltk.probability.FreqDist'>
# FreqDist behaves as a dict (token -> count), so fit_words accepts it directly.
cloudobj = wordcloud.WordCloud(
font_path=font_path,
background_color=None,
width=1600,
height=1000,
).fit_words(fdist)
plt.imshow(cloudobj)
plt.axis("off")
plt.show()
# NOTE: prose notes below, kept verbatim as a no-op string.  Summary: a mask
# image controls the cloud's shape; width/height are ignored once a mask is
# set ("江北忽略" is a typo for "将被忽略"); pure-white mask areas are excluded,
# so the mask image must use a white canvas.  `scipy.misc.imread` mentioned
# below was removed from SciPy; the code that follows uses imageio instead.
'''
词云的美化:
1,设置背景图片
Mask/遮罩
用于控制词频的整体形状
指定mask后,设置的高和宽江北忽略,遮罩形状被指定图形的形状取代。除全白的部分仍然被保留外,
其余部分会用于绘制词云。因此背景图片的画布一定要设置为白色
字体的大小,布局和颜色也会基于mask生成
必要时需要调整颜色以增强可视效果
# 基本调用方式
from scipy.misc import imread
mask = imread('背景图片')
'''
from imageio import imread
def m_cut2(intxt):
    """Like m_cut, but additionally drops single-character tokens."""
    return [w for w in jieba.cut(intxt)
            if len(w) > 1 and w not in stoplist]
# Masked word cloud: the overall shape is taken from the background image
# (white pixels in the mask are left blank).
wc_mask = wordcloud.WordCloud(
    font_path=font_path,
    mask=imread('射雕背景1.png'),
    mode='RGBA',
    background_color=None,
)
cloudobj = wc_mask.generate(' '.join(m_cut2(chapter.txt[1])))
plt.imshow(cloudobj)
plt.axis("off")
plt.show()