一个完整的大作业
完整的大作业
1.选一个自己感兴趣的主题。
2.网络上爬取相关的数据。
3.进行文本分析,生成词云。
4.对文本分析结果解释说明。
5.写一篇完整的博客,附上源代码、数据爬取及分析结果,形成一个可展示的成果。
1.选取感兴趣的内容:http://bbs.hh010.com/forum-162-1.html


2. 爬取相关数据
import requests
from bs4 import BeautifulSoup
import jieba
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from os import path
from scipy.misc import imread
# Step 1: fetch one page of the forum board and dump every thread title and
# link into a text file that the word-segmentation step reads later.
url = "http://bbs.hh010.com/forum-162-1.html"
# A timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() stops here instead of parsing an HTTP error page.
result = requests.get(url, timeout=10)
result.raise_for_status()
sp = BeautifulSoup(result.text, 'html.parser')
# Anchors with the CSS classes "s xst" are the entries whose text and href
# get written out below.
tag = sp.find_all('a', class_="s xst")
# The text is written as-is: str(bytes) on Python 3 would store the
# b'\xe6...' repr, which cannot be segmented into words afterwards.
# The platform default encoding is kept because the reading step opens this
# file the same way; errors="replace" stops a single unencodable character
# from aborting the whole dump.
with open("C:\\Users\\LiuJiXuan\\Desktop\\test.txt", "w", errors="replace") as f:
    for tmp in tag:
        # get_text() also works for anchors with nested tags, where
        # .string would be None.
        f.write(tmp.get_text() + "\n")
        # .get() tolerates an anchor that has no href attribute.
        f.write(str(tmp.get('href', '')) + "\n")
        f.write("---------------------------------------------------------------" + "\n\n")
3. 进行文本分析,生成词云。
# Step 2: segment the scraped text with jieba and render it as a word cloud
# drawn inside the shape of a mask image.
d = path.dirname(__file__)
# Load the custom dictionary in addition to jieba's default one.
jieba.load_userdict("jieba_dict.txt")
# Read the scraped lines; the context manager closes the handle right away.
with open("C:\\Users\\LiuJiXuan\\Desktop\\test.txt", "r") as src:
    file = src.readlines()
# WordCloud splits on whitespace, so the jieba tokens of every line are
# joined with spaces, with one extra space after each line.
text = "".join(" ".join(jieba.cut(s)) + " " for s in file)
# NOTE(review): imread is imported from scipy.misc, which dropped it in
# SciPy 1.2; on a newer SciPy the import at the top of the script has to be
# replaced by another image loader before this line can run.
background = imread(path.join(d, "./1.png"))
wc = WordCloud(
    font_path='./kaiti.ttf',   # font file used to draw the words
    background_color="white",  # background color
    max_words=2000,            # maximum number of words shown in the cloud
    mask=background,           # mask image that gives the cloud its shape
    max_font_size=720,         # largest font size
    random_state=42,           # fixed seed so the layout is reproducible
)
wc.generate(text)
# Show the cloud in a window, then save it next to the script.
plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file(path.join(d, "wordcloud.png"))
4.文本分析(先通过爬取,把文章的标题和链接写入test.txt中,再用jieba模块进行文本分析,通过加载默认和自定义词典进行分词截取)
test.txt:

自定义词典(jieba_dict.txt):

5.生成词云

程序完整源代码:
import requests
from bs4 import BeautifulSoup
import jieba
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
from os import path
from scipy.misc import imread
# Full program: scrape the thread titles of one forum board page, segment
# them with jieba and render the result as a word cloud.

# --- 1. Scrape the titles -------------------------------------------------
url = "http://bbs.hh010.com/forum-162-1.html"
# A timeout keeps the script from hanging on an unresponsive server, and
# raise_for_status() stops here instead of parsing an HTTP error page.
result = requests.get(url, timeout=10)
result.raise_for_status()
sp = BeautifulSoup(result.text, 'html.parser')
# Anchors with the CSS classes "s xst" are the entries whose text is kept.
tag = sp.find_all('a', class_="s xst")
# Write the titles as UTF-8 text: str(bytes) on Python 3 would store the
# b'\xe6...' repr, which jieba cannot segment into words.
# Only the title text is written; the link and separator lines are left out
# here, presumably so they do not end up in the word cloud.
with open("C:\\Users\\LiuJiXuan\\Desktop\\test.txt", "w", encoding="utf-8") as f:
    for tmp in tag:
        # get_text() also works for anchors with nested tags, where
        # .string would be None.
        f.write(tmp.get_text() + "\n")

# --- 2. Segment the text and draw the word cloud --------------------------
d = path.dirname(__file__)
# Load the custom dictionary in addition to jieba's default one.
jieba.load_userdict("jieba_dict.txt")
# Read back with the same encoding that was used for writing.
with open("C:\\Users\\LiuJiXuan\\Desktop\\test.txt", "r", encoding="utf-8") as src:
    file = src.readlines()
# WordCloud splits on whitespace, so the jieba tokens of every line are
# joined with spaces, with one extra space after each line.
text = "".join(" ".join(jieba.cut(s)) + " " for s in file)
# NOTE(review): imread is imported from scipy.misc, which dropped it in
# SciPy 1.2; on a newer SciPy the import at the top of the script has to be
# replaced by another image loader before this line can run.
background = imread(path.join(d, "./1.png"))
wc = WordCloud(
    font_path='./kaiti.ttf',   # font file used to draw the words
    background_color="white",  # background color
    max_words=2000,            # maximum number of words shown in the cloud
    mask=background,           # mask image that gives the cloud its shape
    max_font_size=720,         # largest font size
    random_state=42,           # fixed seed so the layout is reproducible
)
wc.generate(text)
# Show the cloud in a window, then save it next to the script.
plt.imshow(wc)
plt.axis("off")
plt.show()
wc.to_file(path.join(d, "wordcloud.png"))
浙公网安备 33010602011771号