简单入门-美团主页词云

 1 #1.爬取原始数据
 2 # 导入requests库
 3 import requests as rs
 4 #获取网页源代码 修改headers通过基本猫眼发爬虫审查
 5 headers = {
 6     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
 7 }
 8 url='https://maoyan.com'
 9 resp = rs.get(url, headers=headers)
10 print(resp.status_code)
11 print(type(resp))
12 print('----------------')
13 #设置编码格式避免乱码
14 resp.encoding='utf-8'
15 # 保存网页源代码
16 webText=resp.text
17 open("source.txt", "w").write(webText)
18 print('----souce download------------')
19 
20 
# 2. Parse the downloaded page with BeautifulSoup.
from bs4 import BeautifulSoup

# 'html.parser' selects the HTML parser that ships with Python itself.
soup = BeautifulSoup(resp.text, 'html.parser')
# Extract the page's text content and save it. The file is written as UTF-8
# so that the read-back below decodes exactly what was written, regardless of
# the platform's default encoding.
webContent = soup.text
with open("webContent.txt", "w", encoding="utf-8") as content_file:
    content_file.write(webContent)
print('----written---------')


# 3. Build the word cloud with WordCloud and display it with matplotlib.
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read the saved text back.
with open("webContent.txt", encoding="utf-8") as content_file:
    text = content_file.read()
# Remove site boilerplate words, in the same order as the original chain.
# NOTE(review): the original chain also contained several replace("", "")
# calls, which are no-ops as written; presumably they once held characters
# lost when the script was pasted. Add any such strings here if needed.
for noise in ("想看", "预告片", "上映", "猫眼电影", "maoyan"):
    text = text.replace(noise, "")
print(text)
# Font file used to render the words.
# NOTE(review): presumably SimHei was chosen so Chinese glyphs render; this
# absolute path must exist on the machine running the script - confirm.
font = r'/simhei.ttf'
# Generate the word cloud from the cleaned text.
wc = WordCloud(font_path=font, width=1400, height=1400, margin=2).generate(text)
# imshow() draws the word-cloud image; show() displays the figure.
plt.imshow(wc)
plt.show()
# Save the word cloud as an image file.
wc.to_file('webToWordCloud.png')
print('----pic saved---------')

posted @ 2021-09-07 20:46  雪飞就飞  阅读(148)  评论(0)  编辑  收藏  举报