一个完整的大作业
1.选取一个自己感兴趣的主题,我选取了搜狐新闻
访问网站:http://news.sohu.com/

2.在浏览器中按F12进入查看器
 
3.从网络上爬取相关的数据,并输出结果
import requests
from bs4 import BeautifulSoup

# Fetch the Sohu news front page and print the first headline + link
# of every ".list16" news list found on the page.
url = 'http://news.sohu.com/'
# timeout keeps the script from hanging forever on a stalled connection
res = requests.get(url, timeout=10)
res.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
res.encoding = 'UTF-8'
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('.list16'):
    items = news.select('li')
    if items:  # some .list16 containers are empty
        title = items[0].text
        links = items[0].select('a')
        if links:  # guard: an <li> without an anchor would raise IndexError
            print(title, links[0]['href'])
 
4.进行文本分析,生成词云
from os import path 
from scipy.misc import imread   import jieba import sys import matplotlib.pyplot as plt from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator   text = open('D:\\sohu.txt').read() wordlist = jieba.cut(text) wl_space_split = " ".join(wordlist) d = path.dirname(__file__) nana_coloring = imread(path.join(d, "D:\\0.jpg")) my_wordcloud = WordCloud( background_color = 'white',                               mask = nana_coloring,                                    max_words = 3000,                                       stopwords = STOPWORDS,                             max_font_size = 70,                                   random_state = 50,            )     text_dict = {   'you': 2993,   'and': 6625,   'in': 2767,   'was': 2525,   'the': 7845,}my_wordcloud = WordCloud().generate_from_frequencies(text_dict)image_colors = ImageColorGenerator(nana_coloring) my_wordcloud.recolor(color_func=image_colors) plt.imshow(my_wordcloud)   plt.axis("off")            plt.show()    my_wordcloud.to_file(path.join(d, "cloudimg.png")) 5.结果
 
 
 
                    
                
 
                
            
浙公网安备 33010602011771号