获取NBA中国官方网站中的新闻—解读:骑士三连败的原因何在?
作业要求
选一个自己感兴趣的主题。
网络上爬取相关的数据。
进行文本分析,生成词云。
对文本分析结果解释说明。
写一篇完整的博客,附上源代码、数据爬取及分析结果,形成一个可展示的成果。
1、使用360极速浏览器打开网页“http://china.nba.com/a/20171030/030246.htm”,在空白地方点击鼠标右键调出查看源代码选项。
通过网页源代码可以找到标题对应的代码,从中可以看出每条新闻的标题与链接。
2、直接获取新闻的所有内容
爬取到数据之后,对数据进行分析和统计,代码如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
"""Count word frequencies in the crawled article and save the top entries."""
from collections import Counter

# Words left out of the statistics because they carry no meaning here.
EXCLUDED_WORDS = {'无非', '他的', '汗水', '没有'}


def top_words(words, excluded=EXCLUDED_WORDS, limit=100):
    """Return the most frequent words as ``(word, count)`` pairs.

    Single-character words and words in *excluded* are skipped.  The result
    is ordered by descending count; words with equal counts keep the order in
    which they first appeared.  At most *limit* pairs are returned, so a text
    with fewer distinct words than *limit* simply yields a shorter list.
    """
    counts = Counter(
        word for word in words
        if len(word) != 1 and word not in excluded
    )
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return ranked[:limit]


def main():
    """Tokenise D:\\bb.txt with jieba and append the top words to D:\\dd.txt."""
    # Imported here so that top_words() stays usable without jieba installed.
    import jieba

    with open('D:\\bb.txt', 'r', encoding='utf-8') as source:
        txt = source.read()

    # Mode 'a' appends, so repeated runs accumulate results in the same file.
    with open('D:\\dd.txt', 'a', encoding='utf-8') as output:
        for word, count in top_words(jieba.lcut(txt)):
            line = "{:<10}{:>5}".format(word, count)
            print(line)
            output.write(line + '\n')


if __name__ == "__main__":
    main()
"""Fetch one NBA China news article and print its title, time and body."""
import re

import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Return the HTML of *url* decoded as GB2312, or "" if the request fails."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        return ""
    r.encoding = 'gb2312'
    return r.text


def _first_text(nodes):
    """Return the text of the first selected node, or "" when none matched."""
    return nodes[0].get_text() if nodes else ""


def getContent(url):
    """Download the article at *url*, print its parts and return them.

    The returned dict has the keys 'Title', 'Time', 'Paragraph' (a list with
    the text of each body paragraph) and 'Author'.  A field whose selector
    matches nothing (for example when the download failed) is left empty
    instead of raising.
    """
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")

    # Title
    title = _first_text(soup.select("div.hd > h1"))
    print(title)

    # Publication time
    time_text = _first_text(soup.select("div.a_Info > span.a_time"))
    print(time_text)

    # Body paragraphs
    paras = soup.select("div.Cnt-Main-Article-QQ > p.text")

    # NOTE(review): the author selector is assumed by analogy with
    # "span.a_time"; confirm it against the page source.
    author = _first_text(soup.select("div.a_Info > span.a_author"))

    article = {
        'Title': title,
        'Time': time_text,
        'Paragraph': [p.get_text() for p in paras],
        'Author': author,
    }
    print(article)
    return article


def main():
    """Crawl the sample article used in this post."""
    url = "http://china.nba.com/a/20171030/030246.htm"
    getContent(url)


if __name__ == "__main__":
    main()
3、将数据做成词云
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
#coding:utf-8
"""Segment the crawled article with jieba and display it as a word cloud."""
import os

import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# NOTE(review): assumed location of a Chinese-capable font on Windows; point
# this at any TTF/OTF font that contains CJK glyphs on your machine.
FONT_PATH = "C:\\Windows\\Fonts\\simhei.ttf"

with open("D:\\cc.txt", 'r', encoding='utf-8') as source:
    text = source.read()
print(text)

# Chinese text has no spaces between words, so it is segmented first and the
# segmented string, not the raw text, is handed to WordCloud.
wordlist = jieba.cut(text, cut_all=True)
wl_split = "/".join(wordlist)

# Fall back to WordCloud's default font when the configured one is missing.
font_path = FONT_PATH if os.path.exists(FONT_PATH) else None
mywc = WordCloud(font_path=font_path).generate(wl_split)

plt.imshow(mywc)
plt.axis("off")
plt.show()