一个完整的大作业
1.选一个自己感兴趣的主题。
我选取动漫资讯作为主题进行爬取,目标网站为“http://news.dmzj.com/”。
2.网络上爬取相关的数据。
爬取此网页中每条新闻的标题、来源和发布时间。
import requests
from bs4 import BeautifulSoup

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

# Download and parse the news list page.
list_url = 'http://news.dmzj.com/'
res = requests.get(list_url, timeout=REQUEST_TIMEOUT)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

# Walk every '.briefnews_con_li' element (presumably one news entry each);
# entries without an <h3> headline or without a link are skipped.
for news in soup.select('.briefnews_con_li'):
    headlines = news.select('h3')
    links = news.select('a')
    if not headlines or not links:
        continue
    title = headlines[0].text
    detail_url = links[0]['href']

    # The publication time and source are read from the article page itself.
    resd = requests.get(detail_url, timeout=REQUEST_TIMEOUT)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    time_nodes = soupd.select('.data_time')
    source_nodes = soupd.select('.data_from')
    if not time_nodes or not source_nodes:
        # The article page lacks the expected metadata: skip it rather than
        # abort the whole crawl with an IndexError.
        continue

    print(title, detail_url, time_nodes[0].text, source_nodes[0].text)
3.进行文本分析,生成词云。
import requests
import jieba
from bs4 import BeautifulSoup
import re
from collections import Counter

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10
# How many of the most frequent words to print.
TOP_N = 10

# Download and parse the news list page.
url = 'http://news.dmzj.com/'
res = requests.get(url, timeout=REQUEST_TIMEOUT)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

# Grab the body text of the first list entry that has an <h3> headline.
p = ''
for news in soup.select('.briefnews_con_li'):
    if not news.select('h3'):
        continue
    detail_url = news.select('a')[0]['href']
    resd = requests.get(detail_url, timeout=REQUEST_TIMEOUT)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    body_nodes = soupd.select('.news_content_con')
    if not body_nodes:
        # This article page has no body container; try the next entry.
        continue
    p = body_nodes[0].text
    break

if not p.strip():
    # Nothing to analyse; stop with a clear message instead of failing later
    # inside the word-cloud generation.
    raise SystemExit('No article text found on ' + url)

# Segment the Chinese text into words with jieba.
words = jieba.lcut(p)

# Count only multi-character tokens: single characters are mostly
# punctuation and particles that say nothing about the topic.
counts = Counter(word for word in words if len(word) > 1)

# most_common() returns at most TOP_N entries, so short articles with fewer
# distinct words no longer raise an IndexError.
for word, count in counts.most_common(TOP_N):
    print("{:<5}{:>2}".format(word, count))

# WordCloud splits its input on whitespace/punctuation and cannot segment
# Chinese by itself, so feed it the jieba tokens joined by spaces rather than
# the raw text. wordcloud does not support Chinese by default; font_path must
# point to a font that contains Chinese glyphs.
cy = WordCloud(font_path='msyh.ttc').generate(' '.join(words))
plt.imshow(cy, interpolation='bilinear')
plt.axis("off")
plt.show()
4.对文本分析结果解释说明
先对文本进行分词并统计词频,再根据分词结果生成词云,让人更直观地了解文本的主要内容和主题。
5.完整代码
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas
import sqlite3
from contextlib import closing

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10
# Last numbered list page ('p2.html', 'p3.html', ...) to crawl in addition
# to the home page.
LAST_PAGE = 2


def _fetch_soup(url):
    """Download *url*, decode it as UTF-8 and return the parsed HTML tree."""
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    res.encoding = 'utf-8'
    return BeautifulSoup(res.text, 'html.parser')


def _first_text(soup, selector):
    """Return the text of the first element matching *selector*, or ''."""
    found = soup.select(selector)
    return found[0].text if found else ''


def getdetail(url):
    """Fetch one article page and return its metadata.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys 'url', 'title', 'time' and 'source'. A field
        whose element is missing from the page is returned as an empty
        string so that one malformed article does not abort the crawl.
    """
    soupd = _fetch_soup(url)
    return {
        'url': url,
        'title': _first_text(soupd, 'h1'),
        'time': _first_text(soupd, '.data_time'),
        'source': _first_text(soupd, '.data_from'),
    }


def onepage(pageurl):
    """Collect the metadata of every article linked from one list page.

    Args:
        pageurl: Address of a news list page.

    Returns:
        A list of dicts as produced by getdetail(), one per
        '.briefnews_con_li' element that contains an <h3> headline and
        at least one link.
    """
    soup = _fetch_soup(pageurl)
    newsls = []
    for news in soup.select('.briefnews_con_li'):
        links = news.select('a')
        if news.select('h3') and links:
            newsls.append(getdetail(links[0]['href']))
    return newsls


# Crawl the home page first, then the numbered list pages.
newstotal = []
dmurl = 'http://news.dmzj.com/'
newstotal.extend(onepage(dmurl))
for i in range(2, LAST_PAGE + 1):
    listurl = 'http://news.dmzj.com/p{}.html'.format(i)
    newstotal.extend(onepage(listurl))

# Persist the results as a spreadsheet and as a SQLite table.
df = pandas.DataFrame(newstotal)
df.to_excel('dmnews.xlsx')

# A sqlite3 connection used as a context manager only commits or rolls back;
# closing() is what actually releases the database file afterwards.
with closing(sqlite3.connect('dmnewsdb.sqlite')) as db:
    with db:
        # Replace the table if it already exists so that the script can be
        # re-run; the default if_exists='fail' raises ValueError on a second
        # run.
        df.to_sql('dmnewsdb8', con=db, if_exists='replace')