A Complete Final Project
1. Choose a topic that interests you.
I chose the recent 19th National Congress of the CPC as my crawling target, scraping the site "http://cpc.people.com.cn/19th/GB/414745/414893/index.html?_zbs_baidu_dk".

2. Crawl the relevant data from the web.
Scrape the news titles, sources, and publication times from this page.
import requests
from bs4 import BeautifulSoup

url = 'http://cpc.people.com.cn/19th/GB/414745/414893/index.html?_zbs_baidu_dk'
res = requests.get(url)
res.encoding = 'gb2312'  # people.com.cn serves this page in GB2312
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('.focusBox'):
    if len(news.select('p')) > 0:
        title = news.select('p')[0].text
        link = news.select('a')[0]['href']  # renamed so it does not shadow the list-page url
        resd = requests.get(link)
        resd.encoding = 'gb2312'
        soupd = BeautifulSoup(resd.text, 'html.parser')
        # The page keeps the date and the source in one '.sou' element,
        # so both variables hold the same raw string here.
        time = soupd.select('.sou')[0].text
        source = soupd.select('.sou')[0].text
        # p = soupd.select('.news_content_con')[0].text
        print(title, link, time, source)
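
The '.sou' element bundles the publication date and the source into a single string, which is why time and source print the same text above. Below is a minimal sketch of splitting them apart; it assumes the usual people.com.cn layout of a date followed by "来源：" and the outlet name, and the sample string is illustrative, not taken from the live page.

from datetime import datetime

def split_sou(sou_text):
    # Assumed layout: '2017年10月24日08:15  来源：人民网' (illustrative).
    date_part, _, source_part = sou_text.partition('来源：')
    when = datetime.strptime(date_part.strip(), '%Y年%m月%d日%H:%M')
    return when, source_part.strip()

# Hypothetical usage:
# split_sou('2017年10月24日08:15  来源：人民网')
# -> (datetime.datetime(2017, 10, 24, 8, 15), '人民网')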

3. Analyze the text and generate a word cloud.
import requests
import jieba
from bs4 import BeautifulSoup

url = 'http://cpc.people.com.cn/19th/GB/414745/414893/index.html?_zbs_baidu_dk'
res = requests.get(url)
res.encoding = 'gb2312'
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('.focusBox'):
    if len(news.select('p')) > 0:
        title = news.select('p')[0].text
        link = news.select('a')[0]['href']
        resd = requests.get(link)
        resd.encoding = 'gb2312'
        soupd = BeautifulSoup(resd.text, 'html.parser')
        p = soupd.select('.show_text')[0].text  # full article body
        # print(p)
        break  # one article is enough for the word-cloud demo
words = jieba.lcut(p)  # segment the article into a list of words
counts = {}
for word in words:
    if len(word) == 1:
        continue  # skip single characters (mostly particles and punctuation)
    counts[word] = counts.get(word, 0) + 1
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)
for i in range(10):
    word, count = items[i]
    print('{:<5}{:>2}'.format(word, count))
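
The hand-rolled frequency dictionary works, but the standard library already provides the same thing; an equivalent using collections.Counter:

from collections import Counter

# Count only multi-character tokens, then take the ten most frequent.
counts = Counter(w for w in words if len(w) > 1)
for word, count in counts.most_common(10):
    print('{:<5}{:>2}'.format(word, count))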
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# WordCloud's built-in tokenizer cannot segment Chinese, so feed it the
# jieba tokens joined by spaces instead of the raw text. font_path must
# point to a font file that covers Chinese glyphs (here Microsoft YaHei).
cy = WordCloud(font_path='msyh.ttc').generate(' '.join(words))
plt.imshow(cy)
plt.axis('off')
plt.show()
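
Since the word frequencies are already computed, the cloud can also be built directly from them, which preserves the single-character filtering; a short sketch using wordcloud's generate_from_frequencies:

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build the cloud from the existing counts dict instead of re-tokenizing.
cy = WordCloud(font_path='msyh.ttc').generate_from_frequencies(counts)
plt.imshow(cy)
plt.axis('off')
plt.show()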

4. Interpret the results of the text analysis.
Segmenting the text and rendering the words as a cloud makes the main content and themes of the article easy to grasp at a glance. The remaining code consolidates the crawled records into a DataFrame and saves them to Excel and SQLite.
import requests
from bs4 import BeautifulSoup
import pandas
import sqlite3

def getdetail(url):
    # Fetch one article page and extract its metadata.
    resd = requests.get(url)
    resd.encoding = 'gb2312'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    news = {}
    news['url'] = url
    news['title'] = soupd.select('p')[0].text
    # Date and source share the same '.sou' element on this site.
    news['time'] = soupd.select('.sou')[0].text
    news['source'] = soupd.select('.sou')[0].text
    # news['p'] = soupd.select('.show_text')[0].text
    return news

def onepage(pageurl):
    # Collect detail records for every story linked from one list page.
    res = requests.get(pageurl)
    res.encoding = 'gb2312'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsls = []
    for news in soup.select('.focusBox'):
        if len(news.select('p')) > 0:
            newsls.append(getdetail(news.select('a')[0]['href']))
    return newsls

newstotal = []
dmurl = 'http://cpc.people.com.cn/19th/GB/414745/414893/index.html?_zbs_baidu_dk'
# print(onepage(dmurl))  # quick sanity check of a single list page
newstotal.extend(onepage(dmurl))
# The original also crawled a second list page, but its URL is truncated
# in the source; restore it here to extend the crawl:
# listurl = '...'
# newstotal.extend(onepage(listurl))
df = pandas.DataFrame(newstotal)
print(df.head())
df.to_excel('mnews.xlsx')
with sqlite3.connect('mnewsdb.sqlite') as db:
    df.to_sql('mnewsdb8', con=db)
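
To confirm the rows actually reached SQLite, they can be read back with pandas; a minimal check using the file and table names created above:

import pandas
import sqlite3

# Re-open the database and load the saved table into a DataFrame.
with sqlite3.connect('mnewsdb.sqlite') as db:
    saved = pandas.read_sql_query('SELECT * FROM mnewsdb8', con=db)
print(saved.head())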