Fetching all the campus news
1. Wrap the extraction of all news items on a single news list page into a function.
2. Get the total number of news articles and compute the total number of list pages.
3. Fetch the full details of every news item on every list page.
import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime

newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
def getNewDetail(pageUrl):
    res = requests.get(pageUrl)  # returns a Response object
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            t = news.select('.news-list-title')[0].text  # title
            a = news.select('a')[0].attrs['href']  # link to the article page
            description = news.select('.news-list-description')[0].text
            resd = requests.get(a)  # fetch the article page once
            resd.encoding = 'utf-8'
            soupd = BeautifulSoup(resd.text, 'html.parser')
            content = soupd.select('#content')[0].text
            info = soupd.select('.show-info')[0].text  # metadata: time, author, source, photographer
            d = info.lstrip('发布时间:')[:19]
            dt = datetime.strptime(d, '%Y-%m-%d %H:%M:%S')
            # each field sits between its label and the following whitespace
            author = info[info.find('作者:'):].split()[0].lstrip('作者:')
            source = info[info.find('来源:'):].split()[0].lstrip('来源:')
            photo = info[info.find('摄影:'):].split()[0].lstrip('摄影:')
            print("新闻标题:", t)
            print("链接:", a)
            print("发布时间:", dt)
            print("作者:", author)
            print("来源:", source)
            print("摄影:", photo)
            print("描述:", description)
            getClickCount(a)
            print("正文:", content)
def getClickCount(a):
    newsid = re.search(r"\_(.*).html", a).group(1)[-4:]
    clickcounturl = "http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80".format(newsid)
    clickcount = int(requests.get(clickcounturl).text.split(".html(")[-1].lstrip("'").rstrip("');"))
    print('点击次数:', clickcount)
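# Note on the parsing above: the click-count API is assumed to return a short
# jQuery-style snippet whose tail looks roughly like  .html('308');  (an
# assumption about the response format, not verified here), so splitting on
# ".html(" and stripping the quote and the trailing "');" leaves only the digits.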
def getpagelist(path):
    res = requests.get(path)  # returns a Response object
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsnum = int(soup.select('.a1')[0].text.rstrip('条'))  # total number of news items
    if newsnum % 10 == 0:
        totalpage = newsnum // 10
    else:
        totalpage = newsnum // 10 + 1  # total number of list pages, 10 items per page
    getNewDetail(path)  # the index page is the first list page
    for i in range(2, totalpage + 1):  # the remaining pages are 2.html ... N.html
        pageUrl = path + '{}.html'.format(i)
        getNewDetail(pageUrl)

getpagelist(newsurl)
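The if/else branch in getpagelist is plain ceiling division. A minimal equivalent sketch using the standard library's math.ceil (assuming 10 news items per list page, as above; totalpages is a hypothetical helper name):

import math

def totalpages(newsnum, per_page=10):
    # ceiling division: any remainder adds one more page
    return math.ceil(newsnum / per_page)

# totalpages(237) -> 24, same as 237 // 10 + 1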
4. Pick a topic that interests you, crawl its data, and run a word-segmentation analysis. The topic must not duplicate another student's.
# Scrape news from the Huanqiu (huanqiu.com) tech channel
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import jieba

newsurl = 'http://tech.huanqiu.com/internet/'

def sort(text):
    # replace punctuation with spaces before segmentation
    punct = '''一!“”,。?;’"',.、:\n'''
    for s in punct:
        text = text.replace(s, ' ')
    wordlist = list(jieba.cut(text))
    exclude = {'这', '\u3000', '\r', '\xa0', '的', '_', ' ', '将', '在', '是', '了', '一', '还', '也', '《', '》', '(', ')'}
    set2 = set(wordlist) - exclude  # unique words minus stopwords
    counts = {}
    for key in set2:
        counts[key] = wordlist.count(key)
    dictlist = list(counts.items())
    dictlist.sort(key=lambda x: x[1], reverse=True)  # sort by frequency, descending
    print("top5关键词:")
    for i in range(5):
        print(dictlist[i])
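# The manual counting above rescans wordlist once per unique word; an
# equivalent sketch using the standard library's collections.Counter:
#   from collections import Counter
#   freq = Counter(w for w in wordlist if w not in exclude)
#   print(freq.most_common(5))  # top-5 (word, count) pairs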
def getContent(url):
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup2 = BeautifulSoup(res.text, 'html.parser')
    for news in soup2.select('.l_a'):
        if len(news.select('.author')) > 0:
            author = news.select('.author')[0].text
            print("作者", author)
    content = soup2.select('.la_con')[0].text.rstrip('AD_SURVEY_Add_AdPos("7000531");')  # drop the trailing ad-script text
    print("正文:", content)
    sort(content)  # keyword analysis of the article body
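# Caveat: str.rstrip() removes a trailing *character set*, not a literal
# suffix, so the call above can also eat legitimate trailing characters that
# happen to appear in that set. A suffix-safe sketch (ad is a hypothetical name):
#   ad = 'AD_SURVEY_Add_AdPos("7000531");'
#   if content.endswith(ad):
#       content = content[:-len(ad)]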
def getNewDetails(newsurl):
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    for news in soup.select('.item'):
        # print(news)
        title = news.select('a')[0].attrs['title']
        a = news.select('a')[0].attrs['href']
        brief = news.select('h5')[0].text.rstrip('[详细]')
        time = news.select('h6')[0].text
        dt = datetime.strptime(time, '%Y-%m-%d %H:%M')
        print("新闻标题:", title)
        print("链接:", a)
        print("内容简介:", brief)
        print("时间:", dt)
        getContent(a)
        print('\n')
        # break

res = requests.get(newsurl)  # soup here feeds the pagination code commented out below
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
getNewDetails(newsurl)
# for total in soup.select('#pages'):
#     all = int(total.select('a')[0].text.rstrip('条'))  # total item count, used to compute the page count
#     # print(all)
#     if all % 60 == 0:
#         totalpages = all // 60
#     else:
#         totalpages = all // 60 + 1
#     print(totalpages)
#     for i in range(1, totalpages + 1):  # news from every list page
#         PageUrl = newsurl + '{}.html'.format(i)
#         getNewDetails(PageUrl)
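If the pagination loop above is re-enabled, it is worth pausing between page requests to keep the crawl polite. A minimal sketch using time.sleep (crawl_all_pages is a hypothetical helper; the 1-second interval is an arbitrary choice):

import time

def crawl_all_pages(totalpages):
    # fetch every list page, pausing briefly between requests
    for i in range(1, totalpages + 1):
        getNewDetails(newsurl + '{}.html'.format(i))
        time.sleep(1)  # assumed 1-second delay; tune as needed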
