Python Advanced Application Programming: Assignment Requirements
Implement a topic-focused web crawler in Python and complete the tasks below.
(Note: one topic per student, chosen freely; all design material and source code must be submitted to the cnblogs platform.)
I. Topic-Focused Web Crawler Design (15 points)
1. Crawler name
Name: a crawler for news on the Tencent News sports channel
2. Content to be scraped and data characteristics
The crawler collects the titles, body text, and related information (editor and publication time) of news articles on the Tencent News sports channel.
3. Design overview (approach and technical challenges)
The design uses the requests library to fetch pages from the Tencent News site and the BeautifulSoup library to parse them; the collected data is finally saved locally in txt format.
The main technical challenges are analyzing the page structure, extracting the data, and persisting it.
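A minimal sketch of that pipeline (using the article URL and the CSS selectors that appear in the code in Part III):

import requests
from bs4 import BeautifulSoup

# Fetch one article page
url = "https://new.qq.com/omn/20191215/20191215A0K94W00.html"
r = requests.get(url, timeout=30)
r.raise_for_status()

# Parse the title and save it locally as txt
soup = BeautifulSoup(r.text, "html.parser")
title = soup.select("div.LEFT > h1")[0].get_text()
with open("title.txt", "w", encoding="utf-8") as f:
    f.write(title)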
II. Structural Analysis of the Topic Page (15 points)
1. Page parsing (sports news)
(Screenshots: developer-tools view of an article page. The title sits in div.LEFT > h1, each body paragraph in div.content-article > p.one-p, and the publication time in a meta tag named apub:time.)
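A quick way to confirm those selectors from a Python shell (a sketch, assuming the article page is served as static HTML):

import requests
from bs4 import BeautifulSoup

# Fetch one article and count what each selector matches
html = requests.get("https://new.qq.com/omn/20191215/20191215A0K94W00.html",
                    timeout=30).text
soup = BeautifulSoup(html, "html.parser")
print(len(soup.select("div.LEFT > h1")))                  # expect 1 (the title)
print(len(soup.select("div.content-article > p.one-p")))  # the body paragraphs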
III. Crawler Program Design (60 points)
The main body of the crawler must include each of the parts below; attach the source code with detailed comments, and provide a screenshot of the output after each part.
Program code:
# Import the requests and bs4 libraries
import requests
from bs4 import BeautifulSoup

# Fetch the HTML source of a news page
def getNews(url):
    # Guard against request failures
    try:
        r = requests.get(url, timeout=30)
        # Raise an exception if the request failed
        r.raise_for_status()
        # Return the page source
        return r.text
    except:
        return "Error: failed to fetch the page, stopping"

def getContent(html, url):
    soup = BeautifulSoup(html, "html.parser")
    # Extract the news title
    title = soup.select("div.LEFT > h1")
    # Print the title to check that it was scraped
    print('Title: ' + title[0].get_text())
    # Extract the editor/author
    author = soup.select("div.content-article > p.one-p > strong")
    print('Editor: ' + author[1].get_text())
    # News body paragraphs
    text = soup.select("div.content-article > p.one-p")
    n = 0
    for p in text:
        if n > 1:
            print(p.get_text())
        n = n + 1
    # Write to a local file
    fo = open("text.txt", "w+", encoding='utf-8')
    # Write the news title
    fo.writelines(title[0].get_text() + "\n")
    # Write the editor/author
    fo.writelines(author[1].get_text() + '\n')
    # Counter used to skip the first two paragraphs
    n = 0
    for p in text:
        if n > 1:
            fo.writelines(p.get_text() + "\n\n")
        n = n + 1
    # Close the file stream
    fo.close()

def main():
    url = "https://new.qq.com/omn/20191215/20191215A0K94W00.html"
    html = getNews(url)
    # Run the extraction
    getContent(html, url)

# Run the main function
main()
Run results:

Data persistence:

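As an optional check that text.txt was written correctly (my own sketch, not part of the assignment code):

# Read text.txt back to verify the persisted data
with open("text.txt", encoding="utf-8") as f:
    print(f.read()[:200])  # show the first 200 characters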
Code 2:
import pandas as pd
import requests as req
from bs4 import BeautifulSoup

# Fetch a sports-news page and return its HTML source
def getNewsHtml(url):
    # Scraping may fail; stop and return an error marker if it does
    try:
        r = req.get(url, headers={'user-agent': 'Mozilla/5.0'})
        r.raise_for_status()
        html = r.text
        return html
    except:
        return "Error"

# Extract the data of one news page
def getNewDate(html):
    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(html, "html.parser")
    # Extract the news title
    title = soup.select("div.LEFT > h1")
    # Print the title
    print(title[0].text)
    # Extract the publication time from the apub:time meta tag
    pub_time = soup.find_all("meta", attrs={"name": "apub:time"})[0].attrs["content"]
    print(pub_time)
    # Extract the body paragraphs
    contents = soup.select("div.content-article > p.one-p")
    text = ""
    n = 0
    # Loop over the p tags, skipping the first two (metadata)
    for p in contents:
        if n > 1:
            # Concatenate the paragraph text
            text = text + p.text
        n = n + 1
    return [title[0].text, text, pub_time]

# Crawl every URL in the urls list
def forNewUrl(urls):
    List = []
    for url in urls:
        # Fetch the page source
        html = getNewsHtml(url)
        # Extract the page's data
        newdata = getNewDate(html)
        List.append(newdata)
    return List

# Save the scraped news data
def saveNewDate(ListNewsDate, newPath):
    writer = pd.ExcelWriter(newPath)
    # Convert the data to a DataFrame so it can be stored in an Excel sheet
    df = pd.DataFrame(ListNewsDate, columns=["NewTitle", "NewContent", "createtime"])
    # Write the DataFrame to the workbook
    df.to_excel(writer, sheet_name="ListNewsDate1")
    writer.save()

# URLs of the news pages to crawl
# url = "https://new.qq.com/rain/a/SPO2019121602087000"
urls = ["https://new.qq.com/rain/a/SPO2019121602087000",
        "https://new.qq.com/omn/20191218/20191218A0NMFX00.html",
        "https://new.qq.com/omn/20191218/20191218A0OTX800.html",
        "https://new.qq.com/omn/20191218/20191218A0JR4H00.html",
        "https://new.qq.com/omn/20191218/20191218A0OO9M00.html",
        "https://new.qq.com/omn/20191218/20191218A0JVAA00.html",
        "https://new.qq.com/omn/20191218/20191218A0HDXZ00.html",
        "https://new.qq.com/omn/20191218/20191218A0F26Y00.html",
        "https://new.qq.com/omn/20191218/20191218A0F1T500.html",
        "https://new.qq.com/omn/20191218/20191218A0ENJ800.html",
        "https://new.qq.com/omn/20191218/20191218A0E85400.html",
        "https://new.qq.com/rain/a/20191218A0CEBN00",
        "https://new.qq.com/omn/20191218/20191218A0CAJB00.html",
        "https://new.qq.com/omn/20191218/20191218A0BPK400.html",
        "https://new.qq.com/omn/20191218/20191218A0BNTG00.html",
        "https://new.qq.com/rain/a/20191218A0BNI300",
        "https://new.qq.com/omn/20191218/20191218A0BM8G00.html",
        "https://new.qq.com/omn/20191218/20191218A0BFS000.html",
        "https://new.qq.com/omn/20191218/20191218A0B3AT00.html",
        "https://new.qq.com/rain/a/20191218A0B0CI00",
        "https://new.qq.com/omn/20191218/20191218A0AUGQ00.html",
        "https://new.qq.com/omn/20191218/20191218A0A42300.html",
        "https://new.qq.com/omn/20191218/20191218A09YES00.html",
        "https://new.qq.com/omn/20191218/20191218A09XPJ00.html",
        "https://new.qq.com/omn/20191218/20191218A09MW500.html",
        "https://new.qq.com/omn/20191218/20191218A09AGO00.html",
        "https://new.qq.com/omn/20191218/20191218A08E6V00.html",
        "https://new.qq.com/omn/20191218/20191218A067ZI00.html",
        "https://new.qq.com/omn/20191218/20191218A046ZD00.html",
        "https://new.qq.com/omn/20191218/20191218A0424P00.html"]

def run():
    ListNewsDate = forNewUrl(urls)
    saveNewDate(ListNewsDate, "ListNewsDate.xlsx")

# Execute
run()
Terminal output:

Persisted data file ListNewsDate.xlsx:

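To verify the persisted workbook, it can be read back with pandas (a sketch; sheet name as written above):

import pandas as pd

# Read the persisted Excel file back and inspect the first rows
df = pd.read_excel("ListNewsDate.xlsx", sheet_name="ListNewsDate1")
print(df.shape)   # one row per scraped article
print(df.head())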
Data cleaning code:
import pandas as pd
import seaborn as sns

# Fix garbled Chinese labels in seaborn plots
sns.set_style('whitegrid', {'font.sans-serif': ['simhei', 'Arial']})
# Read the persisted file
ListNewsDate = pd.read_excel("ListNewsDate.xlsx", sheet_name="ListNewsDate1")
print(ListNewsDate.head(5))
# Inspect the createtime column
ListNewsDate['createtime']

# Data cleaning: split createtime into a date part and an hour part
hour = []
date = []
for i in range(0, 30):
    ListNewsDate['createtime'][i] = ListNewsDate['createtime'][i].split(":")[0]
    print(ListNewsDate['createtime'][i].split(" ")[0],
          ListNewsDate['createtime'][i].split(" ")[1])
    hour.append(int(ListNewsDate['createtime'][i].split(" ")[1]))
    date.append(ListNewsDate['createtime'][i].split(" ")[0])
# Store the cleaned columns back into ListNewsDate
ListNewsDate['hour'] = hour
ListNewsDate['date'] = date
# Print the hour of each article (all published on 12/18)
print(ListNewsDate['hour'])

# Distribution of news publication times
sns.distplot(ListNewsDate['hour'])

# Count the number of articles published in each hour
sns.countplot(ListNewsDate['hour'])

# Plot publication hour against date
sns.catplot(x="hour", y="date", data=ListNewsDate)
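The string splitting above assumes createtime always looks like "2019-12-18 12:30:00"; a sketch of a more robust variant (my own, not the assignment code) lets pandas parse the timestamps:

# Alternative cleaning: parse createtime with pandas instead of splitting strings
ListNewsDate = pd.read_excel("ListNewsDate.xlsx", sheet_name="ListNewsDate1")
times = pd.to_datetime(ListNewsDate['createtime'])
ListNewsDate['hour'] = times.dt.hour
ListNewsDate['date'] = times.dt.date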

IV. Conclusion (10 points)
1. What conclusions can be drawn from the analysis and visualization of the topic data?
Analyzing the page structure makes it possible to extract the relevant information from each news article.
Because the target is news content and parts of the pages are loaded dynamically, some elements cannot be scraped this way.
The visualizations show that noticeably more news is published around noon and around 8 p.m., with fewer articles at other times.
2. Give a brief summary of how this programming task was completed.
This task showed how to use Python to scrape the desired data and save it locally. Analyzing the page structure on my own, step by step, was quite different from following along in class, and it put the basics of the China University MOOC crawler tutorial into practice.