2020年寒假学习进度第十三天
河北省疫情数据爬取
今天主要完成了河北省疫情数据的爬取,数据从 2 月 8 日起开始抓取(下附 Python 详细代码)。
from lxml import etree
import re
import requests #导入requests包
import SQL as SQL
#url = 'http://www.hebwst.gov.cn/index.do?id=397505&templet=content&cid=45'
#url ='http://www.hebwst.gov.cn/index.do?id=397291&templet=content&cid=45'
#url='http://www.hebwst.gov.cn/index.do?id=395538&templet=content&cid=326'
# Module-level accumulator: ULS() appends the bulletin URLs it keeps to this
# list, and the (commented-out) driver loop at the bottom iterates over it.
hrefs = []
def ULS(pages=6, timeout=10):
    """Collect links to the daily epidemic bulletins into ``hrefs``.

    Requests the first ``pages`` pages of the site search, gathers every
    result link, and appends to the module-level ``hrefs`` list each link
    that contains ``&cid=45``, is not the one explicitly excluded bulletin,
    and is not already in the list.

    Args:
        pages: Number of search-result pages to request (page indices
            ``0 .. pages - 1``).
        timeout: Seconds to wait for each HTTP response; without a timeout
            ``requests`` can block forever on a stalled server.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    site_root = 'http://www.hebwst.gov.cn/'
    search_url = ('http://www.hebwst.gov.cn/index.do?templet=search_list'
                  '&searchType=1&searchText=河北省新型冠状病毒感染的肺炎疫情情况'
                  '&type=search&cid=0&page=')
    excluded = 'http://www.hebwst.gov.cn/index.do?id=395795&templet=content&cid=45'

    # One list of hrefs per search-result page.
    page_links = []
    for page in range(pages):
        url = search_url + str(page)
        print(url)
        response = requests.get(url, timeout=timeout)
        tree = etree.HTML(response.text)
        page_links.append(tree.xpath('//td[@class=\'sy_new_list\']/a//@href'))
    print(page_links)

    for links in page_links:
        for href in links:
            print(href)
            # urljoin copes with relative, root-relative and absolute hrefs;
            # plain concatenation produced malformed URLs for the latter two.
            href = urljoin(site_root, href)
            if '&cid=45' not in href or href == excluded:
                continue
            # Search pages can overlap and ULS() may be called twice; a
            # duplicate URL would later be scraped and inserted twice.
            if href not in hrefs:
                hrefs.append(href)
    print(hrefs)
# Headings that introduce the per-city breakdowns, in the order they are
# expected to appear in the cumulative paragraph.
_SECTION_MARKERS = ("确诊病例中", "死亡病例中", "重症病例中", "出院病例中")

# "<separator><city>市<count>例"; the separator may be an ASCII comma, a
# full-width comma or an enumeration comma.
_CITY_COUNT_RE = re.compile(r"[,,、](.+?市)(.+?例)")


def _split_sections(text, markers):
    """Map each marker to the text between it and the next marker found.

    Markers are searched for in the given order, each one after the previous
    hit. A marker that does not occur maps to an empty string.
    """
    hits = []
    cursor = 0
    for marker in markers:
        pos = text.find(marker, cursor)
        if pos == -1:
            continue
        hits.append((marker, pos))
        cursor = pos + len(marker)

    sections = {marker: "" for marker in markers}
    for idx, (marker, pos) in enumerate(hits):
        end = hits[idx + 1][1] if idx + 1 < len(hits) else len(text)
        sections[marker] = text[pos + len(marker):end]
    return sections


def _city_counts(section):
    """Return the ``(city, count)`` string pairs found in one section."""
    return _CITY_COUNT_RE.findall(section)


def info(url, timeout=10):
    """Scrape one daily bulletin page and store its figures through ``SQL``.

    The page's ``<p>`` text nodes are read by position: node 0 supplies the
    date and the newly reported figures, node 1 the cumulative totals and the
    per-city breakdowns, node 2 the close-contact figures. One province row
    is inserted, then one city row per city in the confirmed-case breakdown,
    and finally the remaining per-city columns are filled in with updates.

    Args:
        url: Address of the bulletin page.
        timeout: Seconds to wait for the HTTP response.

    Raises:
        ValueError: If the page has fewer than three ``<p>`` text nodes or
            no date can be found in the first one.
    """
    print(url)
    response = requests.get(url, timeout=timeout)  # fetch the page with GET
    tree = etree.HTML(response.text)
    text = tree.xpath('//p//text()')
    if len(text) < 3:
        raise ValueError(
            "expected at least 3 <p> text nodes at %s, found %d" % (url, len(text)))
    # Drop ideographic spaces (U+3000) from the first paragraph.
    text[0] = text[0].replace('\u3000', '')
    print(text)
    new_text, total_text, contact_text = text[0], text[1], text[2]

    # Newly reported figures
    date = re.findall(r"(.+?日)", new_text)
    print("时间", date)
    if not date:
        raise ValueError("no date found in the first paragraph of %s" % url)
    xin_que_num = re.findall(r"新增确诊病例(.+?例)", new_text)
    # The per-city list of new cases sits between the first and the second
    # "其中"; partition() yields "" instead of raising when it is absent.
    new_detail = new_text.partition("其中")[2].partition("其中")[0]
    xin_shi_num = _city_counts(new_detail)
    xin_chu_num = re.findall(r"新增治愈出院病例(.+?例)", new_text)
    xin_yi_num = re.findall(r"新增疑似病例(.+?例)", new_text)
    print("新增确诊病例", xin_que_num)
    print("详细新增确诊病例\n", xin_shi_num)
    print("新增治愈出院病例", xin_chu_num)
    print("新增疑似病例\n", xin_yi_num)

    # Cumulative totals (either comma form is accepted before the keyword)
    que_num = re.findall(r"累计报告确诊病例(.+?例)", total_text)
    si_num = re.findall(r"例[,,]其中死亡(.+?例)", total_text)
    zhong_num = re.findall(r"[,,]现有重症病例(.+?例)", total_text)
    yu_num = re.findall(r"[,,]累计治愈出院(.+?例)", total_text)
    print("累计确诊病例", que_num)
    print("死亡病例", si_num)
    print("重症病例", zhong_num)
    print("出院病例", yu_num)

    # Per-city breakdowns: each section runs up to the next heading that is
    # actually present, so a missing heading no longer hides the others.
    sections = _split_sections(total_text, _SECTION_MARKERS)
    que_xi_num = _city_counts(sections["确诊病例中"])
    si_xi_num = _city_counts(sections["死亡病例中"])
    zhong_xi_num = _city_counts(sections["重症病例中"])
    chu_xi_num = _city_counts(sections["出院病例中"])
    print("详细确诊病例", que_xi_num)
    print("详细死亡病例", si_xi_num)
    print("详细重症病例", zhong_xi_num)
    print("详细出院病例", chu_xi_num)

    # Suspected cases
    yisi_num = re.findall(r"疑似病例(.+?例)", total_text)
    print("疑似病例", yisi_num)

    # Close contacts / released from observation / still under observation
    miqie_num = re.findall(r"密切接触者(.+?人)", contact_text)
    jie_num = re.findall(r"解除隔离医学观察(.+?人)", contact_text)
    guan_num = re.findall(r"现有(.+?人)", contact_text)
    print("密切接触者", miqie_num)
    print("接触医学观察", jie_num)
    print("现有医学观察人数", guan_num)

    SQL.insert_province(
        date[0], "".join(xin_que_num), "".join(xin_chu_num),
        "".join(xin_yi_num), "".join(que_num), "".join(si_num),
        "".join(zhong_num), "".join(yu_num), "".join(yisi_num),
        "".join(miqie_num), "".join(jie_num), "".join(guan_num), url)

    # City rows are created from the confirmed-case breakdown first ...
    for city, count in que_xi_num:
        print("CITY:", city)
        print("num:", count)
        SQL.insert_city(date[0], city, count, url)

    # ... and the other columns are then written with updates.
    column_rows = (
        ("New_Confirmed_num", xin_shi_num),
        ("Dead_num", si_xi_num),
        ("Zhong_num", zhong_xi_num),
        ("Cured_num", chu_xi_num),
    )
    for column, rows in column_rows:
        for city, count in rows:
            print("CITY:", city)
            print("num:", count)
            SQL.update_db(city, column, count)
if __name__ =='__main__':
    # Script entry point. The full crawl (collect links with ULS(), clear both
    # tables, scrape every collected link) is left commented out below; as
    # written, only the single bulletin page at the end is scraped.
    #ULS()
    #SQL.delete_db("hebei_info")
    #SQL.delete_db("hebei_city_info")
    #for url in hrefs:
    #info(url)
    info('http://wsjkw.hebei.gov.cn/content/content_14/398316.jhtml')
import pymysql
# Shared connection used by every helper in this module; it is opened as a
# side effect of importing the module.
# `password`/`database` replace the deprecated `passwd`/`db` aliases, and the
# charset is set explicitly because the stored values contain Chinese text.
db = pymysql.connect(host='localhost',
                     port=3306,
                     user='root',
                     password='',
                     database='yiqing',
                     charset='utf8mb4')
def select_db():
    """Return every row of the ``blog_info`` table.

    Returns:
        The result of the cursor's ``fetchall()`` for
        ``select * from blog_info``.
    """
    sql = "select * from blog_info"
    # The context manager closes the cursor even if the query raises; the
    # shared connection itself is deliberately left open.
    with db.cursor() as cur:
        cur.execute(sql)
        return cur.fetchall()
def delete_db(table):
    """Delete every row of ``table``.

    A failure while executing or committing is printed and the transaction
    is rolled back; it is not re-raised.

    Args:
        table: Name of the table to empty. It is interpolated into the SQL
            text unescaped (identifiers cannot be bound as parameters), so
            it must come from trusted code, never from user or scraped input.
    """
    sql_delete = "delete from " + table + " "
    # The context manager closes the cursor on every path.
    with db.cursor() as cur:
        try:
            cur.execute(sql_delete)
            db.commit()
        except Exception as e:
            print("操作异常:%s" % str(e))
            # Undo the partial work on error.
            db.rollback()
def update_db(city, valuename, value, date=None):
    """Set one column of ``hebei_city_info`` for a city.

    A failure while executing or committing is printed and the transaction
    is rolled back; it is not re-raised.

    Args:
        city: Value matched against the ``city`` column.
        valuename: Name of the column to set. Interpolated into the SQL text
            unescaped (identifiers cannot be bound), so it must come from
            trusted code.
        value: New value for the column; bound as a query parameter.
        date: Optional value matched against the ``Date`` column. When
            omitted, every row for ``city`` is updated regardless of date,
            which is the original behaviour.
    """
    # Values are bound as parameters so that quotes in scraped text can
    # neither break nor inject into the statement.
    sql_update = "update hebei_city_info set " + valuename + "=%s where city=%s"
    params = [value, city]
    if date is not None:
        sql_update += " and Date=%s"
        params.append(date)

    # The context manager closes the cursor on every path.
    with db.cursor() as cur:
        try:
            cur.execute(sql_update, params)
            db.commit()
        except Exception as e:
            print("错误信息:%s" % str(e))
            # Undo the partial work on error.
            db.rollback()
def insert_city(Date, City, Confirmed_num, Url):
    """Insert one row into ``hebei_city_info``.

    A failure while executing or committing is printed and the transaction
    is rolled back; it is not re-raised.

    Args:
        Date: Value for the ``Date`` column.
        City: Value for the ``City`` column.
        Confirmed_num: Value for the ``Confirmed_num`` column.
        Url: Value for the ``Url`` column.
    """
    print(Confirmed_num)
    print(Date)
    print(City)
    print(Url)
    # Values are bound as parameters so that quotes in scraped text can
    # neither break nor inject into the statement.
    sql_insert = ("insert into hebei_city_info (Date, City,Confirmed_num,Url) "
                  "values(%s,%s,%s,%s)")
    params = (Date, City, Confirmed_num, Url)
    print(sql_insert, params)

    # The context manager closes the cursor on every path.
    with db.cursor() as cur:
        try:
            cur.execute(sql_insert, params)
            db.commit()
        except Exception as e:
            print("错误信息:%s" % str(e))
            # Undo the partial work on error.
            db.rollback()
def insert_province(Date, New_Confirmed_num, New_Cured_num, New_Yisi_num,
                    Confirmed_num, Dead_num, Zhong_num, Cured_num, Yisi_num,
                    Miqie_num, None_Guan_num, Guan_num, Url):
    """Insert one row into ``hebei_info``.

    Each argument is stored in the column of the same name. A failure while
    executing or committing is printed and the transaction is rolled back;
    it is not re-raised.
    """
    columns = ("Date, New_Confirmed_num, New_Cured_num,New_Yisi_num,"
               "Confirmed_num,Dead_num,Zhong_num,Cured_num,Yisi_num,"
               "Miqie_num,None_Guan_num,Guan_num,Url")
    params = (Date, New_Confirmed_num, New_Cured_num, New_Yisi_num,
              Confirmed_num, Dead_num, Zhong_num, Cured_num, Yisi_num,
              Miqie_num, None_Guan_num, Guan_num, Url)
    # One bound placeholder per value: quotes in scraped text can neither
    # break nor inject into the statement.
    placeholders = ",".join(["%s"] * len(params))
    sql_insert = "insert into hebei_info (" + columns + ") values(" + placeholders + ")"
    print(sql_insert, params)
    print("AAAA")  # leftover debug marker, kept so the console output is unchanged

    # The context manager closes the cursor on every path.
    with db.cursor() as cur:
        try:
            cur.execute(sql_insert, params)
            db.commit()
        except Exception as e:
            print("错误信息:%s" % str(e))
            # Undo the partial work on error.
            db.rollback()
if __name__ == "__main__":
    # Manual smoke test: print the third column of every row, then the first
    # column of the first row.
    values = select_db()
    for value in values:
        print(value[2])
    # Reuse the rows already fetched instead of running the query a second
    # time, and skip the summary when the table is empty (indexing would
    # raise IndexError).
    if values:
        print("查询结果:%s" % str(values[0][0]))
    # Delete
    #delete_db()
    # Update
    #update_db()
    #insert_db()
数据库中爬取到的数据详情:


浙公网安备 33010602011771号