March 10, 2020: Scraping Web Data with Python and Importing It into a Database

1. Objective

2. PSP Table

3. Source Code

Python part

from bs4 import BeautifulSoup
import json
import pymysql
import requests
import time

url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?from=timeline&isappinstalled=0'  # request URL
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}  # request headers
response = requests.get(url, headers=headers)  # send the HTTP request
content = response.content.decode('utf-8')  # page source as a UTF-8 string
soup = BeautifulSoup(content, 'html.parser')

# Provincial statistics are embedded as JSON in a <script id="getAreaStat"> tag
listA = soup.find_all(name='script', attrs={"id": "getAreaStat"})
# Worldwide confirmed cases sit in <script id="getListByCountryTypeService2">
listB = soup.find_all(name='script', attrs={"id": "getListByCountryTypeService2"})

# Slice off the surrounding script markup to leave bare JSON
# (the offsets are tied to the page layout at the time of writing)
account = str(listA)
world_messages = str(listB)[87:-21]
messages = account[52:-21]
messages_json = json.loads(messages)
world_messages_json = json.loads(world_messages)  # parsed but not inserted by this script

valuesList = []  # one tuple per province
cityList = []    # one tuple per city
L = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # the page carries no date, so record the system time
print(L)

for i in range(len(messages_json)):
    value = (messages_json[i].get('provinceName'),
             messages_json[i].get('confirmedCount'),
             messages_json[i].get('curedCount'),
             messages_json[i].get('deadCount'),
             messages_json[i].get('locationId'))
    valuesList.append(value)
    cityValue = messages_json[i].get('cities')  # per-city breakdown within the province
    for j in range(len(cityValue)):
        cityValueList = (messages_json[i].get('provinceName'),
                         cityValue[j].get('cityName'),
                         cityValue[j].get('confirmedCount'),
                         cityValue[j].get('curedCount'),
                         cityValue[j].get('deadCount'),
                         cityValue[j].get('locationId'))
        cityList.append(cityValueList)

db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='echart_yiqing', charset='utf8')
cursor = db.cursor()

# Bind the timestamp as a parameter instead of concatenating it into the SQL string
sql_province = ("insert into info_copy (Province,Confirmed_num,Cured_num,Dead_num,Code,Date) "
                "values (%s,%s,%s,%s,%s,%s)")
sql_city = ("insert into info_copy (Province,City,Confirmed_num,Cured_num,Dead_num,Code,Date) "
            "values (%s,%s,%s,%s,%s,%s,%s)")

province_rows = [row + (L,) for row in valuesList]
city_rows = [row + (L,) for row in cityList]

try:
    cursor.executemany(sql_province, province_rows)
    cursor.executemany(sql_city, city_rows)
    db.commit()
except Exception as e:
    print('Insert failed, rolling back:', e)  # roll back the transaction on any error
    db.rollback()

db.close()
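
For anyone reproducing this, here is a minimal sketch of what the info_copy table could look like, inferred purely from the column names in the INSERT statements above; the types, the Id key, and the nullable City column are my assumptions, not the actual schema:

import pymysql

# Hypothetical DDL inferred from the INSERT statements; adjust types as needed.
ddl = """
create table if not exists info_copy (
    Id            int auto_increment primary key,  -- assumed surrogate key
    Province      varchar(50),
    City          varchar(50),   -- NULL for province-level rows
    Confirmed_num int,
    Cured_num     int,
    Dead_num      int,
    Code          int,           -- locationId from the scraped JSON
    Date          datetime
)
"""
db = pymysql.connect(host='localhost', user='root', password='123456',
                     database='echart_yiqing', charset='utf8')
with db.cursor() as cursor:
    cursor.execute(ddl)
db.commit()
db.close()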

The other parts reuse what I wrote last time, so I won't repeat them here. The Python part mainly uses requests to send a request to the page and then parses the HTML tags to pick out the information I need; it feels somewhat similar to how a servlet pulls information out of its request object. Also, because the data on that page carries no date, I added a time call to the program to grab the system time and insert it into the database along with each row.
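
One thing worth noting: the hard-coded slices like [52:-21] and [87:-21] depend on the exact length of the script tag's wrapper text and break the moment dxy.cn tweaks its markup. A slightly more defensive sketch using a regular expression instead (the pattern assumes the array is embedded as window.getAreaStat = [...] inside a try/catch, which matched the page at the time but isn't guaranteed):

import json
import re

def extract_json_array(script_text):
    # Assumed layout: try { window.getAreaStat = [ ... ]}catch(e){}
    # Capture from the '[' after '=' up to the ']' before '}catch'.
    match = re.search(r'=\s*(\[.*\])\s*\}\s*catch', script_text, re.S)
    return json.loads(match.group(1)) if match else None

# Usage with the tags found above, e.g.:
# messages_json = extract_json_array(listA[0].string)
# world_messages_json = extract_json_array(listB[0].string)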

4. Screenshots

5. Reflections

After this round of programming I noticed I am somewhat unsure of myself and don't have a great sense of my own skill level: I estimated five hours for the task and actually finished in a little over four. Overall that's acceptable, but while learning to scrape data with Python I found that others always seem to dig up better resources than I do, so I need to learn from them more; the padded tutorial videos along the way also wasted a lot of time. On the other hand, Python really is a high-level language: a program that takes many lines in Java needs only a few in Python. Still, having gotten used to Java, Python's loose typing feels a bit uncomfortable, for example it's easy to misread where a for loop's body begins and ends.
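
To illustrate that last point: in Python the body of a for loop is whatever is indented under it, not whatever sits between braces as in Java, which is exactly what makes the loop's range easy to misread at first:

# Indentation alone marks the loop body (no braces as in Java).
for i in range(3):                 # range(3) yields 0, 1, 2
    print('inside the loop:', i)   # indented -> runs every iteration
print('after the loop')            # dedented -> runs once, after the loop ends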
