First Pair Programming Assignment of Sophomore Spring Semester (Phase 1)

Today I mainly studied the basics of using Python to scrape data and store it in a database:

The working environment is Jupyter Notebook, a browser-based tool in which you can edit and run code directly and write explanatory notes alongside it.

Basic usage of urllib

from urllib import request

# add header info -- the most basic way around anti-scraping checks
url = "http://www.bilibili.com/"
header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"
}
req = request.Request(url, headers=header)
res = request.urlopen(req)  # get the response
print(res.info())     # response headers
print(res.getcode())  # status code: 2xx OK, 3xx redirect, 4xx problem with the requested resource, 5xx server problem
print(res.geturl())   # URL of the response
html = res.read()
html = html.decode("utf-8")
print(html)
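
A request like this can also fail outright (blocked user agent, dead link, server error). As a side note, a minimal sketch of catching those failures with urllib.error, reusing the url and header defined above:

from urllib import request, error

try:
    res = request.urlopen(request.Request(url, headers=header))
    print(res.getcode())
except error.HTTPError as e:
    print("HTTP error:", e.code, e.reason)  # the server answered, but with 4xx/5xx
except error.URLError as e:
    print("URL error:", e.reason)           # no connection at all (DNS failure, refused, ...)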

Basic usage of requests

import requests

url = "http://www.bilibili.com/"
res = requests.get(url)
print(res.encoding)  # taken from the charset in the Content-Type response header; if no charset is set, it falls back to ISO-8859-1
print(res.headers)   # response headers
print(res.url)
print(res.text)
print(res.status_code)
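
Because of that ISO-8859-1 fallback, Chinese pages fetched with requests can come back garbled. A small sketch of the usual fix, letting requests guess the encoding from the body via apparent_encoding:

import requests

res = requests.get("http://www.bilibili.com/")
if res.encoding and res.encoding.lower() == "iso-8859-1":
    res.encoding = res.apparent_encoding  # guessed from the bytes of the body
print(res.encoding)
print(res.text[:200])  # first 200 characters, which should now decode correctly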

Parsing content with beautifulsoup4

from bs4 import BeautifulSoup
import requests

url = "http://wsjkw.sc.gov.cn/scwsjkw/gzbd/fyzt.shtml"
res = requests.get(url)
res.encoding = "utf-8"
html = res.text
soup = BeautifulSoup(html, "html.parser")  # name a parser explicitly to avoid a warning
print(soup.find("h2").text)  # text of the first <h2> element
a = soup.find("a")           # first <a> element
print(a)
print(a.attrs)          # all attributes as a dict
print(a.attrs["href"])  # the link target

url_new = "http://wsjkw.sc.gov.cn" + a.attrs["href"]  # the href is site-root-relative, so prepend the domain
print(url_new)
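
String concatenation works here only because this particular href starts from the site root. A more robust sketch uses urljoin from the standard library, which resolves any relative link against the page it came from:

from urllib.parse import urljoin

base = "http://wsjkw.sc.gov.cn/scwsjkw/gzbd/fyzt.shtml"
print(urljoin(base, a.attrs["href"]))  # a is the tag found above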

Parsing with re

import re

res = requests.get(url_new)
res.encoding = "utf-8"
soup = BeautifulSoup(res.text, "html.parser")
context = soup.select("p")  # all <p> elements
text = context[1].text      # the second <p> holds the case summary
print(text)
pattern = r"确诊病例(\d+)例"  # matches "确诊病例N例" (N confirmed cases) and captures N
match = re.search(pattern, text)  # use a new name so the response object res is not shadowed
print(match)
print(match.groups())  # all captured groups
print(match.group(0))  # the whole match
print(match.group(1))  # the first captured group: the number itself
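
re.search only returns the first match. If a paragraph reports several figures, re.findall collects every captured group at once; a tiny sketch against a made-up sentence:

import re

sample = "确诊病例3例,治愈病例2例,死亡病例0例"  # hypothetical text for illustration
print(re.findall(r"(\d+)例", sample))  # ['3', '2', '0']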

Scraping the epidemic data

import requests
import json

url = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"
res = requests.get(url)
d = json.loads(res.text)          # the response body is JSON
data_all = json.loads(d["data"])  # the "data" field is itself a JSON string, so parse it again
print(data_all)
print(data_all["areaTree"][0].keys())    # keys of the first country entry (China)
print(data_all["areaTree"][0]["name"])   # country name
print(data_all["areaTree"][0]["today"])  # today's numbers
print(data_all["areaTree"][0]["total"])  # cumulative numbers
print(data_all["areaTree"][0]["children"])       # per-province data
print(len(data_all["areaTree"][0]["children"]))  # number of provinces
for i in data_all["areaTree"][0]["children"]:
    print(i["name"])  # province names
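
The nested areaTree structure flattens naturally into per-province rows, which is close to the shape the database step below needs. A minimal sketch using the same keys printed above:

rows = []
for prov in data_all["areaTree"][0]["children"]:  # China's provinces
    rows.append({
        "province": prov["name"],
        "confirm": prov["total"]["confirm"],      # cumulative confirmed
        "confirm_add": prov["today"]["confirm"],  # newly confirmed today
        "heal": prov["total"]["heal"],
        "dead": prov["total"]["dead"],
    })
print(rows[:3])  # first three provinces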

Storing into the database

import pymysql
import time
import json
import traceback
import requests
def get_tencent_data():
    """
    :return: history data and today's detailed per-city data
    """
    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5'
    url_his = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_other'  # the history data lives at this second endpoint

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    }
    r = requests.get(url, headers=headers)  # headers must be a keyword argument; passed positionally it would be treated as query params
    res = json.loads(r.text)  # JSON string -> dict
    data_all = json.loads(res['data'])

    # fetch and parse the history endpoint the same way
    r_his = requests.get(url_his, headers=headers)
    res_his = json.loads(r_his.text)
    data_his = json.loads(res_his['data'])

    history = {}  # history data
    # The chinaDayList / chinaDayAddList arrays used to be part of the
    # disease_h5 response, but only disease_other carries them now. The
    # old loops over data_all no longer work; the fix is simply to
    # iterate over data_his instead:
    for i in data_his["chinaDayList"]:
        ds = "2020." + i["date"]
        tup = time.strptime(ds, "%Y.%m.%d")
        ds = time.strftime("%Y-%m-%d", tup)  # reformat the date; the database column is DATETIME and rejects the raw format
        confirm = i["confirm"]
        suspect = i["suspect"]
        heal = i["heal"]
        dead = i["dead"]
        history[ds] = {"confirm": confirm, "suspect": suspect, "heal": heal, "dead": dead}
    for i in data_his["chinaDayAddList"]:
        ds = "2020." + i["date"]
        tup = time.strptime(ds, "%Y.%m.%d")
        ds = time.strftime("%Y-%m-%d", tup)
        confirm = i["confirm"]
        suspect = i["suspect"]
        heal = i["heal"]
        dead = i["dead"]
        history[ds].update({"confirm_add": confirm, "suspect_add": suspect, "heal_add": heal, "dead_add": dead})
        
    # nothing below needs to change
    details = []  # today's detailed data
    update_time = data_all["lastUpdateTime"]
    data_country = data_all["areaTree"]  # list of countries (25 at the time of writing)
    data_province = data_country[0]["children"]  # China's provinces
    for pro_infos in data_province:
        province = pro_infos["name"]  # province name
        for city_infos in pro_infos["children"]:
            city = city_infos["name"]
            confirm = city_infos["total"]["confirm"]
            confirm_add = city_infos["today"]["confirm"]
            heal = city_infos["total"]["heal"]
            dead = city_infos["total"]["dead"]
            details.append([update_time, province, city, confirm, confirm_add, heal, dead])
    return history, details
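
pymysql is imported above but never called, so the "storing into the database" step is still missing its last piece. Below is a hedged sketch of what it could look like: the connection parameters and the details table (with columns matching the lists built above) are assumptions about the local setup, not part of the original code:

def save_details(details):
    # assumed local MySQL credentials and a pre-created `details` table
    conn = pymysql.connect(host="127.0.0.1", user="root", password="123456",
                           db="cov", charset="utf8")
    try:
        with conn.cursor() as cursor:
            sql = ("INSERT INTO details(update_time, province, city, "
                   "confirm, confirm_add, heal, dead) "
                   "VALUES(%s, %s, %s, %s, %s, %s, %s)")
            # one row per [update_time, province, city, confirm, confirm_add, heal, dead]
            cursor.executemany(sql, details)
        conn.commit()  # commit once after all inserts
    except Exception:
        traceback.print_exc()
        conn.rollback()
    finally:
        conn.close()

history, details = get_tencent_data()
save_details(details)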

 
