python疫情爬虫

一、选题的背景

  1. 2019年12月以来,湖北省武汉市持续开展流感及相关疾病监测,发现多起病毒性肺炎病例,均诊断为病毒性肺炎/肺部感染。
  2. 2020年1月20日,国家主席对新型冠状病毒感染的肺炎疫情作出重要指示,强调要把人民群众生命安全和身体健康放在第一位,坚决遏制疫情蔓延势头。
  3. 我们也应当时刻关注疫情发展,希望疫情能尽快结束。
  4. 我选择爬取国内实时疫情对现在疫情进行分析。

 

二、主题式网络爬虫设计方案

1.主题式网络爬虫名称:新冠疫情数据;

2.主题式网络爬虫爬取的内容与数据特征分析:腾讯新闻实时分析;

3.网络爬虫设计方案概述:从网页源代码中找出数据对应标签,对数据进行分析和可视化处理。

 

三、主题页面的结构特征分析

 

1.

 

 

 

  1. HTML 页面解析:在浏览器中按 F12 打开开发者工具查看页面结构;
  2. 节点(标签)查找方法与遍历方法:在所需数据位置右键→查找,即可找到标签;

 

 

四、网络爬虫程序设计

 

1.数据爬取与采集

import requests

import json

import time

import datetime

import matplotlib.pyplot as plt

import numpy as np

 

 

# Query Tencent News' epidemic endpoint; the trailing `_` parameter carries the
# current time in milliseconds.
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it show
# up later as a confusing JSON or KeyError failure.
response = requests.get(url=url, timeout=10)
response.raise_for_status()

# The 'data' field of the response envelope is itself a JSON-encoded string,
# hence the second decoding step.
data = json.loads(response.json()['data'])

# Children of the first areaTree entry; the later sections treat each one as a
# province record with its own 'children' list.
num = data['areaTree'][0]['children']
print(num)

 

 

 

2.对数据进行清洗和处理

 

生成文本:

 

 

 

 

分析数据:

 

 

 

 

 

 

对数据进行清洗

 

 

 

 

查看缺失值

 

 

 

 

 

 

显示缺失行列

 

 

 

 

 

查找重复值

 

 

 

删除、统计

 

 

 

4.数据分析与可视化

 

def _sum_by_province(provinces, section, field):
    """Return ``{province name: summed value}`` over each province's children.

    ``provinces`` is the list stored in ``num``. Every entry supplies a 'name'
    and a 'children' list, and each child holds the counter at
    ``child[section][field]``. Values go through ``int()`` before being added.
    A province with no children maps to 0, and entries that share a name are
    accumulated into a single total.
    """
    totals = {}
    for province in provinces:
        name = province['name']
        subtotal = sum(int(city[section][field]) for city in province['children'])
        totals[name] = totals.get(name, 0) + subtotal
    return totals


# Confirmed cases
total_data = _sum_by_province(num, 'total', 'confirm')

# Suspected cases
total_suspect_data = _sum_by_province(num, 'total', 'suspect')

# Deaths
total_dead_data = _sum_by_province(num, 'total', 'dead')

# Recoveries
total_heal_data = _sum_by_province(num, 'total', 'heal')

# Newly confirmed cases (read from the 'today' section instead of 'total')
total_new_data = _sum_by_province(num, 'today', 'confirm')

 

 

# Flatten the per-province totals into parallel lists for output.
# names: province names; num1..num5: confirmed, suspected, deaths, recoveries
# and newly confirmed values, in that order.
names = [*total_data]
num1, num2, num3, num4, num5 = (
    list(totals.values())
    for totals in (
        total_data,
        total_suspect_data,
        total_dead_data,
        total_heal_data,
        total_new_data,
    )
)

 

 

# Write one CSV row per province to a file named after today's date.
today = datetime.date.today()

# The context manager closes (and therefore flushes) the file as soon as the
# rows are written, rather than leaving the handle open for the rest of the
# script.
with open('./疫情-%s.csv' % (today), 'w', encoding='utf-8') as f:
    f.write('省份,确诊人数,死亡人数,治愈人数,新增确诊\n')
    # num2 (suspected cases) is intentionally left out, matching the header.
    for row in zip(names, num1, num3, num4, num5):
        f.write(','.join(map(str, row)) + '\n')

 

 

 

 

 

 

# Draw the bar charts: four panels on one figure.


def _draw_bar_panel(position, totals, ylabel, color, width=0.5, xlabel=None):
    """Draw one bar chart of per-province totals in the given subplot slot.

    position -- three-digit subplot code handed to plt.subplot (e.g. 221)
    totals   -- dict mapping province name to the value to plot
    ylabel   -- text for the y-axis label
    color    -- bar colour
    width    -- bar width
    xlabel   -- optional x-axis label; no label is set when None

    Returns the axes object created by plt.subplot.
    """
    axes = plt.subplot(position)
    print(totals.keys())
    print(totals.values())
    # Materialise the dict views once and reuse the lists for the bars, the
    # tick labels and the value annotations.
    labels = list(totals.keys())
    values = list(totals.values())
    plt.bar(labels, values, width=width, color=color)
    if xlabel is not None:
        plt.xlabel(xlabel)
    plt.ylabel(ylabel, rotation=90, size=50)
    plt.xticks(labels, rotation=-60, size=50)
    # Write each value just above its bar.
    for label, value in zip(labels, values):
        plt.text(label, value, value, ha='center', va='bottom', size=35)
    return axes


plt.figure(figsize=[100, 60])
# SimHei is selected so the Chinese labels can be rendered.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Confirmed cases
p1 = _draw_bar_panel(221, total_data, "确诊人数", 'green')
print(total_data)

# Newly confirmed cases
p2 = _draw_bar_panel(222, total_new_data, "新增确诊人数", 'yellow')

# Deaths
p3 = _draw_bar_panel(223, total_dead_data, "死亡人数", 'blue', xlabel="地区")

# Recoveries
p4 = _draw_bar_panel(224, total_heal_data, "治愈人数", 'red', width=0.3, xlabel="地区")

plt.show()

 

 

 

 

汇总代码

import datetime
import json
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

 

 

# Query Tencent News' epidemic endpoint; the trailing `_` parameter carries the
# current time in milliseconds.
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it show
# up later as a confusing JSON or KeyError failure.
response = requests.get(url=url, timeout=10)
response.raise_for_status()

# The 'data' field of the response envelope is itself a JSON-encoded string,
# hence the second decoding step.
data = json.loads(response.json()['data'])

# Children of the first areaTree entry; the later sections treat each one as a
# province record with its own 'children' list.
num = data['areaTree'][0]['children']
print(num)

 

# Append every province record to 疫情.txt as pretty-printed JSON. The file is
# opened once for the whole loop instead of once per record; the bytes written
# are unchanged (records are concatenated with no separator between them).
with open("疫情.txt", "a", encoding="utf-8") as f:
    for i in num:
        f.write(json.dumps(i, ensure_ascii=False, indent=2))

 

 

# Build a one-line nationwide summary from the 'chinaTotal' counters and the
# 'lastUpdateTime' field, then print it.
_summary_fields = (
    ("确诊人数:", 'confirm'),
    ("疑似人数:", 'suspect'),
    ("死亡人数:", 'dead'),
    ("治愈人数:", 'heal'),
)
# Label/value pairs are joined back to back, with no separator between them.
chinaTotals = ''.join(
    [label + str(data['chinaTotal'][key]) for label, key in _summary_fields]
    + ["更新日期:" + data['lastUpdateTime']]
)
print(chinaTotals)

 

# Convert the data into two-dimensional tables to make cleaning easier.
# NOTE(review): `Total` and `Country` are not defined anywhere in the visible
# code - confirm where they are built before relying on this section.
# data1 is indexed 1..34 (province rows); data4 is indexed 1..10 (country rows).

data1=pd.DataFrame(Total,index=range(1,35),columns=['省份','确诊人数','疑似人数','死亡人数','治愈人数'])

data4=pd.DataFrame(Country,index=range(1,11),columns=['国家','新增确诊人数'])

print(data1)

# Check for missing values. These are bare expression statements, so when run
# as a script their results are discarded.

data1.isnull()

data4.isnull()

# Show only the rows/columns that contain missing values (result discarded).

data1[data1.isnull().values==True]

data4[data4.isnull().values==True]

# Look for duplicated rows (result discarded).

data1.duplicated()

data4.duplicated()

# Drop duplicated rows; the de-duplicated tables are kept as data2 and data5.

data2=data1.drop_duplicates()

data5=data4.drop_duplicates()

# Inspect null values (result discarded).
# NOTE(review): the original comment says "count nulls", but isna() returns a
# boolean mask rather than a count - confirm the intent.

data2.isna()

data5.isna()

print(data2)

print(data5)

 

def _sum_by_province(provinces, section, field):
    """Return ``{province name: summed value}`` over each province's children.

    ``provinces`` is the list stored in ``num``. Every entry supplies a 'name'
    and a 'children' list, and each child holds the counter at
    ``child[section][field]``. Values go through ``int()`` before being added.
    A province with no children maps to 0, and entries that share a name are
    accumulated into a single total.
    """
    totals = {}
    for province in provinces:
        name = province['name']
        subtotal = sum(int(city[section][field]) for city in province['children'])
        totals[name] = totals.get(name, 0) + subtotal
    return totals


# Confirmed cases
total_data = _sum_by_province(num, 'total', 'confirm')

# Suspected cases
total_suspect_data = _sum_by_province(num, 'total', 'suspect')

# Deaths
total_dead_data = _sum_by_province(num, 'total', 'dead')

# Recoveries
total_heal_data = _sum_by_province(num, 'total', 'heal')

# Newly confirmed cases (read from the 'today' section instead of 'total')
total_new_data = _sum_by_province(num, 'today', 'confirm')

 

 

# Flatten the per-province totals into parallel lists for output.
# names: province names; num1..num5: confirmed, suspected, deaths, recoveries
# and newly confirmed values, in that order.
names = [*total_data]
num1, num2, num3, num4, num5 = (
    list(totals.values())
    for totals in (
        total_data,
        total_suspect_data,
        total_dead_data,
        total_heal_data,
        total_new_data,
    )
)

 

 

# Write one CSV row per province to a file named after today's date.
today = datetime.date.today()

# The context manager closes (and therefore flushes) the file as soon as the
# rows are written, rather than leaving the handle open for the rest of the
# script.
with open('./疫情-%s.csv' % (today), 'w', encoding='utf-8') as f:
    f.write('省份,确诊人数,死亡人数,治愈人数,新增确诊\n')
    # num2 (suspected cases) is intentionally left out, matching the header.
    for row in zip(names, num1, num3, num4, num5):
        f.write(','.join(map(str, row)) + '\n')

 

# Draw the bar charts: four panels on one figure.


def _draw_bar_panel(position, totals, ylabel, color, width=0.5, xlabel=None):
    """Draw one bar chart of per-province totals in the given subplot slot.

    position -- three-digit subplot code handed to plt.subplot (e.g. 221)
    totals   -- dict mapping province name to the value to plot
    ylabel   -- text for the y-axis label
    color    -- bar colour
    width    -- bar width
    xlabel   -- optional x-axis label; no label is set when None

    Returns the axes object created by plt.subplot.
    """
    axes = plt.subplot(position)
    print(totals.keys())
    print(totals.values())
    # Materialise the dict views once and reuse the lists for the bars, the
    # tick labels and the value annotations.
    labels = list(totals.keys())
    values = list(totals.values())
    plt.bar(labels, values, width=width, color=color)
    if xlabel is not None:
        plt.xlabel(xlabel)
    plt.ylabel(ylabel, rotation=90, size=50)
    plt.xticks(labels, rotation=-60, size=50)
    # Write each value just above its bar.
    for label, value in zip(labels, values):
        plt.text(label, value, value, ha='center', va='bottom', size=35)
    return axes


plt.figure(figsize=[100, 60])
# SimHei is selected so the Chinese labels can be rendered.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Confirmed cases
p1 = _draw_bar_panel(221, total_data, "确诊人数", 'green')
print(total_data)

# Newly confirmed cases
p2 = _draw_bar_panel(222, total_new_data, "新增确诊人数", 'yellow')

# Deaths
p3 = _draw_bar_panel(223, total_dead_data, "死亡人数", 'blue', xlabel="地区")

# Recoveries
p4 = _draw_bar_panel(224, total_heal_data, "治愈人数", 'red', width=0.3, xlabel="地区")

plt.show()

 

 

五、总结

1.由数据可以看出确诊人数,死亡人数和治愈人数湖北均较高

台湾新增人数明显较高;

2.通过本次作业,我了解到自身短板较多,很多知识没有学习透彻,导致在过程中遇到很多问题,并且不能完整地完成本次作业,之后得多复习学过的知识。

 

posted @ 2021-06-21 18:11  气氕氘氚  阅读(128)  评论(0)    收藏  举报