python课程设计——网络爬虫爬取新冠疫情实时信息

一、选题背景

对自2019年以来在我国蔓延的新冠肺炎疫情进行数据分析:通过分析各省的确诊人数,更直观地了解疫情防控形势,为防疫战略的部署与规划提供依据,从而提高资源利用效率。

二、设计方案

1.新冠疫情信息爬取与解析

数据源:腾讯新闻 https://news.qq.com/ (程序实际请求的疫情数据接口为 https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5 )

2.主题式网络爬虫爬取的内容与数据特征分析

(1)HTML页面解析

 

(2)节点(标签)查找方法与遍历方法

三、程序设计

1、数据爬取与采集

对腾讯新闻进行爬取

# Fetch the Tencent News real-time epidemic JSON data.
import time
import json
import requests

# The current time in milliseconds is appended as the `_` query parameter
# (presumably to defeat caching -- confirm against the API).
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)

# Without a timeout a stalled connection would block the script forever;
# raise_for_status() turns an HTTP error response into an exception instead of
# letting it fail later with a confusing JSON/KeyError.
response = requests.get(url=url, timeout=10)
response.raise_for_status()

# The 'data' field of the response body is itself a JSON-encoded string,
# hence the second decoding step.
data = json.loads(response.json()['data'])
print(data)

 

 2、对数据进行处理

 返回的数据中,today 表示当日数据,total 表示累计数据;其中 confirm 表示确诊、suspect 表示疑似、dead 表示死亡、heal 表示治愈。

(1)疫情数据现状

# Worldwide epidemic data: the complete area tree.
area_tree = data['areaTree']
# Domestic epidemic data: the first entry of the area tree.
area_tree[0]

 

 

 

 

 (2)通过分析陕西省、福建省数据,分析全国信息

  陕西省:

# Shaanxi Province epidemic data.
# `num` (the list of province entries) has to exist before it is indexed; the
# walkthrough otherwise assigns it for the first time only in the later Fujian
# snippet, so running the snippets in order raised a NameError here.
num = data['areaTree'][0]['children']
# NOTE(review): position 1 is assumed to be Shaanxi; this depends on the order
# in which the API lists provinces -- confirm, or look the entry up by name.
shanxi = num[1]
shanxi['total']
# Xi'an epidemic data (assumed to be the first city entry -- verify).
shanxi['children'][0]['total']

# Total confirmed cases for each area of Shaanxi Province, keyed by area name.
shanxi_children_total_data = {}
for city in shanxi['children']:
    city_name = city['name']
    confirmed = int(city['total']['confirm'])
    # Entries sharing a name are added together.
    shanxi_children_total_data[city_name] = shanxi_children_total_data.get(city_name, 0) + confirmed
print(shanxi_children_total_data)

福建省:

# Fujian Province information: entry 10 of the domestic province list.
china = data['areaTree'][0]
num = china['children']
fujian = num[10]
# Xiamen epidemic data: entry 1 of Fujian's city list.
fujian_cities = fujian['children']
fujian_cities[1]['total']

 

# Total confirmed cases for each area of Fujian Province, keyed by area name.
fujian_children_total_data = {}
for city in fujian['children']:
    # Make sure the area has a running total before adding to it.
    fujian_children_total_data.setdefault(city['name'], 0)
    fujian_children_total_data[city['name']] += int(city['total']['confirm'])
print(fujian_children_total_data)

全国各省确诊数据

# Total confirmed cases per province: the sum over each province's city entries.
total_data = {}
for province in num:
    # A province starts at zero the first time its name is seen.
    total_data.setdefault(province['name'], 0)
    for city in province['children']:
        total_data[province['name']] += int(city['total']['confirm'])
print(total_data)

 (3)对数据进行分析

  用Matplotlib对数据进行绘制

 1 sx_names = shanxi_children_total_data.keys()
 2 sx_numbers = shanxi_children_total_data.values()
 3 import matplotlib.pyplot as plt 
 4 import numpy as np
 5 plt.rcParams['font.sans-serif'] = ['simhei'] 
 6 
 7 # 绘图
 8 plt.figure(figsize=[12,8])
 9 
10 plt.bar(sx_names,sx_numbers)
11 
12 plt.xlabel("地区", size=12)
13 plt.ylabel("人数", fontproperties='SimHei', rotation=90, size=12)
14 plt.title("陕西省不同地区疫情确诊数对比图", size=16)
15 plt.xticks(list(sx_names), rotation=90, size=12)
16     
17 plt.show()

 

 

 1 fj_names = fujian_children_total_data.keys()
 2 fj_numbers = fujian_children_total_data.values()
 3 import matplotlib.pyplot as plt 
 4 import numpy as np
 5 plt.rcParams['font.sans-serif'] = ['simhei'] 
 6 
 7 # 绘图
 8 plt.figure(figsize=[12,8])
 9 
10 plt.bar(fj_names,fj_numbers)
11 
12 plt.xlabel("地区", size=12)
13 plt.ylabel("人数", fontproperties='SimHei', rotation=90, size=12)
14 plt.title("福建省不同地区疫情确诊数对比图", size=16)
15 plt.xticks(list(fj_names), rotation=90, size=12)
16     
17 plt.show()

 

 

 1 names = total_data.keys()
 2 numbers = total_data.values()
 3 import matplotlib.pyplot as plt 
 4 import numpy as np
 5 plt.rcParams['font.sans-serif'] = ['simhei']
 6 # 绘图
 7 plt.figure(figsize=[12,8])
 8 
 9 plt.bar(names,numbers)
10 
11 plt.xlabel("地区", size=12)
12 plt.ylabel("人数", fontproperties='SimHei', rotation=90, size=12)
13 plt.title("中国不同省份疫情确诊数对比图", size=16)
14 plt.xticks(list(names), rotation=90, size=12)
15     
16 plt.show()

 1 #陕西省新增疑似确诊
 2 import json
 3 import requests
 4 import pandas as pd
 5 import matplotlib.pyplot as plt
 6 import numpy as np
 7 import time
 8 import json
 9 import requests
10 url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d'%int(time.time()*1000)
11 # 抓取腾讯新闻疫情实时json数据
12 resp=requests.get(url)
13 html=resp.json()
14 data=json.loads(html["data"])
15 data['areaTree'][0]['children'][1]
16 da=[]
17 for item in data['areaTree'][0]['children'][1]:
18     da.append(item)
19 
20 df = pd.DataFrame({})
21 df["date"] =  [x[0] for x in da]
22 df["confirm"] = [x[1] for x in da]
23 df["suspect"] =  [x[2] for x in da]
24 #设置横坐标间隔
25 limit=1
26 # 设置横坐标的刻度与显示标签
27 plt.xticks(index[::limit], df['date'][::limit],rotation=90)
28 plt.title("每日陕西省新增疑似确诊")
29 ax1 = plt.gca()
30 #设置坐标标题
31 ax1.set(xlabel='date', ylabel='人数')
32 #解决画图中文乱码问题
33 plt.rcParams['font.sans-serif'] = ['SimHei']
34 plt.rcParams['axes.unicode_minus'] = False
35 line1, = ax1.plot(index, df['confirm'], color='black',linestyle= '-',linewidth=2,alpha=0.9,label='confirm')
36 line2, = ax1.plot(index, df['suspect'], 'grey', label='suspect')
37 #设置图例
38 plt.legend(handles=[line1, line2])
39 plt.show()

 

 

 

 1 #每日福建省新增疑似确诊
 2 import json
 3 import requests
 4 import pandas as pd
 5 import matplotlib.pyplot as plt
 6 import numpy as np
 7 import time
 8 import json
 9 import requests
10 url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d'%int(time.time()*1000)
11 # 抓取腾讯新闻疫情实时json数据
12 resp=requests.get(url)
13 html=resp.json()
14 data=json.loads(html["data"])
15 data['areaTree'][0]['children'][10]
16 da=[]
17 for item in data['areaTree'][0]['children'][10]:
18     da.append(item)
19 
20 df = pd.DataFrame({})
21 df["date"] =  [x[0] for x in da]
22 df["confirm"] = [x[1] for x in da]
23 df["suspect"] =  [x[2] for x in da]
24 #设置横坐标间隔
25 limit=1
26 # 设置横坐标的刻度与显示标签
27 plt.xticks(index[::limit], df['date'][::limit],rotation=90)
28 plt.title("每日福建省新增疑似确诊")
29 ax1 = plt.gca()
30 #设置坐标标题
31 ax1.set(xlabel='date', ylabel='人数')
32 #解决画图中文乱码问题
33 plt.rcParams['font.sans-serif'] = ['SimHei']
34 plt.rcParams['axes.unicode_minus'] = False
35 line1, = ax1.plot(index, df['confirm'], color='black',linestyle= '-',linewidth=2,alpha=0.9,label='confirm')
36 line2, = ax1.plot(index, df['suspect'], 'grey', label='suspect')
37 #设置图例
38 plt.legend(handles=[line1, line2])
39 plt.show()

 完整代码:

  1 # 抓取腾讯新闻疫情实时json数据
  2 import time
  3 import json
  4 import requests
  5 url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d'%int(time.time()*1000)
  6 data = json.loads(requests.get(url=url).json()['data'])
  7 print(data)
  8 
  9 #全球疫情数据
 10 data['areaTree']
 11 #国内疫情数据
 12 data['areaTree'][0]
 13 
 14 #陕西省的疫情数据
 15 num = data['areaTree'][0]['children']
 16 shanxi=num[1]
 17 shanxi['total']
 18 #西安疫情数据
 19 shanxi['children'][0]['total']
 20 # 解析陕西省不同地区确诊的总数据
 21 shanxi_children_total_data = {}
 22 for item in shanxi['children']:
 23     if item['name'] not in shanxi_children_total_data:
 24         shanxi_children_total_data.update({item['name']:0})
 25         shanxi_children_total_data[item['name']] += int(item['total']['confirm']) 
 26 print(shanxi_children_total_data)
 27 
 28 # 福建省省份信息
 29 num = data['areaTree'][0]['children']
 30 fujian=num[10]
 31 #厦门疫情数据
 32 fujian['children'][1]['total']
 33 # 解析福建省不同地区确诊的总数据
 34 fujian_children_total_data = {}
 35 for item in fujian['children']:
 36     if item['name'] not in fujian_children_total_data:
 37         fujian_children_total_data.update({item['name']:0})
 38         fujian_children_total_data[item['name']] += int(item['total']['confirm']) 
 39 print(fujian_children_total_data)
 40 
 41 # 解析每个省份确诊的总人数
 42 total_data = {}
 43 for item in num:
 44     if item['name'] not in total_data:
 45         total_data.update({item['name']:0})
 46     for city_data in item['children']:
 47             total_data[item['name']] += int(city_data['total']['confirm'])    
 48 print(total_data)
 49 
 50 #陕西省不同地区疫情对比图
 51 sx_names = shanxi_children_total_data.keys()
 52 sx_numbers = shanxi_children_total_data.values()
 53 import matplotlib.pyplot as plt 
 54 import numpy as np
 55 plt.rcParams['font.sans-serif'] = ['simhei'] 
 56  
 57 # 绘图
 58 plt.figure(figsize=[12,8])
 59 
 60 plt.bar(sx_names,sx_numbers)
 61  
 62 plt.xlabel("地区", size=12)
 63 plt.ylabel("人数", fontproperties='SimHei', rotation=90, size=12)
 64 plt.title("陕西省不同地区疫情确诊数对比图", size=16)
 65 plt.xticks(list(sx_names), rotation=90, size=12)
 66 plt.show()
 67 
 68 #陕西省新增疑似确诊
 69 import json
 70 import requests
 71 import pandas as pd
 72 import matplotlib.pyplot as plt
 73 import numpy as np
 74 import time
 75 import json
 76 import requests
 77 url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d'%int(time.time()*1000)
 78 resp=requests.get(url)
 79 html=resp.json()
 80 data=json.loads(html["data"])
 81 data['areaTree'][0]['children'][1]
 82 da=[]
 83 for item in data['areaTree'][0]['children'][10]:
 84     da.append(item)
 85 df = pd.DataFrame({})
 86 df["date"] =  [x[0] for x in da]
 87 df["confirm"] = [x[1] for x in da]
 88 df["suspect"] =  [x[2] for x in da]
 89 #设置横坐标间隔
 90 limit=1
 91 # 设置横坐标的刻度与显示标签
 92 plt.xticks(index[::limit], df['date'][::limit],rotation=90)
 93 plt.title("每日陕西省新增疑似确诊")
 94 ax1 = plt.gca()
 95 #设置坐标标题
 96 ax1.set(xlabel='date', ylabel='人数')
 97 #解决画图中文乱码问题
 98 plt.rcParams['font.sans-serif'] = ['SimHei']
 99 plt.rcParams['axes.unicode_minus'] = False
100 line1, = ax1.plot(index, df['confirm'], color='black',linestyle= '-',linewidth=2,alpha=0.9,label='confirm')
101 line2, = ax1.plot(index, df['suspect'], 'grey', label='suspect')
102 #设置图例
103 plt.legend(handles=[line1, line2])
104 plt.show()
105 
106 #福建省不同地区疫情对比图
107 fj_names = fujian_children_total_data.keys()
108 fj_numbers = fujian_children_total_data.values()
109 import matplotlib.pyplot as plt 
110 import numpy as np
111 plt.rcParams['font.sans-serif'] = ['simhei'] 
112  
113 # 绘图
114 plt.figure(figsize=[12,8])
115 
116 plt.bar(fj_names,fj_numbers)
117 
118 plt.xlabel("地区", size=12)
119 plt.ylabel("人数", fontproperties='SimHei', rotation=90, size=12)
120 plt.title("福建省不同地区疫情确诊数对比图", size=16)
121 plt.xticks(list(fj_names), rotation=90, size=12)
122 plt.show()
123 
124 #每日福建省新增疑似确诊
125 import json
126 import requests
127 import pandas as pd
128 import matplotlib.pyplot as plt
129 import numpy as np
130 import time
131 import json
132 import requests
133 url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d'%int(time.time()*1000)
134 resp=requests.get(url)
135 html=resp.json()
136 data=json.loads(html["data"])
137 data['areaTree'][0]['children'][10]
138 da=[]
139 for item in data['areaTree'][0]['children'][10]:
140     da.append(item)
141 
142 df = pd.DataFrame({})
143 df["date"] =  [x[0] for x in da]
144 df["confirm"] = [x[1] for x in da]
145 df["suspect"] =  [x[2] for x in da]
146 #设置横坐标间隔
147 limit=1
148 # 设置横坐标的刻度与显示标签
149 plt.xticks(index[::limit], df['date'][::limit],rotation=90)
150 plt.title("每日福建省新增疑似确诊")
151 ax1 = plt.gca()
152 #设置坐标标题
153 ax1.set(xlabel='date', ylabel='人数')
154 #解决画图中文乱码问题
155 plt.rcParams['font.sans-serif'] = ['SimHei']
156 plt.rcParams['axes.unicode_minus'] = False
157 line1, = ax1.plot(index, df['confirm'], color='black',linestyle= '-',linewidth=2,alpha=0.9,label='confirm')
158 line2, = ax1.plot(index, df['suspect'], 'grey', label='suspect')
159 #设置图例
160 plt.legend(handles=[line1, line2])
161 plt.show()
162 
163 #全国不同地区疫情对比图
164 names = total_data.keys()
165 numbers = total_data.values()
166 import matplotlib.pyplot as plt 
167 import numpy as np
168 plt.rcParams['font.sans-serif'] = ['simhei']
169 # 绘图
170 plt.figure(figsize=[12,8])
171 
172 plt.bar(names,numbers)
173 
174 plt.xlabel("地区", size=12)
175 plt.ylabel("人数", fontproperties='SimHei', rotation=90, size=12)
176 plt.title("中国不同省份疫情确诊数对比图", size=16)
177 plt.xticks(list(names), rotation=90, size=12)
178 plt.show()

四、总结

1.通过对新冠疫情信息的爬取与分析,我了解到这次疫情正在逐渐好转,很多省份的病例已经清零;但近期仍有部分地区(如台湾、陕西等)新增病例较多,且多为境外输入。总体来看,疫情逐渐步入尾声,尽管仍有一些地区存在新增病例,但都处于可控范围内,相信在不久的将来疫情就能完全结束。希望现阶段大家仍然不要放松警惕:出门戴口罩、勤洗手,照顾好自己的同时也照顾好别人,不让家人和朋友担心,不给政府和国家添麻烦。

2.个人收获:通过这次网络爬虫对新冠疫情爬取,我掌握了数据清洗的基本步骤,熟悉了用matplotlib制图,加深了对网络爬虫的理解。

posted @ 2021-12-27 23:39  对wcj心动  阅读(460)  评论(0)  编辑  收藏  举报