python爬虫课程网络设计(中国各省疫情分析)
1、选题背景
全国上下统一部署全力防控疫情扩散。我们可以从多个渠道获取疫情发展的最新数据,网上也有不少程序爬取相关数据,并做可视化的案例。并在自己所学的范围里进行程序设计
目的是为了对新冠疫情进行实时观测。
2、主题式爬虫方案
(1)主题式爬虫名称:新冠肺炎实时数据及其可视化
(2)爬取的内容与数据特征:通过爬取腾讯实时疫情追踪(url=https://news.qq.com/zt2020/page/feiyan.htm#/),爬取实时性的新冠数据。
(3)主题式网络爬虫设计方案概述:通过网络爬虫对疫情进行爬取,爬取后将其进行整理,同时对数据进行运用绘制成柱状图,再次进行爬取保存。
难点:对于json的应用
3、主题页面的结构特征分析
(1)主题页面的结构与特征分析。
(2)HTML页面解析
.
(3)节点标签查找方法与遍历方法
4、网络爬虫程序设计
(1)数据的爬取与采集
(2)对数据进行清洗和处理
import time, json, requests
import csv

# Output file: a running daily log of per-city epidemic figures.
ExcelName = '2.3疫情日报.csv'

# Millisecond timestamp appended as a cache-busting query parameter.
# (The original multiplied by 100, which is not milliseconds.)
number = format(time.time() * 1000, '.0f')

url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%s' % number
# The API nests a JSON string inside the response's 'data' field,
# so the payload has to be decoded twice.
datas = json.loads(requests.get(url=url).json()['data'])

print('更新时间:' + datas['lastUpdateTime'])

# Open the file once and stream every row through a single writer,
# instead of reopening the file for each individual row.
with open(ExcelName, 'a', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # First row records the feed's last-update time.
    writer.writerow(['更新时间:' + datas['lastUpdateTime']])
    for country in datas['areaTree']:
        if country['name'] != '中国':
            continue
        for province in country['children']:
            # Province name on its own row, then one row per city.
            print(province['name'])
            writer.writerow([province['name']])
            for city in province['children']:
                total = city['total']
                row = [city['name'],
                       '确诊:' + str(total['confirm']),
                       '死亡:' + str(total['dead']),
                       '治愈:' + str(total['heal'])]
                print(*row)
                writer.writerow(row)
所得数据为
(3)数据分析与可视化
# --- Fetch and interactively explore the Tencent real-time epidemic feed ---
import time
import json
import requests

url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)

# The 'data' field is itself a JSON string, hence the second parse.
data = json.loads(requests.get(url=url).json()['data'])

# The payload is large - show a single key/value pair first.
print(data['lastUpdateTime'])

type(data), len(data)  # evaluated for inspection; result discarded outside a REPL

print(data.keys())

# Province-level records live under the first ('areaTree'[0], China) node.
num = data['areaTree'][0]['children']
# print(num)          # far too much output - print one entry instead
print(num[15])

hunan = num[15]  # NOTE(review): index 15 is assumed to be 湖南 - confirm

hunan.keys()
hunan['total']

hunan['children'][0]['total']  # totals for the province's first city
# Aggregate the confirmed count for every province in `num`.
total_data = {}
for item in num:
    if item['name'] not in total_data:
        total_data[item['name']] = 0
    for city_data in item['children']:
        total_data[item['name']] += int(city_data['total']['confirm'])
print(total_data)

# Per-city confirmed counts for the selected province. The original code
# referenced `hunan_children_total_data` without ever defining it, which
# raised a NameError; build it here from the province's city records.
hunan_children_total_data = {
    city['name']: int(city['total']['confirm']) for city in hunan['children']
}
print(hunan_children_total_data)

hb_names = hunan_children_total_data.keys()
hb_numbers = hunan_children_total_data.values()

import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['font.sans-serif'] = ['simhei']  # render Chinese axis labels

# Bar chart: confirmed cases per city within the province.
plt.figure(figsize=[12, 8])
plt.bar(hb_names, hb_numbers)
plt.xlabel("地区", size=12)
plt.ylabel("人数", fontproperties='SimHei', rotation=90, size=12)
plt.title("湖南省不同地区疫情确诊数对比图", size=16)
plt.xticks(list(hb_names), rotation=90, size=12)
plt.show()

names = total_data.keys()
print(names)

numbers = total_data.values()
print(numbers)

# Bar chart: confirmed cases per province, nationwide.
# (The duplicate matplotlib/numpy import pair was removed.)
plt.figure(figsize=[12, 8])
plt.bar(names, numbers)
plt.xlabel("地区", size=12)
plt.ylabel("人数", fontproperties='SimHei', rotation=90, size=12)
plt.title("中国不同省份疫情确诊数对比图", size=16)
plt.xticks(list(names), size=12)
plt.show()
import time, json, requests

# Fetch the Tencent real-time epidemic JSON feed ('data' is a nested JSON string).
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)
data = json.loads(requests.get(url=url).json()['data'])
print(data)
print(data.keys())

# Province-level records (34 entries: 湖北 广东 河南 浙江 湖南 安徽 ...).
num = data['areaTree'][0]['children']
print(len(num))
for item in num:
    print(item['name'], end=" ")  # one line, space-separated
print("\n")

# Show the city records of one province.
# NOTE(review): index 23 is assumed to be 湖北 - the feed's ordering is not
# guaranteed; looking the province up by name would be safer.
hubei = num[23]['children']
for item in hubei:
    print(item)
print("\n")

def _sum_by_province(provinces, group, field):
    """Sum each province's city-level counter `field` under `group`
    ('total' or 'today') into a {province_name: count} dict."""
    result = {}
    for item in provinces:
        result.setdefault(item['name'], 0)
        for city_data in item['children']:
            result[item['name']] += int(city_data[group][field])
    return result

# One helper replaces five copy-pasted aggregation loops.
total_data = _sum_by_province(num, 'total', 'confirm')      # confirmed
print(total_data)

total_suspect_data = _sum_by_province(num, 'total', 'suspect')  # suspected
print(total_suspect_data)

total_dead_data = _sum_by_province(num, 'total', 'dead')    # deaths
print(total_dead_data)

total_heal_data = _sum_by_province(num, 'total', 'heal')    # recoveries
print(total_heal_data)

total_new_data = _sum_by_province(num, 'today', 'confirm')  # newly confirmed today
print(total_new_data)

# ------------------------------------------------------------------------------
# Step 2: persist the per-province figures to a CSV file
# ------------------------------------------------------------------------------
names = list(total_data.keys())              # province names
num1 = list(total_data.values())             # confirmed
num2 = list(total_suspect_data.values())     # suspected (all zero in the feed)
num3 = list(total_dead_data.values())        # deaths
num4 = list(total_heal_data.values())        # recoveries
num5 = list(total_new_data.values())         # newly confirmed
for values in (names, num1, num2, num3, num4, num5):
    print(values)

# File named after today's date, e.g. 2020-12-27-all.csv.
n = time.strftime("%Y-%m-%d") + "-all.csv"
# `with` guarantees the handle is closed even if a write raises.
with open(n, 'w', encoding='utf-8') as fw:
    fw.write('province,confirm,dead,heal,new_confirm\n')
    for i in range(len(names)):
        fw.write(names[i] + ',' + str(num1[i]) + ',' + str(num3[i]) + ',' + str(num4[i]) + ',' + str(num5[i]) + '\n')
    print("Over write file!")
import time  # needed for strftime below; missing from the original cell
import pandas as pd

# Load the CSV produced by the crawler step (named after today's date).
n = time.strftime("%Y-%m-%d") + "-all.csv"
data = pd.read_csv(n)
df_world = pd.read_csv(n)  # NOTE(review): duplicate read of the same file
# Summary statistics (count/mean/std/min/quartiles/max) per numeric column.
df_world.describe()
import time  # needed for strftime below; missing from the original cell
import matplotlib.pyplot as plt
import pandas as pd

# Canvas for the nationwide pie chart.
plt.figure(figsize=(15, 10))
plt.rcParams['font.family'] = ['SimHei']  # font that can render Chinese labels

# Load today's CSV produced by the crawler step.
n = time.strftime("%Y-%m-%d") + "-all.csv"
data = pd.read_csv(n)
df_world = pd.read_csv(n)  # NOTE(review): the same file is read 3 times - one read would do
df_citi = pd.read_csv(n)
labels = df_citi['province'].values
data = df_citi['confirm'].values  # rebinds `data` to the confirmed-count column
plt.pie(data, labels=labels, autopct='%1.1f%%', radius=1)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('全国各省新冠疫情比例')
plt.show()
总代码:
# ======================= 总代码: full program, cleaned up =======================
# Fixes applied relative to the original concatenation of cells:
#   * timestamp used time.time()*100 (not milliseconds) -> *1000
#   * the daily-report CSV was reopened for every single row -> opened once
#   * `hunan_children_total_data` was used without being defined (NameError)
#   * raw open/close replaced with `with` (exception-safe)
#   * a verbatim duplicated fetch/aggregate/CSV section was removed
import time, json, requests
import csv

# ---- Part 1: crawl city-level figures into a running CSV log ----
ExcelName = '2.3疫情日报.csv'

# Millisecond timestamp used as a cache-busting query parameter.
number = format(time.time() * 1000, '.0f')

url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%s' % number
# The API nests a JSON string inside the 'data' field, so decode twice.
datas = json.loads(requests.get(url=url).json()['data'])

print('更新时间:' + datas['lastUpdateTime'])
with open(ExcelName, 'a', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['更新时间:' + datas['lastUpdateTime']])
    for country in datas['areaTree']:
        if country['name'] != '中国':
            continue
        for province in country['children']:
            print(province['name'])
            writer.writerow([province['name']])
            for city in province['children']:
                total = city['total']
                row = [city['name'],
                       '确诊:' + str(total['confirm']),
                       '死亡:' + str(total['dead']),
                       '治愈:' + str(total['heal'])]
                print(*row)
                writer.writerow(row)

# ---- Part 2: exploration and bar charts ----
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)
data = json.loads(requests.get(url=url).json()['data'])

print(data['lastUpdateTime'])
print(data.keys())

# Province-level records live under the first (China) node of 'areaTree'.
num = data['areaTree'][0]['children']
print(num[15])
hunan = num[15]  # NOTE(review): index 15 assumed to be 湖南 - confirm

# Confirmed totals per province.
total_data = {}
for item in num:
    total_data.setdefault(item['name'], 0)
    for city_data in item['children']:
        total_data[item['name']] += int(city_data['total']['confirm'])
print(total_data)

# Per-city confirmed counts for the selected province (previously this
# name was referenced but never defined, raising a NameError).
hunan_children_total_data = {
    city['name']: int(city['total']['confirm']) for city in hunan['children']
}
print(hunan_children_total_data)

hb_names = hunan_children_total_data.keys()
hb_numbers = hunan_children_total_data.values()

import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['simhei']  # render Chinese labels

plt.figure(figsize=[12, 8])
plt.bar(hb_names, hb_numbers)
plt.xlabel("地区", size=12)
plt.ylabel("人数", fontproperties='SimHei', rotation=90, size=12)
plt.title("湖南省不同地区疫情确诊数对比图", size=16)
plt.xticks(list(hb_names), rotation=90, size=12)
plt.show()

names = total_data.keys()
print(names)
numbers = total_data.values()
print(numbers)

plt.figure(figsize=[12, 8])
plt.bar(names, numbers)
plt.xlabel("地区", size=12)
plt.ylabel("人数", fontproperties='SimHei', rotation=90, size=12)
plt.title("中国不同省份疫情确诊数对比图", size=16)
plt.xticks(list(names), size=12)
plt.show()

# ---- Part 3: per-province aggregates and CSV export ----
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)
data = json.loads(requests.get(url=url).json()['data'])
print(data.keys())

num = data['areaTree'][0]['children']
print(len(num))
for item in num:
    print(item['name'], end=" ")
print("\n")

# NOTE(review): index 23 assumed to be 湖北 - ordering is not guaranteed;
# looking the province up by name would be safer.
hubei = num[23]['children']
for item in hubei:
    print(item)
print("\n")

def _sum_by_province(provinces, group, field):
    """Sum each province's city-level counter `field` under `group`
    ('total' or 'today') into a {province_name: count} dict."""
    result = {}
    for item in provinces:
        result.setdefault(item['name'], 0)
        for city_data in item['children']:
            result[item['name']] += int(city_data[group][field])
    return result

total_data = _sum_by_province(num, 'total', 'confirm')      # confirmed
print(total_data)
total_suspect_data = _sum_by_province(num, 'total', 'suspect')  # suspected (all zero)
print(total_suspect_data)
total_dead_data = _sum_by_province(num, 'total', 'dead')    # deaths
print(total_dead_data)
total_heal_data = _sum_by_province(num, 'total', 'heal')    # recoveries
print(total_heal_data)
total_new_data = _sum_by_province(num, 'today', 'confirm')  # newly confirmed
print(total_new_data)

# Store the aggregates in a date-named CSV (e.g. 2020-12-27-all.csv).
names = list(total_data.keys())
num1 = list(total_data.values())
num3 = list(total_dead_data.values())
num4 = list(total_heal_data.values())
num5 = list(total_new_data.values())

n = time.strftime("%Y-%m-%d") + "-all.csv"
with open(n, 'w', encoding='utf-8') as fw:
    fw.write('province,confirm,dead,heal,new_confirm\n')
    for i in range(len(names)):
        fw.write(names[i] + ',' + str(num1[i]) + ',' + str(num3[i]) + ',' + str(num4[i]) + ',' + str(num5[i]) + '\n')
print("Over write file!")

# ---- Part 4: summary statistics and pie chart ----
import pandas as pd

n = time.strftime("%Y-%m-%d") + "-all.csv"
df_world = pd.read_csv(n)
df_world.describe()  # count/mean/std/min/quartiles/max per numeric column

plt.figure(figsize=(15, 10))
plt.rcParams['font.family'] = ['SimHei']
df_citi = pd.read_csv(n)
labels = df_citi['province'].values
values = df_citi['confirm'].values
plt.pie(values, labels=labels, autopct='%1.1f%%', radius=1)
plt.axis('equal')  # draw the pie as a circle
plt.title('全国各省新冠疫情比例')
plt.show()
5、总结
我从这次通过网络爬虫直观地观察到现在中国疫情的严重程度。
从本次学习中我有了很大收获,同时也发现了自己的很多问题,比如对json格式的掌握还不够熟练,需要学习更多的python知识,为以后的就业打好基础。