python爬虫课程网络设计(中国各省疫情分析)
1、选题背景
全国上下统一部署全力防控疫情扩散。我们可以从多个渠道获取疫情发展的最新数据,网上也有不少程序爬取相关数据,并做可视化的案例。并在自己所学的范围里进行程序设计
目的是为了对新冠疫情进行实时观测。
2、主题式爬虫方案
(1)主题式爬虫名称:新冠肺炎实时数据及其可视化
(2)爬取的内容与数据特征:通过爬取腾讯实时疫情追踪(url=https://news.qq.com/zt2020/page/feiyan.htm#/),爬取实时性的新冠数据。
(3)主题式网络爬虫设计方案概述:通过网络爬虫对疫情进行爬取,爬取后将其进行整理,同时对数据进行运用绘制成柱状图,再次进行爬取保存。
难点:对于json的应用
3、主题页面的结构特征分析
(1)主题页面的结构与特征分析。
(2)HTML页面解析
.
(3)节点标签查找方法与遍历方法
4、网络爬虫程序设计
(1)数据的爬取与采集
(2)对数据进行清洗和处理
import time, json, requests
import csv

# Output file: a running daily log of per-city epidemic figures.
ExcelName = '2.3疫情日报.csv'

# Millisecond timestamp appended as a cache-busting query parameter.
# (The original multiplied by 100, which is not milliseconds.)
number = format(time.time() * 1000, '.0f')

url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%s' % number
# The API nests a JSON string inside the response's 'data' field,
# so the payload has to be decoded twice.
datas = json.loads(requests.get(url=url).json()['data'])

print('更新时间:' + datas['lastUpdateTime'])

# Open the file once and stream every row through a single writer,
# instead of reopening the file for each individual row.
with open(ExcelName, 'a', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # First row records the feed's last-update time.
    writer.writerow(['更新时间:' + datas['lastUpdateTime']])
    for country in datas['areaTree']:
        if country['name'] != '中国':
            continue
        for province in country['children']:
            # Province name on its own row, then one row per city.
            print(province['name'])
            writer.writerow([province['name']])
            for city in province['children']:
                total = city['total']
                row = [city['name'],
                       '确诊:' + str(total['confirm']),
                       '死亡:' + str(total['dead']),
                       '治愈:' + str(total['heal'])]
                print(*row)
                writer.writerow(row)
所得数据为
(3)数据分析与可视化
# --- Fetch and interactively explore the Tencent real-time epidemic feed ---
import time
import json
import requests

url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)

# The 'data' field is itself a JSON string, hence the second parse.
data = json.loads(requests.get(url=url).json()['data'])

# The payload is large - show a single key/value pair first.
print(data['lastUpdateTime'])

type(data), len(data)  # evaluated for inspection; result discarded outside a REPL

print(data.keys())

# Province-level records live under the first ('areaTree'[0], China) node.
num = data['areaTree'][0]['children']
# print(num)          # far too much output - print one entry instead
print(num[15])

hunan = num[15]  # NOTE(review): index 15 is assumed to be 湖南 - confirm

hunan.keys()
hunan['total']

hunan['children'][0]['total']  # totals for the province's first city
# Aggregate the confirmed count for every province in `num`.
total_data = {}
for item in num:
    if item['name'] not in total_data:
        total_data[item['name']] = 0
    for city_data in item['children']:
        total_data[item['name']] += int(city_data['total']['confirm'])
print(total_data)

# Per-city confirmed counts for the selected province. The original code
# referenced `hunan_children_total_data` without ever defining it, which
# raised a NameError; build it here from the province's city records.
hunan_children_total_data = {
    city['name']: int(city['total']['confirm']) for city in hunan['children']
}
print(hunan_children_total_data)

hb_names = hunan_children_total_data.keys()
hb_numbers = hunan_children_total_data.values()

import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['font.sans-serif'] = ['simhei']  # render Chinese axis labels

# Bar chart: confirmed cases per city within the province.
plt.figure(figsize=[12, 8])
plt.bar(hb_names, hb_numbers)
plt.xlabel("地区", size=12)
plt.ylabel("人数", fontproperties='SimHei', rotation=90, size=12)
plt.title("湖南省不同地区疫情确诊数对比图", size=16)
plt.xticks(list(hb_names), rotation=90, size=12)
plt.show()

names = total_data.keys()
print(names)

numbers = total_data.values()
print(numbers)

# Bar chart: confirmed cases per province, nationwide.
# (The duplicate matplotlib/numpy import pair was removed.)
plt.figure(figsize=[12, 8])
plt.bar(names, numbers)
plt.xlabel("地区", size=12)
plt.ylabel("人数", fontproperties='SimHei', rotation=90, size=12)
plt.title("中国不同省份疫情确诊数对比图", size=16)
plt.xticks(list(names), size=12)
plt.show()
import time, json, requests

# Fetch the Tencent real-time epidemic JSON feed ('data' is a nested JSON string).
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)
data = json.loads(requests.get(url=url).json()['data'])
print(data)
print(data.keys())

# Province-level records (34 entries: 湖北 广东 河南 浙江 湖南 安徽 ...).
num = data['areaTree'][0]['children']
print(len(num))
for item in num:
    print(item['name'], end=" ")  # one line, space-separated
print("\n")

# Show the city records of one province.
# NOTE(review): index 23 is assumed to be 湖北 - the feed's ordering is not
# guaranteed; looking the province up by name would be safer.
hubei = num[23]['children']
for item in hubei:
    print(item)
print("\n")

def _sum_by_province(provinces, group, field):
    """Sum each province's city-level counter `field` under `group`
    ('total' or 'today') into a {province_name: count} dict."""
    result = {}
    for item in provinces:
        result.setdefault(item['name'], 0)
        for city_data in item['children']:
            result[item['name']] += int(city_data[group][field])
    return result

# One helper replaces five copy-pasted aggregation loops.
total_data = _sum_by_province(num, 'total', 'confirm')      # confirmed
print(total_data)

total_suspect_data = _sum_by_province(num, 'total', 'suspect')  # suspected
print(total_suspect_data)

total_dead_data = _sum_by_province(num, 'total', 'dead')    # deaths
print(total_dead_data)

total_heal_data = _sum_by_province(num, 'total', 'heal')    # recoveries
print(total_heal_data)

total_new_data = _sum_by_province(num, 'today', 'confirm')  # newly confirmed today
print(total_new_data)

# ------------------------------------------------------------------------------
# Step 2: persist the per-province figures to a CSV file
# ------------------------------------------------------------------------------
names = list(total_data.keys())              # province names
num1 = list(total_data.values())             # confirmed
num2 = list(total_suspect_data.values())     # suspected (all zero in the feed)
num3 = list(total_dead_data.values())        # deaths
num4 = list(total_heal_data.values())        # recoveries
num5 = list(total_new_data.values())         # newly confirmed
for values in (names, num1, num2, num3, num4, num5):
    print(values)

# File named after today's date, e.g. 2020-12-27-all.csv.
n = time.strftime("%Y-%m-%d") + "-all.csv"
# `with` guarantees the handle is closed even if a write raises.
with open(n, 'w', encoding='utf-8') as fw:
    fw.write('province,confirm,dead,heal,new_confirm\n')
    for i in range(len(names)):
        fw.write(names[i] + ',' + str(num1[i]) + ',' + str(num3[i]) + ',' + str(num4[i]) + ',' + str(num5[i]) + '\n')
    print("Over write file!")
import time  # needed for strftime below; missing from the original cell
import pandas as pd

# Load the CSV produced by the crawler step (named after today's date).
n = time.strftime("%Y-%m-%d") + "-all.csv"
data = pd.read_csv(n)
df_world = pd.read_csv(n)  # NOTE(review): duplicate read of the same file
# Summary statistics (count/mean/std/min/quartiles/max) per numeric column.
df_world.describe()
import time  # needed for strftime below; missing from the original cell
import matplotlib.pyplot as plt
import pandas as pd

# Canvas for the nationwide pie chart.
plt.figure(figsize=(15, 10))
plt.rcParams['font.family'] = ['SimHei']  # font that can render Chinese labels

# Load today's CSV produced by the crawler step.
n = time.strftime("%Y-%m-%d") + "-all.csv"
data = pd.read_csv(n)
df_world = pd.read_csv(n)  # NOTE(review): the same file is read 3 times - one read would do
df_citi = pd.read_csv(n)
labels = df_citi['province'].values
data = df_citi['confirm'].values  # rebinds `data` to the confirmed-count column
plt.pie(data, labels=labels, autopct='%1.1f%%', radius=1)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('全国各省新冠疫情比例')
plt.show()
总代码:
# ======================= 总代码: full program, cleaned up =======================
# Fixes applied relative to the original concatenation of cells:
#   * timestamp used time.time()*100 (not milliseconds) -> *1000
#   * the daily-report CSV was reopened for every single row -> opened once
#   * `hunan_children_total_data` was used without being defined (NameError)
#   * raw open/close replaced with `with` (exception-safe)
#   * a verbatim duplicated fetch/aggregate/CSV section was removed
import time, json, requests
import csv

# ---- Part 1: crawl city-level figures into a running CSV log ----
ExcelName = '2.3疫情日报.csv'

# Millisecond timestamp used as a cache-busting query parameter.
number = format(time.time() * 1000, '.0f')

url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%s' % number
# The API nests a JSON string inside the 'data' field, so decode twice.
datas = json.loads(requests.get(url=url).json()['data'])

print('更新时间:' + datas['lastUpdateTime'])
with open(ExcelName, 'a', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['更新时间:' + datas['lastUpdateTime']])
    for country in datas['areaTree']:
        if country['name'] != '中国':
            continue
        for province in country['children']:
            print(province['name'])
            writer.writerow([province['name']])
            for city in province['children']:
                total = city['total']
                row = [city['name'],
                       '确诊:' + str(total['confirm']),
                       '死亡:' + str(total['dead']),
                       '治愈:' + str(total['heal'])]
                print(*row)
                writer.writerow(row)

# ---- Part 2: exploration and bar charts ----
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)
data = json.loads(requests.get(url=url).json()['data'])

print(data['lastUpdateTime'])
print(data.keys())

# Province-level records live under the first (China) node of 'areaTree'.
num = data['areaTree'][0]['children']
print(num[15])
hunan = num[15]  # NOTE(review): index 15 assumed to be 湖南 - confirm

# Confirmed totals per province.
total_data = {}
for item in num:
    total_data.setdefault(item['name'], 0)
    for city_data in item['children']:
        total_data[item['name']] += int(city_data['total']['confirm'])
print(total_data)

# Per-city confirmed counts for the selected province (previously this
# name was referenced but never defined, raising a NameError).
hunan_children_total_data = {
    city['name']: int(city['total']['confirm']) for city in hunan['children']
}
print(hunan_children_total_data)

hb_names = hunan_children_total_data.keys()
hb_numbers = hunan_children_total_data.values()

import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['simhei']  # render Chinese labels

plt.figure(figsize=[12, 8])
plt.bar(hb_names, hb_numbers)
plt.xlabel("地区", size=12)
plt.ylabel("人数", fontproperties='SimHei', rotation=90, size=12)
plt.title("湖南省不同地区疫情确诊数对比图", size=16)
plt.xticks(list(hb_names), rotation=90, size=12)
plt.show()

names = total_data.keys()
print(names)
numbers = total_data.values()
print(numbers)

plt.figure(figsize=[12, 8])
plt.bar(names, numbers)
plt.xlabel("地区", size=12)
plt.ylabel("人数", fontproperties='SimHei', rotation=90, size=12)
plt.title("中国不同省份疫情确诊数对比图", size=16)
plt.xticks(list(names), size=12)
plt.show()

# ---- Part 3: per-province aggregates and CSV export ----
url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=&_=%d' % int(time.time() * 1000)
data = json.loads(requests.get(url=url).json()['data'])
print(data.keys())

num = data['areaTree'][0]['children']
print(len(num))
for item in num:
    print(item['name'], end=" ")
print("\n")

# NOTE(review): index 23 assumed to be 湖北 - ordering is not guaranteed;
# looking the province up by name would be safer.
hubei = num[23]['children']
for item in hubei:
    print(item)
print("\n")

def _sum_by_province(provinces, group, field):
    """Sum each province's city-level counter `field` under `group`
    ('total' or 'today') into a {province_name: count} dict."""
    result = {}
    for item in provinces:
        result.setdefault(item['name'], 0)
        for city_data in item['children']:
            result[item['name']] += int(city_data[group][field])
    return result

total_data = _sum_by_province(num, 'total', 'confirm')      # confirmed
print(total_data)
total_suspect_data = _sum_by_province(num, 'total', 'suspect')  # suspected (all zero)
print(total_suspect_data)
total_dead_data = _sum_by_province(num, 'total', 'dead')    # deaths
print(total_dead_data)
total_heal_data = _sum_by_province(num, 'total', 'heal')    # recoveries
print(total_heal_data)
total_new_data = _sum_by_province(num, 'today', 'confirm')  # newly confirmed
print(total_new_data)

# Store the aggregates in a date-named CSV (e.g. 2020-12-27-all.csv).
names = list(total_data.keys())
num1 = list(total_data.values())
num3 = list(total_dead_data.values())
num4 = list(total_heal_data.values())
num5 = list(total_new_data.values())

n = time.strftime("%Y-%m-%d") + "-all.csv"
with open(n, 'w', encoding='utf-8') as fw:
    fw.write('province,confirm,dead,heal,new_confirm\n')
    for i in range(len(names)):
        fw.write(names[i] + ',' + str(num1[i]) + ',' + str(num3[i]) + ',' + str(num4[i]) + ',' + str(num5[i]) + '\n')
print("Over write file!")

# ---- Part 4: summary statistics and pie chart ----
import pandas as pd

n = time.strftime("%Y-%m-%d") + "-all.csv"
df_world = pd.read_csv(n)
df_world.describe()  # count/mean/std/min/quartiles/max per numeric column

plt.figure(figsize=(15, 10))
plt.rcParams['font.family'] = ['SimHei']
df_citi = pd.read_csv(n)
labels = df_citi['province'].values
values = df_citi['confirm'].values
plt.pie(values, labels=labels, autopct='%1.1f%%', radius=1)
plt.axis('equal')  # draw the pie as a circle
plt.title('全国各省新冠疫情比例')
plt.show()
5、总结
我从这次通过网络爬虫直观地观察到现在中国疫情的严重程度。
从本次学习中我有了很大收获,同时也发现了自己的很多问题,比如对json格式的掌握还不够熟练,需要学习更多的python知识,为以后的就业打好基础。