Python爬虫爬取疫情数据

Python爬虫爬取疫情数据

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/8/12 12:06 上午
# @Author  : Helius
# @File    : 04-corona_virus.py


import json
import os
import re

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


class CoronaVirusSpider(object):
    """Scrape epidemic statistics from the page at ``home_url``.

    Each ``crawl_*`` method fetches or loads one data set and writes it as
    JSON under the ``data/`` directory. The two "history" crawls read the
    file written by the matching "last day" crawl, so the latter must have
    been run at least once beforehand.
    """

    def __init__(self):
        self.home_url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia'

    def get_content_from_url(self, url, timeout=10):
        """
        Fetch ``url`` and return the response body decoded as UTF-8.

        :param url: URL to request
        :param timeout: seconds to wait for the server before giving up;
            without it a stalled connection would block the crawl forever
        :return: response body as ``str``
        :raises requests.HTTPError: if the server answers with a 4xx/5xx status
        """
        response = requests.get(url, timeout=timeout)
        # Fail here rather than later while parsing an HTML error page as JSON.
        response.raise_for_status()
        return response.content.decode('utf-8')

    @staticmethod
    def _extract_json_array(text):
        """Return the first ``[...]`` span found in ``text``, parsed as JSON.

        :raises ValueError: if ``text`` is empty/None or contains no array
        """
        # Greedy match: from the first '[' to the last ']' on the same line.
        match = re.search(r'\[.+\]', text or '')
        if match is None:
            raise ValueError('no JSON array found in script content')
        return json.loads(match.group())

    def parse_home_page(self, home_page, tag_id):
        """
        Parse the home page and return the data embedded in one of its tags.

        :param home_page: HTML text of the home page
        :param tag_id: ``id`` attribute of the tag whose text holds the JSON array
        :return: the decoded JSON array
        :raises ValueError: if the tag is missing or holds no JSON array
        """
        soup = BeautifulSoup(home_page, 'lxml')
        script = soup.find(id=tag_id)
        if script is None:
            raise ValueError('no element with id %r in the page' % (tag_id,))
        return self._extract_json_array(script.string)

    def save(self, data, path):
        """
        Serialize ``data`` as JSON to ``path``.

        The parent directory is created when missing, and the file is written
        as UTF-8 because ``ensure_ascii=False`` emits non-ASCII text verbatim,
        which the platform default encoding may not be able to represent.

        :param data: JSON-serializable object
        :param path: destination file path
        """
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(path, 'w', encoding='utf-8') as fp:
            json.dump(data, fp, ensure_ascii=False)

    @staticmethod
    def _load(path):
        """Read a UTF-8 JSON file and return the decoded object."""
        with open(path, encoding='utf-8') as fp:
            return json.load(fp)

    def crawl_last_day_corona_virus(self):
        """
        Collect the most recent day's per-country data from the home page
        and save it to ``data/last_day_corona_virus.json``.

        :return: None
        """
        home_page = self.get_content_from_url(self.home_url)
        last_day_corona_virus = self.parse_home_page(home_page, 'getListByCountryTypeService2true')
        self.save(last_day_corona_virus, 'data/last_day_corona_virus.json')

    def crawl_corona_virus(self):
        """
        Collect the per-country history (since January 23) and save it to
        ``data/corona_virus.json``.

        Reads ``data/last_day_corona_virus.json``, so
        ``crawl_last_day_corona_virus`` must have produced that file first.

        :return: None
        """
        last_day_corona_virus = self._load('data/last_day_corona_virus.json')
        corona_virus = self.corona_virus_data(last_day_corona_virus, desc='采集1月23日以来各国疫情信息')
        self.save(corona_virus, 'data/corona_virus.json')

    def crawl_last_day_corona_virus_of_china(self):
        """
        Collect the most recent day's per-province data for China from the
        home page and save it to ``data/last_day_corona_virus_of_china.json``.

        :return: None
        """
        home_page = self.get_content_from_url(self.home_url)
        last_day_corona_virus_of_china = self.parse_home_page(home_page, tag_id='getAreaStat')
        self.save(last_day_corona_virus_of_china, 'data/last_day_corona_virus_of_china.json')

    def crawl_corona_virus_of_china(self):
        """
        Collect the per-province history for China (since January 22) and
        save it to ``data/corona_virus_of_china.json``.

        Reads ``data/last_day_corona_virus_of_china.json``, so
        ``crawl_last_day_corona_virus_of_china`` must have produced it first.

        :return: None
        """
        last_day_corona_virus = self._load('data/last_day_corona_virus_of_china.json')
        corona_virus = self.corona_virus_data(last_day_corona_virus, '采集1月22日以来我国各省疫情信息')
        self.save(corona_virus, 'data/corona_virus_of_china.json')

    def corona_virus_data(self, last_day_corona_virus, desc):
        """
        Download the history referenced by each entry and flatten it.

        For every entry, the URL under ``'statisticsData'`` is fetched and
        the list under the ``'data'`` key of the returned JSON is taken. Each
        daily record is tagged with the entry's ``provinceName`` and, when the
        entry has a non-empty one, its ``countryShortCode``.

        :param last_day_corona_virus: iterable of entries (dicts) as saved by
            one of the "last day" crawls
        :param desc: label shown on the progress bar
        :return: one flat list containing every daily record
        """
        corona_virus = []
        for country in tqdm(last_day_corona_virus, desc=desc):
            statistics_data_url = country['statisticsData']
            statistics_data_json_str = self.get_content_from_url(statistics_data_url)
            statistics_data = json.loads(statistics_data_json_str)['data']
            short_code = country.get('countryShortCode')
            for one_day in statistics_data:
                one_day['provinceName'] = country['provinceName']
                if short_code:
                    one_day['countryShortCode'] = short_code
            corona_virus.extend(statistics_data)
        return corona_virus

    def run(self):
        """
        Run the currently enabled crawl step.

        The steps are toggled by (un)commenting. The enabled step below needs
        ``data/last_day_corona_virus_of_china.json``, which is written by
        ``crawl_last_day_corona_virus_of_china`` -- enable that step first on
        a fresh checkout.
        """
        # self.crawl_last_day_corona_virus()
        # self.crawl_corona_virus()
        # self.crawl_last_day_corona_virus_of_china()
        self.crawl_corona_virus_of_china()

if __name__ == '__main__':
    # Script entry point: build the spider and run its enabled crawl step.
    CoronaVirusSpider().run()


🔥

小结:整体比较简单,就当复习下啦

posted @ 2020-08-13 00:00  HeliusKing  阅读(854)  评论(0)  编辑  收藏  举报