Python爬取全国历史天气数据

1、通过爬取历史首页,来获取城市地址和历史时间,构建链接;

'''
获取全国的城市名称和链接
'''

import requests
from lxml import etree
import random
import pymongo
from time_list import get_time

# MongoDB handles: database 'tianqi_data', collection holding the per-month URLs.
client = pymongo.MongoClient('localhost', 27017)
tianqi_data = client['tianqi_data']
time_url_table = tianqi_data['time_url_table']

# Pool of desktop User-Agent strings; one is chosen once at import time.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
]

# Request headers shared by all scraping calls in this script.
headers = {'User-Agent': random.choice(USER_AGENTS)}

def get_cityname(url):
    """Scrape the national city index page and collect city names with links.

    Args:
        url: URL of the history index page (http://lishi.tianqi.com/).

    Returns:
        list[dict]: one dict per city with keys 'city_name' and 'city_link'.
        Single-character entries (the A–Z alphabetical index letters on the
        page) are skipped.
    """
    city_name_list = []
    city_response = requests.get(url, headers=headers)
    # Let requests guess the real encoding (page is GBK, not the default).
    city_response.encoding = city_response.apparent_encoding
    # Parse the HTML once instead of re-parsing it for every XPath query.
    html = etree.HTML(city_response.text)
    city_names = html.xpath('//*[@id="tool_site"]/div[2]/ul/li/a/text()')
    city_links = html.xpath('//*[@id="tool_site"]/div[2]/ul/li/a/@href')
    for city_name, city_link in zip(city_names, city_links):
        # One-character anchors are the A–Z index letters, not real cities.
        if len(city_name) > 1:
            city_name_list.append({
                'city_name': str(city_name),
                'city_link': str(city_link),
            })
    # Moved before the return: in the original it sat after `return` and
    # was unreachable, so it never printed.
    print('获取城市名称和链接结束...')
    return city_name_list

url = 'http://lishi.tianqi.com/'
# The month list is identical for every city, and get_time() performs an
# HTTP request — fetch it once instead of once per city.
month_links = get_time()
# Build the per-month history URL for every city and store it in MongoDB.
for link in get_cityname(url):
    city_link = link['city_link']
    for time_link in month_links:
        # e.g. '.../201706.html' -> '201706'
        month = time_link.split('/')[-1].split('.')[0]
        # The month page URL is the city index URL with 'index' swapped
        # for the month stamp.
        time_url = city_link.replace('index', str(month))
        data = {
            'time_url': time_url,
            'city': link['city_name'],
        }
        print(data)
        # insert() is deprecated in PyMongo 3; insert_one() is the
        # supported single-document API.
        time_url_table.insert_one(data)
print('导入数据库存完成')
View Code
import requests
from lxml import etree

'''
通过对比城市的链接和历史时间的链接发现,就是在把城市链接里面的index换成了相对应的时间,
所以只要把index换成了历史月份就可以了
'''

def get_time():
    """Return the month-page hrefs scraped from one city's history index.

    The month links are the same for every city, so any single city page
    serves as the source; 'acheng' is used here.
    """
    index_url = 'http://lishi.tianqi.com/acheng/index.html'
    page = requests.get(index_url)
    tree = etree.HTML(page.text)
    return tree.xpath('//*[@id="tool_site"]/div[2]/ul/li/a/@href')
View Code

2、从数据库中读取数据,爬取每个城市的历史天气数据;

import requests
from lxml import etree
import random
import pymongo

# MongoDB handles: source collection of month URLs and the destination
# collection for the scraped daily weather records.
client = pymongo.MongoClient('localhost', 27017)
tianqi_data = client['tianqi_data']
time_url_table = tianqi_data['time_url_table']
tianqi_data_table = tianqi_data['tianqi_data_table']

# Pool of desktop User-Agent strings; one is chosen once at import time.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
]

# Request headers shared by all scraping calls in this script.
headers = {'User-Agent': random.choice(USER_AGENTS)}

def get_tianqi_data():
    """Crawl daily weather records for every month URL stored in MongoDB.

    Reads each document from ``time_url_table``, scrapes the month page,
    and inserts one document per day into ``tianqi_data_table``.
    """
    for link in time_url_table.find():
        url = link['time_url']
        print(url)
        response = requests.get(url, headers=headers)
        # Parse the page once; the original rebuilt the etree for every
        # one of the six columns, re-parsing the same HTML six times.
        html = etree.HTML(response.text)
        dates = html.xpath('//*[@id="tool_site"]/div[2]/ul/li/a/text()')
        # [1:-1] trims the table-header and trailing non-data rows.
        max_temps = html.xpath('//*[@id="tool_site"]/div[2]/ul/li[2]/text()')[1:-1]
        low_temps = html.xpath('//*[@id="tool_site"]/div[2]/ul/li[3]/text()')[1:-1]
        weathers = html.xpath('//*[@id="tool_site"]/div[2]/ul/li[4]/text()')[1:-1]
        fengxiangs = html.xpath('//*[@id="tool_site"]/div[2]/ul/li[5]/text()')[1:-1]
        fenglis = html.xpath('//*[@id="tool_site"]/div[2]/ul/li[6]/text()')[1:-1]
        for date, max_temp, low_temp, weather, fengxiang, fengli in zip(
                dates, max_temps, low_temps, weathers, fengxiangs, fenglis):
            data = {
                '日期': date,
                '最高温度': max_temp,
                '最低温度': low_temp,
                '天气': weather,
                '风向': fengxiang,
                '风力': fengli,
            }
            # insert() is deprecated in PyMongo 3; insert_one() is the
            # supported single-document API.
            tianqi_data_table.insert_one(data)
            print(data)
    print('爬取数据成功')
View Code

 

posted @ 2017-06-05 22:44  睚一  阅读(1084)  评论(0编辑  收藏  举报