爬取天气状况信息

使用自动化爬取工具 selenium 进行爬虫练习，从 http://pm25.in/ 上爬取所需数据

#!/usr/bin/env python
# encoding: utf-8
from selenium import webdriver
from lxml import etree
import time
import csv

# NOTE(review): these module-level lists are never read or appended to —
# parse_detail_page assigns plain local strings with the same names, so
# the lists below stay empty for the whole run. Kept only to preserve the
# module's public surface; candidates for deletion.
aqi = []
pm2 = []
pm10 = []
co = []
no2 = []
o3 = []
o38 = []
so2 = []

class pm25pider():
    """Crawl per-city air-quality readings from http://pm25.in/.

    Workflow: open the index page, collect every city link, open each
    link in a second browser tab, parse the eight readings (AQI, PM2.5,
    PM10, CO, NO2, O3/1h, O3/8h, SO2) plus the city name, and rewrite
    ``china_city_aqi.csv`` with everything collected so far.
    """

    # Path to the local chromedriver binary — machine-specific, adjust as needed.
    driver_path = 'F:/学习/大三上/爬虫/动态/chromedriver.exe'
    # Pages parsed so far; `self.count += 1` creates a per-instance counter
    # shadowing this class attribute on first increment.
    count = 0

    def __init__(self):
        # NOTE(review): passing the executable path positionally is deprecated
        # in Selenium 4 — prefer webdriver.Chrome(service=Service(path)).
        self.driver = webdriver.Chrome(self.driver_path)
        self.url = "http://pm25.in/"
        self.positions = []  # accumulated per-city result dicts

    def run(self):
        """Open the index page and crawl every city detail page."""
        self.driver.get(self.url)
        source = self.driver.page_source
        self.parse_list_page(source=source)
        time.sleep(2)

    def parse_list_page(self, source):
        """Extract every city link from the index page HTML and visit each."""
        html = etree.HTML(source)
        links = html.xpath("//div[@class='all']//a/@href")
        for href in links:
            # hrefs are site-relative (e.g. /beijing) — prepend the host.
            link = 'http://pm25.in' + href
            print(link)
            self.request_detail_page(link)
            time.sleep(1)  # throttle: one detail request per second

    def request_detail_page(self, url):
        """Open ``url`` in a new tab, parse it, then return to the list tab."""
        self.driver.execute_script("window.open('%s')" % url)
        # Exactly two tabs exist here: handle 0 is the list page,
        # handle 1 is the detail page just opened.
        self.driver.switch_to.window(self.driver.window_handles[1])
        source = self.driver.page_source
        self.parse_detail_page(source=source)
        # Close the detail tab and switch back to the list page.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Parse one city detail page, record its readings, and persist.

        Expects the page to expose the city name under
        ``div.city_name > h2`` and at least eight whitespace-separated
        numbers under ``div.value`` elements, in the site's fixed order.
        """
        html = etree.HTML(source)
        readings = html.xpath("//div[@class='value']/text()")
        names = html.xpath("//div[@class='city_name']/h2/text()")
        values = ''.join(readings).split()
        # Guard against layout changes / partial pages: skip instead of
        # crashing the whole crawl with an IndexError.
        if not names or len(values) < 8:
            print('skipping page: unexpected layout')
            return
        position = {
            'name': names[0],
            'aqi': values[0],
            'pm2': values[1],
            'pm10': values[2],
            'co': values[3],
            'no2': values[4],
            'o30': values[5],
            'o38': values[6],
            'so2': values[7],
        }
        self.positions.append(position)
        print(position)
        print('=' * 40)
        self.count += 1
        print(self.count)
        # Rewrite the whole CSV after every page so partial progress
        # survives a crash mid-crawl.
        self.write_csv(positions=self.positions)

    def write_csv(self, positions):
        """Overwrite china_city_aqi.csv with every row in ``positions``."""
        headers = ['name', 'aqi', 'pm2', 'pm10', 'co', 'no2', 'o30', 'o38', 'so2']
        with open('china_city_aqi.csv', 'w', encoding='utf-8', newline='') as fp:
            writer = csv.DictWriter(fp, headers)
            writer.writeheader()
            writer.writerows(positions)
if __name__ == '__main__':
    # Script entry point: build the crawler and start scraping.
    crawler = pm25pider()
    crawler.run()

结果:

 

 



posted @ 2020-10-21 20:38  筱艺  阅读(115)  评论(0)    收藏  举报