爬取天气状况信息
使用自动化爬取工具 Selenium 进行爬虫练习,从 http://pm25.in/ 上爬取所需数据。
#!/usr/bin/env python
# encoding: utf-8
"""Scrape per-city air-quality readings (AQI, PM2.5, ...) from http://pm25.in/.

Drives a Chrome browser with Selenium: the front page lists every city, and
each city link is opened in a second browser tab whose detail page is parsed
with lxml. Results accumulate in memory and are flushed to
``china_city_aqi.csv`` after every city, so partial results survive an
interrupted crawl.
"""
from selenium import webdriver
from lxml import etree
import time
import csv


class pm25pider():
    # Path to the chromedriver binary (adjust for the local machine).
    driver_path = 'F:/学习/大三上/爬虫/动态/chromedriver.exe'
    # Number of city detail pages parsed so far.
    count = 0

    def __init__(self):
        self.driver = webdriver.Chrome(self.driver_path)
        self.url = "http://pm25.in/"
        self.positions = []  # one dict of readings per city

    def run(self):
        """Open the front page, crawl every listed city, then shut down."""
        try:
            self.driver.get(self.url)
            source = self.driver.page_source
            self.parse_list_page(source=source)
            time.sleep(2)
        finally:
            # BUGFIX: the original never quit the driver, leaking the Chrome
            # process whenever the crawl finished or raised.
            self.driver.quit()

    def parse_list_page(self, source):
        """Extract every city link from the front page and visit each one."""
        html = etree.HTML(source)
        links = html.xpath("//div[@class='all']//a/@href")
        for href in links:
            link = 'http://pm25.in' + href
            print(link)
            self.request_detail_page(link)
            time.sleep(1)  # be polite: throttle one request per second

    def request_detail_page(self, url):
        """Open *url* in a second tab, parse it, then return to the list tab."""
        self.driver.execute_script("window.open('%s')" % url)
        # Two windows are kept: handle 0 is the list page, handle 1 the detail page.
        self.driver.switch_to.window(self.driver.window_handles[1])
        source = self.driver.page_source
        self.parse_detail_page(source=source)
        # Close the detail tab and switch focus back to the list page.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail_page(self, source):
        """Parse one city detail page into a dict of readings and record it."""
        html = etree.HTML(source)
        readings = html.xpath("//div[@class='value']/text()")
        names = html.xpath("//div[@class='city_name']/h2/text()")
        values = ''.join(readings).split()
        # ROBUSTNESS: the original indexed values[0..7] and names[0]
        # unconditionally and raised IndexError on any page whose layout
        # differs (missing readings, changed markup). Skip such pages.
        if not names or len(values) < 8:
            print('skipping page: unexpected layout')
            return
        position = {
            'name': names[0],
            'aqi': values[0],
            'pm2': values[1],
            'pm10': values[2],
            'co': values[3],
            'no2': values[4],
            'o30': values[5],
            'o38': values[6],
            'so2': values[7],
        }
        self.positions.append(position)
        print(position)
        print('=' * 40)
        self.count += 1
        print(self.count)
        # Rewrite the full CSV after every city (same behaviour as the
        # original) so an interrupted run still leaves usable output.
        self.write_csv(positions=self.positions)

    def write_csv(self, positions):
        """Write all collected readings to china_city_aqi.csv (overwrites)."""
        headers = ['name', 'aqi', 'pm2', 'pm10', 'co', 'no2', 'o30', 'o38', 'so2']
        with open('china_city_aqi.csv', 'w', encoding='utf-8', newline='') as fp:
            writer = csv.DictWriter(fp, headers)
            writer.writeheader()
            writer.writerows(positions)


if __name__ == '__main__':
    spider = pm25pider()
    spider.run()
结果:


浙公网安备 33010602011771号