Learning Progress 16
Today I learned how to use Scrapy and wrote a crawler for the Capital Window (首都之窗) site, the Beijing municipal government portal. I configured the crawler to make one request per second, so by my estimate the full crawl takes about 3000 * 7 / 60 / 60 ≈ 5.8 hours.
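The once-per-second pacing lives in the project's settings.py rather than in the spider itself. A minimal sketch of the throttling settings I have in mind (the exact values here are my assumption, not copied from the actual project file):

# settings.py -- sketch of the request pacing assumed above
DOWNLOAD_DELAY = 1        # wait roughly one second between requests
CONCURRENT_REQUESTS = 1   # keep requests strictly sequential, one page at a time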
Spider code:
# -*- coding: utf-8 -*-
import re

import scrapy
from lxml import etree

from a1.items import A1Item


class SdzcSpider(scrapy.Spider):
    name = 'sdzc'
    allowed_domains = ['beijing.gov.cn']
    start_urls = ['http://www.beijing.gov.cn/hudong/hdjl/com.web.search.replyMailList.flow?PageCond']
    start = "http://www.beijing.gov.cn/hudong/hdjl/com.web.search.replyMailList.flow?PageCond"

    # index of the next list page; the site paginates with begin = num * 6
    num = 61

    # detail-page URL prefixes for 咨询 (consult) and 建议 (suggest) letters
    basic_consult_url = "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId="
    basic_suggest_url = "http://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId="

    def parse(self, response):
        # The list page embeds letterdetail('<type>','<id>') calls; pull out the
        # letter type and id, then build the real detail-page URL to crawl.
        text = response.text
        methods = re.findall(r"letterdetail\('([\S]+?)',", text, re.DOTALL)
        url_ids = re.findall(r"letterdetail\('[\S]+','(\S+?)'\)", text, re.DOTALL)

        for url_id, method in zip(url_ids, methods):
            url = self.merge_url(url_id, method)
            if url:
                yield scrapy.Request(url, callback=self.pq_url)

        # Keep following the next list page until page 3000
        if self.num < 3000:
            next_url = self.start + "/begin=%s" % (self.num * 6)
            self.num += 1
            yield scrapy.Request(next_url, callback=self.parse)

    # Combine the detail-page prefix with the letter id, based on its type
    def merge_url(self, url_id, method):
        if method == "咨询":
            return self.basic_consult_url + url_id
        elif method in ("建议", "问答"):
            return self.basic_suggest_url + url_id
        return ""

    # Second-level parse of a detail page: extract the fields defined in A1Item
    def pq_url(self, response):
        text = response.text
        this_html = etree.HTML(text)

        # The asking side: title, submit time, and question body
        title = re.findall(
            r'<div class="col-xs-10 col-sm-10 col-md-10 o-font4 my-2"><strong>(.+?)</strong></div>',
            text, re.DOTALL)
        time = re.findall(
            r'<div class="col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted ">[\S]+:([\S]+?)</div>',
            text, re.DOTALL)
        div1 = this_html.xpath("//div[@class='col-xs-12 col-md-12 column p-2 text-muted mx-2']//text()")
        content = "".join(div1).strip()

        # The answering side: department, reply time, and reply body
        department = re.findall(
            r'<div class="col-xs-9 col-sm-7 col-md-5 o-font4 my-2">.+<strong>.+</strong>([\S]+).+</div>',
            text, re.DOTALL)
        reply_time = re.findall(
            r'<div class="col-xs-12 col-sm-3 col-md-3 my-2.{0,1}">[\S]+:(.+?)</div>',
            text, re.DOTALL)
        div2 = this_html.xpath("//div[@class='col-xs-12 col-md-12 column p-4 text-muted my-3']//text()")
        reply_content = "".join(div2).strip()

        item = A1Item(title=title, time=time, content=content,
                      department=department, reply_time=reply_time,
                      reply_content=reply_content)
        yield item
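The spider fills an A1Item with six fields; the items.py that defines them is not shown above, but from the keyword arguments the spider passes it would look roughly like this (a sketch, assuming the field names match exactly):

# items.py -- assumed definition, reconstructed from the fields SdzcSpider uses
import scrapy


class A1Item(scrapy.Item):
    title = scrapy.Field()          # letter title
    time = scrapy.Field()           # time the letter was submitted
    content = scrapy.Field()        # question body
    department = scrapy.Field()     # department that replied
    reply_time = scrapy.Field()     # time of the reply
    reply_content = scrapy.Field()  # reply body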
Pipeline code that saves the results to a file:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json


class A1Pipeline(object):
    def __init__(self):
        # One item per line (JSON Lines); UTF-8 keeps the Chinese text readable
        self.fp = open("b1.json", "w", encoding='utf-8')
        print("开始爬")  # "starting to crawl"

    def process_item(self, item, spider):
        item_json = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write(item_json + '\n')
        return item

    def close_spider(self, spider):
        self.fp.close()
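As the template comment at the top says, the pipeline only runs once it is registered in ITEM_PIPELINES. Assuming the project package is named a1 (as in the spider's import) and the pipeline sits in the default pipelines.py, the settings entry would be:

# settings.py -- enable the pipeline (300 is just a conventional priority value)
ITEM_PIPELINES = {
    'a1.pipelines.A1Pipeline': 300,
}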