Learning Progress 16

Today I learned how to use Scrapy and wrote a crawler for the Capital Window (首都之窗, beijing.gov.cn) letters site. I set the crawler to make one request per second, so by my estimate the full crawl takes 3000 * 7 / 60 / 60 ≈ 5.8 hours, call it six (each page of results costs the list request itself plus six detail pages).

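The one-second pacing isn't in the spider code itself; in Scrapy that kind of throttling goes in the project's settings.py. Roughly, the relevant settings look like this (a minimal sketch, assuming the project is named a1 as the imports below suggest):

# settings.py -- only the parts relevant to this post (sketch)
BOT_NAME = 'a1'

# roughly one request per second; this is where the ~6 hour estimate comes from
DOWNLOAD_DELAY = 1

# register the JSON pipeline shown at the end of this post
ITEM_PIPELINES = {
    'a1.pipelines.A1Pipeline': 300,
}
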
Spider code:

# -*- coding: utf-8 -*-
import scrapy
import re
from lxml import etree


from a1.items import A1Item


class SdzcSpider(scrapy.Spider):
    # list-page counter used for pagination in parse()
    num = 61
    name = 'sdzc'
    allowed_domains = ['beijing.gov.cn']
    start_urls = ['http://www.beijing.gov.cn/hudong/hdjl/com.web.search.replyMailList.flow?PageCond']
    start = "http://www.beijing.gov.cn/hudong/hdjl/com.web.search.replyMailList.flow?PageCond"
    # detail page for consultation (咨询) letters
    basic_consult_url = "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId="
    # detail page for suggestion (建议) letters
    basic_suggest_url = "http://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId="

    def parse(self, response):
        # extract the letter types and detail-page ids from the list page;
        # each entry appears in the source as letterdetail('<type>','<id>')
        text = response.text
        methods = re.findall(r"letterdetail\('([\S]+?)',", text)
        url_ids = re.findall(r"letterdetail\('[\S]+?','(\S+?)'\)", text)
        print("####################")
        for url_id, method in zip(url_ids, methods):
            url = self.merge_url(url_id, method)
            if url:
                print(url)
                yield scrapy.Request(url, callback=self.pq_url)
        print(self.num)
        print("####################")
        # the list page paginates through ?PageCond/begin=<offset>, six letters per page
        if self.num < 3000:
            next_url = self.start + "/begin=%s" % (self.num * 6)
            self.num += 1
            yield scrapy.Request(next_url, callback=self.parse)

    # build the detail-page url from the letter id and its type
    def merge_url(self, url_id, methods):
        if methods == "咨询":
            url = self.basic_consult_url + url_id
        elif methods in ("建议", "问答"):
            url = self.basic_suggest_url + url_id
        else:
            url = ""
        return url

    # follow each detail page and pull out the fields defined in the item
    def pq_url(self, response):
        print(response.url)
        print("pq_url running")
        text = response.text
        this_html = etree.HTML(text)
        # asker side: title, submission time and letter body
        title = re.findall(r'<div class="col-xs-10 col-sm-10 col-md-10 o-font4 my-2"><strong>(.+?)</strong></div>', text, re.DOTALL)
        time = re.findall(r'<div class="col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted ">[\S]+:([\S]+?)</div>', text, re.DOTALL)
        div1 = this_html.xpath("//div[@class='col-xs-12 col-md-12 column p-2 text-muted mx-2']//text()")
        content = "".join(div1).strip()
        # replier side: department, reply time and reply body
        department = re.findall(r'<div class="col-xs-9 col-sm-7 col-md-5 o-font4 my-2">.+<strong>.+</strong>([\S]+).+</div>', text, re.DOTALL)
        reply_time = re.findall(r'<div class="col-xs-12 col-sm-3 col-md-3 my-2.{0,1}">[\S]+:(.+?)</div>', text, re.DOTALL)
        div2 = this_html.xpath("//div[@class='col-xs-12 col-md-12 column p-4 text-muted my-3']//text()")
        reply_content = "".join(div2).strip()
        item = A1Item(title=title, time=time, content=content, department=department,
                      reply_time=reply_time, reply_content=reply_content)
        yield item

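The spider imports A1Item from a1.items, which isn't shown above. Reconstructed from the fields used in pq_url, the item presumably looks roughly like this (a sketch, not pasted from the project):

# items.py (sketch reconstructed from the fields used in pq_url)
import scrapy


class A1Item(scrapy.Item):
    title = scrapy.Field()          # letter title
    time = scrapy.Field()           # time the letter was submitted
    content = scrapy.Field()        # body of the letter
    department = scrapy.Field()     # department that replied
    reply_time = scrapy.Field()     # time of the reply
    reply_content = scrapy.Field()  # body of the reply
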
The pipeline file that stores the data:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json


class A1Pipeline(object):

    def __init__(self):
        self.fp = open("b1.json", "w", encoding='utf-8')
        print("开始爬")

    def process_item(self, item, spider):
        item_json = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write(item_json+'\n')
        return item

    def close_spider(self, spider):
        self.fp.close()

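The whole thing is started with scrapy crawl sdzc. Because the pipeline writes one JSON object per line rather than a single JSON array, the output has to be read back line by line, for example:

# read b1.json back: one JSON object per line, not a single JSON array
import json

with open("b1.json", encoding="utf-8") as fp:
    records = [json.loads(line) for line in fp if line.strip()]

print(len(records))
print(records[0]["title"])
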
I ran into a lot of problems. Most came from taking the page structure for granted instead of inspecting it carefully, and I also started without a plan, which is why the spider ended up dropping an important field: the type of each letter (建议 suggestion, 问答 Q&A, 咨询 consultation). A sketch of one way to keep that field is below.

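The type is actually already available in parse (the methods list), so one fix would be to pass it along with each detail-page request and add a field for it on the item. A rough sketch of the idea, assuming A1Item gains an extra letter_type field (this is not what the code above does yet):

# in parse(): pass the letter type along with the detail-page request
for url_id, method in zip(url_ids, methods):
    url = self.merge_url(url_id, method)
    if url:
        yield scrapy.Request(url, callback=self.pq_url, meta={"letter_type": method})

# in pq_url(): read it back and put it on the item
letter_type = response.meta.get("letter_type")
item = A1Item(title=title, time=time, content=content, department=department,
              reply_time=reply_time, reply_content=reply_content,
              letter_type=letter_type)
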
posted @ 2020-02-19 21:04  Abandoned_Software