scrapy pipeline

The four pipeline methods

@classmethod
def from_crawler(cls, crawler):
    """
    Called once at startup to create the pipeline instance
    :param crawler:
    :return:
    """
    pass


def open_spider(self, spider):
    """
    Called when the spider starts
    :param spider:
    :return:
    """
    pass


def process_item(self, item, spider):
    """
    Called for every item that needs to be persisted
    :param item:
    :param spider:
    :return:
    """
    return item


def close_spider(self, spider):
    """
    Called when the spider finishes
    :param spider:
    :return:
    """
    pass
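
Before the full example, a quick note on the return semantics of process_item: return the item to hand it to the next pipeline, or raise DropItem to discard it. The sketch below is only an illustration of that contract; the DedupPipeline name and the url_md5-based dedup logic are assumptions, not part of the original project.

from scrapy.exceptions import DropItem


class DedupPipeline(object):
    """Hypothetical pipeline that drops items whose url_md5 was already seen."""

    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        if item['url_md5'] in self.seen:
            # Raising DropItem stops this item from reaching later pipelines
            raise DropItem('duplicate item: %s' % item['url_md5'])
        self.seen.add(item['url_md5'])
        # Returning the item passes it on to the next pipeline in ITEM_PIPELINES
        return item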

Example

import pymysql
from scrapy.exceptions import DropItem


class ChoutiPipeline(object):
    def __init__(self, db_conf):
        self.db_conf = db_conf
        self.conn = None
        self.cursor = None

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called once at startup to create the pipeline instance
        :param crawler:
        :return:
        """
        # Read the DATABASE dict from settings.py
        db_conf = crawler.settings.get('DATABASE')
        return cls(db_conf)

    def open_spider(self, spider):
        """
        Called when the spider starts
        :param spider:
        :return:
        """
        print('Spider opened ...')
        self.conn = pymysql.connect(
            host=self.db_conf['host'],
            port=self.db_conf['port'],
            user=self.db_conf['user'],
            passwd=self.db_conf['password'],
            db=self.db_conf['db'],
            charset=self.db_conf['charset']
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """
        Called for every item that needs to be persisted
        :param item:
        :param spider:
        :return:
        """
        sql = 'INSERT INTO articles(title, title_url, summary, create_time, url_md5)' \
              ' VALUES (%s, %s, %s, %s, %s)'
        params = (item['title'], item['title_url'], item['summary'],
                  item['create_time'], item['url_md5'])

        try:
            # Let the driver escape the values instead of interpolating them by hand
            self.cursor.execute(sql, params)
            self.conn.commit()
        except Exception as e:
            # If the row cannot be saved, discard the item so later pipelines skip it
            raise DropItem('insert failed: %s' % e)
        return item

    def close_spider(self, spider):
        """
        Called when the spider finishes
        :param spider:
        :return:
        """
        self.cursor.close()
        self.conn.close()
        print('Spider closed ...')
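
The item class itself is not shown in this post. Assuming it simply declares the fields referenced in process_item (the class name ChoutiItem is a guess), it would look roughly like this:

import scrapy


class ChoutiItem(scrapy.Item):
    # Field names inferred from the INSERT statement above
    title = scrapy.Field()
    title_url = scrapy.Field()
    summary = scrapy.Field()
    create_time = scrapy.Field()
    url_md5 = scrapy.Field()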

Registering the pipeline in the settings file

Global configuration:

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'day1.pipelines.ChoutiPipeline': 300,
}
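
Since from_crawler reads crawler.settings.get('DATABASE'), settings.py also needs a DATABASE dict with the keys used in open_spider. The values below are placeholders, not the original project's configuration:

# Connection settings consumed by ChoutiPipeline.from_crawler()
DATABASE = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root',
    'password': '',
    'db': 'chouti',
    'charset': 'utf8',
}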

You can also control which pipelines a particular spider runs, via custom_settings:

class ChoutiSpider(scrapy.Spider):
    name = 'Chouti'
    allowed_domains = ['dig.chouti.com']
    start_urls = ['https://dig.chouti.com/']

    custom_settings = {
        'ITEM_PIPELINES': {'day1.pipelines.ChoutiPipeline': 1}
    }
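
The integer is the pipeline's order (0-1000; lower numbers run earlier). Conversely, a spider can opt out of a globally registered pipeline by mapping it to None in its own custom_settings; the spider below is a made-up example of that:

class OtherSpider(scrapy.Spider):
    name = 'Other'

    custom_settings = {
        # Disable the globally enabled pipeline for this spider only
        'ITEM_PIPELINES': {'day1.pipelines.ChoutiPipeline': None}
    }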

 
