The Scrapy crawler framework
http://www.cnblogs.com/linhaifeng/articles/7811861.html



To persist the scraped content, it has to be saved to a database or a cache; the next step is to implement that persistence.
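The pipeline code below reads item['url'], so the project's items.py needs a matching Item class. A minimal sketch, assuming a single url field (the class name PageItem is an assumed placeholder):

import scrapy

class PageItem(scrapy.Item):
    # The url field matches what the pipelines below read via item['url']
    url = scrapy.Field()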




Writing data in pipelines
return item indicates that the current pipeline has finished processing and hands the item to the subsequent pipelines to continue.
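Which pipelines run, and in what order, is configured via the ITEM_PIPELINES setting in settings.py. A minimal sketch, assuming the project module is named myproject; lower numbers run earlier, so an item returned by FilePipeline is handed on to DBPipeline, and so on:

# settings.py -- a sketch; 'myproject' is an assumed project name.
# Lower values run first; each pipeline's process_item must return
# the item for the next pipeline in the chain to receive it.
ITEM_PIPELINES = {
    'myproject.pipelines.FilePipeline': 300,
    'myproject.pipelines.DBPipeline': 400,
    'myproject.pipelines.CachePipeline': 500,
}

# Read by FilePipeline.from_crawler via crawler.settings.get('FILE_PATH')
FILE_PATH = 'items.txt'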





# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem


class DBPipeline(object):
    """Persist to a database"""

    def process_item(self, item, spider):
        return item


class CachePipeline(object):
    """Put into a cache"""

    def process_item(self, item, spider):
        return item


class FilePipeline(object):
    """Write to a file"""

    def __init__(self, path):
        self.file_path = path

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called at initialization time to create the pipeline object.
        :param crawler:
        :return:
        """
        file_path = crawler.settings.get('FILE_PATH')
        return cls(file_path)

    def process_item(self, item, spider):
        # Do the actual persistence work
        self.f.write(item['url'] + '\n')
        self.f.flush()
        # Returning the item hands it to the subsequent pipelines
        return item
        # To discard the item so that no later pipeline processes it:
        # raise DropItem()

    def open_spider(self, spider):
        """
        Called when the spider starts.
        :param spider:
        :return:
        """
        self.f = open(self.file_path, 'w+', encoding='utf-8')

    def close_spider(self, spider):
        """
        Called when the spider closes.
        :param spider:
        :return:
        """
        self.f.close()


"""
Roughly how Scrapy creates the pipeline object:

if hasattr(FilePipeline, 'from_crawler'):
    obj = FilePipeline.from_crawler(crawler)
else:
    obj = FilePipeline()
"""
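The DBPipeline above is left as a stub. As a hedged sketch of what the "persist to a database" step might look like, here is a version backed by Python's standard-library sqlite3 module; the SQLitePipeline name, the DB_PATH setting, and the pages table are assumptions for illustration, not part of the original notes:

import sqlite3


class SQLitePipeline(object):
    """A sketch: persist item URLs into a SQLite database."""

    def __init__(self, db_path):
        self.db_path = db_path

    @classmethod
    def from_crawler(cls, crawler):
        # DB_PATH is an assumed custom setting, mirroring FILE_PATH above
        return cls(crawler.settings.get('DB_PATH', 'items.db'))

    def open_spider(self, spider):
        # Open the connection once per crawl, like FilePipeline's file handle
        self.conn = sqlite3.connect(self.db_path)
        self.conn.execute('CREATE TABLE IF NOT EXISTS pages (url TEXT)')

    def process_item(self, item, spider):
        self.conn.execute('INSERT INTO pages (url) VALUES (?)', (item['url'],))
        self.conn.commit()
        # Return the item so later pipelines still see it
        return item

    def close_spider(self, spider):
        self.conn.close()

Opening the connection in open_spider and closing it in close_spider mirrors the file-handle lifecycle used by FilePipeline.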
