05 Pipeline

A pipeline receives the items scraped by the spider, filters, deduplicates, and cleans them, and then persists the items to storage.
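
Before Scrapy will call any of the pipelines below, they must be enabled in settings.py. A minimal sketch, assuming a project named myproject (the module path is an assumption): the integer is the run order (0-1000), lower numbers run first, so filtering pipelines should be registered ahead of storage ones.

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.DuplicatePipeline': 100,   # filter first
    'myproject.pipelines.CsvFeedPipeline': 300,     # then persist survivors
}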


Deduplication pipeline:
  If two pictures share the same name, we treat them as duplicates and drop the item.

from scrapy.exceptions import DropItem

class DuplicatePipeline(object):
    def __init__(self):
        # We use a set of seen names as the data fingerprint store
        self.fingerprints = set()

    def process_item(self, item, spider):
        if item['picname'] in self.fingerprints:
            raise DropItem('duplicate picture: %s' % item['picname'])
        self.fingerprints.add(item['picname'])
        return item
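
Note that the fingerprint set lives only in this pipeline instance's memory: it is rebuilt on every crawl and grows with the number of unique names, so for large or resumable crawls a persistent store (a database table or a Redis set, for example) would be the usual substitute.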


# Store the scraped data in a CSV file

import csv

class CsvFeedPipeline(object):
    def __init__(self):
        self.fp = open('data.csv', 'a', encoding='utf8', newline='')
        fieldnames = ['classname', 'picname', 'picurl']
        self.writer = csv.DictWriter(self.fp, fieldnames=fieldnames)
        self.writer.writeheader()
    
    def process_item(self, item, spider):
        self.writer.writerow(dict(item))
        return item
       
    def close_spider(self, spider):
        self.fp.close()
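
For reference, a minimal sketch of the Item these picture pipelines assume; the field names are taken from the code above, but the class name PicItem is hypothetical:

import scrapy

class PicItem(scrapy.Item):
    classname = scrapy.Field()
    picname = scrapy.Field()
    picurl = scrapy.Field()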

# Filtering: drop news items related to games

class BlockGamePipeline(object):
    def process_item(self, item, spider):
        filter_key = '游戏'  # block any title mentioning games
        # Compare str to str; encoding the title to bytes would raise a
        # TypeError when tested against a str keyword in Python 3
        if filter_key in item['title']:
            raise DropItem('game-related news: %s' % item['title'])
        return item

# Transforming pipeline: compute derived fields on the item

class ProductPricePipeline(object):
    def process_item(self, item, spider):
        # Derive the total price from fields scraped as strings
        item['total'] = float(item['price']) * float(item['count'])
        return item
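
A quick check of the arithmetic with hypothetical values; price and count typically arrive as scraped strings, hence the float() casts:

item = ProductPricePipeline().process_item({'price': '2.5', 'count': '4'}, spider=None)
print(item['total'])  # 10.0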

# Storage pipeline

import json

class JsonFeedPipeline:
    def __init__(self):
        self.json_file = open('feed.json', 'wt', encoding='utf8')
        self.json_file.write('[\n')
        self.first_item = True

    def process_item(self, item, spider):
        # Write a comma before every item except the first, so the file
        # ends without a trailing comma and stays valid JSON
        if not self.first_item:
            self.json_file.write(',\n')
        self.first_item = False
        self.json_file.write(json.dumps(dict(item)))
        return item  # pass the item on to any later pipeline

    def close_spider(self, spider):
        self.json_file.write('\n]')
        self.json_file.close()
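
For simple cases, Scrapy's built-in feed export gives the same result without a custom pipeline: running scrapy crawl myspider -o feed.json (myspider being whatever the spider is named) serializes every yielded item into a JSON array file.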
