scrapy_redis组件url去重 (有部分自定义)

使用scrapy_redis组件对url进行去重
    settings.py
        # Dupefilter class
        # DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'   # default class shipped with scrapy_redis
        DUPEFILTER_CLASS = 'xdb.dupefilter.RedisDupeFilter'   # custom subclass of the class above; it changes the DUPEFILTER_KEY value


        ############## scrapy_redis connection ###############
        # Option 1
        REDIS_HOST = 'localhost'                            # host name
        REDIS_PORT = 6379                                   # port
        REDIS_PARAMS = {}                                  # Redis connection parameters; default: {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
        # REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient' # Python class used to connect to Redis; default: redis.StrictRedis
        REDIS_ENCODING = "utf-8"                            # Redis encoding; default: 'utf-8'

        # Option 2
        REDIS_URL = 'redis://user:pass@hostname:9001'       # connection URL (takes precedence over the settings above)

        # Name of the key in Redis:
        DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
        
    自定义redis中的key,变成固定key
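    dupefilters.py   # 自定义文件名(文件名必须与 settings.py 中 DUPEFILTER_CLASS 的模块路径一致:若文件名为 dupefilters.py,则应写成 'xdb.dupefilters.RedisDupeFilter')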
        from scrapy_redis.dupefilter import RFPDupeFilter
        from scrapy_redis.connection import get_redis_from_settings
        from scrapy_redis import defaults
        class RedisDupeFilter(RFPDupeFilter):
            """Redis dupefilter that stores fingerprints under a fixed Redis key.

            The key template contains a ``%(timestamp)s`` placeholder. Instead of
            filling it with ``int(time.time())`` (which yields a different key on
            every run), this subclass fills it with ``FIXED_KEY_ID`` so every run
            resolves to the same key name.
            """

            # Value substituted for the ``%(timestamp)s`` placeholder of the key
            # template. Override in a subclass to use a different fixed key.
            FIXED_KEY_ID = 'xiaodongbei'

            @classmethod
            def from_settings(cls, settings):
                """Build the dupefilter from the given settings.

                Args:
                    settings: Settings object. It is passed to
                        ``get_redis_from_settings`` to obtain the Redis client,
                        and read for ``DUPEFILTER_KEY`` (key template, optional)
                        and ``DUPEFILTER_DEBUG``.

                Returns:
                    An instance of ``cls`` bound to the Redis client and the
                    fixed key.
                """
                server = get_redis_from_settings(settings)
                # Honour a DUPEFILTER_KEY template from settings when present;
                # otherwise fall back to the library default template.
                key_template = settings.get('DUPEFILTER_KEY', defaults.DUPEFILTER_KEY)
                # Fill the placeholder with a constant so the key name is stable.
                key = key_template % {'timestamp': cls.FIXED_KEY_ID}
                debug = settings.getbool('DUPEFILTER_DEBUG')
                return cls(server, key=key, debug=debug)

 

posted @ 2020-06-14 23:22  高汤  阅读(510)  评论(0)  编辑  收藏  举报