URL deduplication with the scrapy_redis component (partially customized)
The scrapy_redis component deduplicates URLs. Configure it in settings.py:

```python
# Dedup class
# DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # the default class shipped with scrapy_redis
DUPEFILTER_CLASS = 'xdb.dupefilters.RedisDupeFilter'  # subclasses the default class above, changing only the DUPEFILTER_KEY value

############## scrapy_redis connection ###############
# Option 1
REDIS_HOST = 'localhost'  # host name
REDIS_PORT = 6379         # port
REDIS_PARAMS = {}         # Redis connection parameters
# default: REDIS_PARAMS = {'socket_timeout': 30, 'socket_connect_timeout': 30,
#                          'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient'  # Python class used for the Redis connection; default: redis.StrictRedis
REDIS_ENCODING = "utf-8"  # Redis encoding; default: 'utf-8'

# Option 2
REDIS_URL = 'redis://user:pass@hostname:9001'  # connection URL (takes precedence over the settings above)
```

By default the key used in Redis is `DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'`, i.e. a fresh key per run. The custom filter below replaces that per-run key with a fixed one.

dupefilters.py (the custom file; the module path in `DUPEFILTER_CLASS` must match this file name):

```python
from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults


class RedisDupeFilter(RFPDupeFilter):

    @classmethod
    def from_settings(cls, settings):
        server = get_redis_from_settings(settings)
        # XXX: This creates one-time key. needed to support to use this
        # class as standalone dupefilter with scrapy's default scheduler
        # if scrapy passes spider on open() method this wouldn't be needed
        # TODO: Use SCRAPY_JOB env as default and fallback to timestamp.
        # key = defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        key = defaults.DUPEFILTER_KEY % {'timestamp': 'xiaodongbei'}  # replace the random part of the key (int(time.time())) with a fixed value
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)
```
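Because the key is now fixed, the fingerprint set survives across runs and can be inspected directly. A minimal sketch, assuming a local Redis on the default port (matching REDIS_HOST/REDIS_PORT above) and that a crawl has already run with these settings:

```python
import redis

# Assumption: Redis runs locally on the default port, as configured above.
r = redis.StrictRedis(host='localhost', port=6379, decode_responses=True)

# RFPDupeFilter adds each request fingerprint with SADD, so the key holds a set.
print(r.type('dupefilter:xiaodongbei'))   # 'set'
print(r.scard('dupefilter:xiaodongbei'))  # how many unique requests have been seen

# Each member is a 40-character sha1 request fingerprint.
for fp in r.sscan_iter('dupefilter:xiaodongbei', count=10):
    print(fp)
```

This is also the trade-off of a fixed key: previously seen URLs stay filtered out across runs until you clear the state manually, e.g. with `r.delete('dupefilter:xiaodongbei')`.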
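For reference, what gets stored is not the URL itself but Scrapy's request fingerprint (RFPDupeFilter computes it via scrapy.utils.request.request_fingerprint), which canonicalizes the URL before hashing. A quick sketch; note this helper still works but is deprecated in recent Scrapy releases in favor of a fingerprinter component:

```python
from scrapy.http import Request
from scrapy.utils.request import request_fingerprint

# Query-string order is normalized during canonicalization,
# so these two requests dedup to the same fingerprint.
r1 = Request('http://example.com/?a=1&b=2')
r2 = Request('http://example.com/?b=2&a=1')
print(request_fingerprint(r1))                             # 40-char sha1 hex digest
print(request_fingerprint(r1) == request_fingerprint(r2))  # True
```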