scrapy_redis组件去重掉url
settings.py
# 去重类
# DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter' scrapy_redis组件默认自带的类
DUPEFILTER_CLASS = 'xdb.dupefilter.RedisDupeFilter' # 实际上也是继承了上面的类 ,修改了DUPEFILTER_KEY值
############## scrapy_redis连接 ###############
# 方式一
REDIS_HOST = 'localhost' # 主机名
REDIS_PORT = 6379 # 端口
REDIS_PARAMS = {} # Redis连接参数 默认:REDIS_PARAMS = {'socket_timeout': 30,'socket_connect_timeout': 30,'retry_on_timeout': True,'encoding': REDIS_ENCODING}
# REDIS_PARAMS['redis_cls'] = 'myproject.RedisClient' # 指定连接Redis的Python模块 默认:redis.StrictRedis
REDIS_ENCODING = "utf-8" # redis编码类型 默认:'utf-8'
# 方式二
REDIS_URL = 'redis://user:pass@hostname:9001' # 连接URL(优先于以上配置)
# redis中key的名称:
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
自定义redis中的key,变成固定key
dupefilter.py # 自定义文件名(需与 DUPEFILTER_CLASS 中的模块路径 'xdb.dupefilter' 保持一致)
from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults
class RedisDupeFilter(RFPDupeFilter):
    """Request dupefilter that keeps its fingerprints under a fixed Redis key.

    The stock ``RFPDupeFilter`` builds its key from the current timestamp,
    producing a fresh, one-time key on every run.  This subclass substitutes
    a constant value so the fingerprint set is stable and shared across runs.
    """

    @classmethod
    def from_settings(cls, settings):
        """Create the filter from crawler settings.

        Parameters
        ----------
        settings : scrapy.settings.Settings
            The crawler settings used to resolve the Redis connection and
            the ``DUPEFILTER_DEBUG`` flag.

        Returns
        -------
        RedisDupeFilter
            An instance bound to the shared Redis server with a fixed key.
        """
        redis_server = get_redis_from_settings(settings)
        # Upstream default would be:
        #   defaults.DUPEFILTER_KEY % {'timestamp': int(time.time())}
        # which creates a one-time key per run.  Using a constant in place of
        # the timestamp makes the Redis key name fixed across crawls.
        dedup_key = defaults.DUPEFILTER_KEY % {'timestamp': 'xiaodongbei'}
        debug_flag = settings.getbool('DUPEFILTER_DEBUG')
        return cls(redis_server, key=dedup_key, debug=debug_flag)