from __future__ import absolute_import
from copy import deepcopy
from scrapy.utils.request import request_fingerprint
from scrapy.utils.url import canonicalize_url
from scrapy_splash.utils import dict_hash
from scrapy_redis.dupefilter import RFPDupeFilter
def splash_request_fingerprint(request, include_headers=None):
    """Return a request fingerprint that also reflects the 'splash' meta key.

    The base fingerprint is computed with ``request_fingerprint``. When the
    request's ``meta`` has no ``'splash'`` entry, that base fingerprint is
    returned as is. Otherwise the Splash options are copied, the ``'url'``
    entry of their ``'args'`` (if any) is canonicalized, and the options are
    combined with the base fingerprint through ``dict_hash``.

    :param request: request object exposing a ``meta`` mapping.
    :param include_headers: forwarded unchanged to ``request_fingerprint``.
    :return: the base fingerprint, or the ``dict_hash`` result when Splash
        options are present.
    """
    base_fp = request_fingerprint(request, include_headers=include_headers)
    meta = request.meta

    if 'splash' in meta:
        # Work on a deep copy so the request's own meta is never modified
        # by the setdefault / URL rewrite below.
        options = deepcopy(meta['splash'])
        render_args = options.setdefault('args', {})

        if 'url' in render_args:
            # keep_fragments=True is passed so the fragment part of the URL
            # is kept during canonicalization.
            render_args['url'] = canonicalize_url(
                render_args['url'], keep_fragments=True
            )

        return dict_hash(options, base_fp)

    return base_fp
class SplashAwareDupeFilter(RFPDupeFilter):
    """RFPDupeFilter subclass that fingerprints requests Splash-aware.

    Only ``request_fingerprint`` is overridden; everything else is inherited
    from ``RFPDupeFilter``.
    """

    def request_fingerprint(self, request):
        """Return the fingerprint of ``request`` from ``splash_request_fingerprint``."""
        fingerprint = splash_request_fingerprint(request)
        return fingerprint
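# Usage: create a new .py file in the project directory and copy the code above into it.
# Then add DUPEFILTER_CLASS = '<project_name>.<module_name>.SplashAwareDupeFilter' to the
# project settings (the module name is the file name without the ".py" suffix), and delete
# the DUPEFILTER_CLASS entries previously configured for scrapy-redis and scrapy-splash.
# The other settings do not conflict and can be left unchanged.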