scrapy-redis

 

 

不使用scrapy-redis的:

最初始的方法:

 

settings里面:

 

# DUPEFILTER_CLASS = 'scrapy_pro1.dupfliter.DupFliter'

 

 

from   scrapy.dupefilter import  BaseDupeFilter
from scrapy.utils.request import request_fingerprint##加密
import redis
class DupFliter(BaseDupeFilter):
    """In-memory duplicate filter keyed on request fingerprints.

    Fingerprints are kept in a plain ``set`` on the instance, so nothing is
    persisted between runs.
    """

    def __init__(self):
        # Fingerprints of every request accepted so far.
        self.dupdic = set()

    @classmethod
    def from_settings(cls, settings):
        """Build the filter from the settings object.

        The method name must not be changed. ``settings`` is accepted but
        not used.
        """
        return cls()

    def request_seen(self, request):
        """Report whether ``request`` has already been recorded.

        Returns:
            True if the fingerprint is already in the set, otherwise the
            fingerprint is stored and False is returned.
        """
        fingerprint = request_fingerprint(request)
        if fingerprint in self.dupdic:
            return True
        self.dupdic.add(fingerprint)
        print('添加成功', request.url)
        # Return an explicit False rather than falling off the end with None.
        return False

配置了redis的去重方法(通过 sadd 能否把指纹添加进集合来判断它是否已经存在):

 

settings里面:

# DUPEFILTER_CLASS = 'scrapy_pro1.dupfliter.DupFliter'

去重类:
from   scrapy.dupefilter import  BaseDupeFilter
from scrapy.utils.request import request_fingerprint##加密
import redis
class DupFliter(BaseDupeFilter):
    """Duplicate filter that stores request fingerprints in a Redis set.

    Membership is decided by the result of ``SADD``: when nothing was added,
    the fingerprint was already in the set.
    """

    def __init__(self, host='127.0.0.1', port=6379, key='scrapy_urls2'):
        """Open the Redis client.

        Args:
            host: Redis host; defaults to the previously hard-coded value.
            port: Redis port; defaults to the previously hard-coded value.
            key: name of the Redis set that holds the fingerprints.
        """
        self.dupdic = redis.Redis(host=host, port=port)
        self.key = key

    def request_seen(self, request):
        """Report whether ``request`` has already been recorded in Redis.

        Returns:
            True when the fingerprint was already in the set (the request is
            a duplicate), False when it was newly added.
        """
        fingerprint = request_fingerprint(request)
        print(fingerprint)
        print('执行去重')
        # The SADD result is compared with 0: zero means nothing was added,
        # i.e. the fingerprint already existed.
        added = self.dupdic.sadd(self.key, fingerprint)
        if added == 0:
            print('已经存在', request.url)
            return True
        print('添加成功', request.url)
        return False




使用scrapy-redis的:

第一种去重方法:
原生的scrapy-redis去重:

settings里面:

DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'  # the dedup class that ships with scrapy-redis




第二种去重方法:
(自定义部分:原生的 scrapy-redis 使用时间戳作为 key 来保存去重记录,并且记录会在执行完之后自动清除,所以这里继承 RFPDupeFilter 类并使用固定的 key,弥补上面的缺陷。)
settings配置:
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
# REDIS_PARAMS = {'password':'beta'}  # Redis connection parameters. Default: REDIS_PARAMS = {'socket_timeout': 30, 'socket_connect_timeout': 30, 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
REDIS_ENCODING = "utf-8"

# REDIS_URL = 'redis://user:pass@hostname:6379'  # Connection URL (takes precedence over the settings above)
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'  # key template with a %(timestamp)s placeholder
DUPEFILTER_CLASS = 'scrapy_pro1.dupfliter.RedisDupeFilter'  # the custom dedup class (RFPDupeFilter subclass), not the built-in one



去重类:

from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults
class RedisDupeFilter(RFPDupeFilter):
    """RFPDupeFilter variant that uses a fixed Redis key.

    A constant label is substituted for the ``%(timestamp)s`` placeholder,
    so every run builds the same key.
    """

    @classmethod
    def from_settings(cls, settings):
        """Build the filter from the settings object.

        Reads ``DUPEFILTER_KEY`` (falling back to the scrapy-redis default
        template) and ``DUPEFILTER_DEBUG`` from ``settings``.
        """
        server = get_redis_from_settings(settings)
        # Honor a DUPEFILTER_KEY override from settings instead of always
        # using the library default.
        key_template = settings.get('DUPEFILTER_KEY', defaults.DUPEFILTER_KEY)
        # A fixed label in place of a timestamp keeps the key stable.
        key = key_template % {'timestamp': '记录key'}
        debug = settings.getbool('DUPEFILTER_DEBUG')
        return cls(server, key=key, debug=debug)

 

posted @ 2018-11-13 04:13  风不再来  阅读(261)  评论(0)  编辑  收藏  举报