1 """
2 配置 redis
3 安装 pip3 install scrapy-redis
4 修改scrapy项目(先正常实现scrapy爬虫):
5 """
6
# ---- 1. Import the distributed spider classes
from scrapy_redis.spiders import RedisSpider, RedisCrawlSpider

# ---- 2. Inherit from a distributed spider class
class BookSpider(RedisSpider):
    # ---- 3. Replace the initial start_urls with redis_key
    redis_key = 'key'

# ---- 4. In settings.py, swap in the scrapy-redis scheduler and dupefilter
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True  # keep the request queue and dupefilter set in Redis after the crawl ends
# ---- 5. Configure the Redis connection in settings.py
REDIS_URL = 'redis://:[password]@host:port'
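
# Workers block while idle until a start URL appears in the list named by
# redis_key. A minimal seeding sketch with redis-py (the host, password, and
# URL below are placeholder values, not part of the original notes):
import redis

r = redis.Redis(host='host', port=6379, password='password')
r.lpush('key', 'http://example.com/books/')  # every idle worker pops start URLs from this list
# Shell equivalent: redis-cli lpush key http://example.com/books/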

-----------------------------------------------------------------------
# ---- 1. Import the distributed spider classes
from scrapy_redis.spiders import RedisSpider, RedisCrawlSpider

# ---- 2. Inherit from a distributed spider class
class BookSpider(RedisSpider):

    # ---- 3. Comment out start_urls & allowed_domains
    # ---- 4. Set redis_key
    redis_key = 'key'

    # ---- 5. Override __init__ to accept the allowed domains at run time
    def __init__(self, *args, **kwargs):
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(BookSpider, self).__init__(*args, **kwargs)

# ---- 6. Update the settings file
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True
REDIS_URL = 'redis://:[password]@host:port'

# Note: pass the target domain(s) via the spider's domain argument at run time.
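# For example (the spider name and domains are illustrative), start a worker with:
#   scrapy crawl book -a domain=example.com,books.example.com
# then seed the queue so idle workers pick up work:
#   redis-cli lpush key http://example.com/books/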