<5> 分布式实现

 1 """
 2     配置 redis
 3     安装 pip3 install scrapy-redis
 4     修改scrapy项目(先正常实现scrapy爬虫):
 5 """
 6 
 7 # ----1 导入分布式爬虫类
 8 from scrapy_redis.spiders import RedisSpider, RedisCrawlSpider
 9 # ----2 继承分布式爬虫类
10 class BookSpider(RedisSpider):
11 # ----3. 初始的 start_urls 改为 redis_key。
12     redis_key = 'key'
13 # ----4. settings中 修改调度器类和去重类
14     SCHEDULER = "scrapy_redis.scheduler.Scheduler"
15     DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
16     SCHEDULER_PERSIST = True
17 # ----5. 在settings.py文件中配置Redis
18     REDIS_URL = "redis://:[password]@host:port"
19          
20  -----------------------------------------------------------------------
21 # ----1 导入分布式爬虫类
22 from scrapy_redis.spiders import RedisSpider, RedisCrawlSpider
23 # ----2 继承分布式爬虫类
24 class BookSpider(RedisSpider):
25 
26 # ----3 注释掉 start_urls 和 allowed_domains
27 # ----4 设置redis_key
28     redis_key = 'key'
29 # ----5 设置__init__
30     def __init__(self, *args, **kwargs):
31         domain = kwargs.pop('domain', '')
32         self.allowed_domains = list(filter(None, domain.split(',')))
33         super(BookSpider, self).__init__(*args, **kwargs)
34 # ----6 修改配置文件（以下各项写在 settings.py 中）
35     SCHEDULER = "scrapy_redis.scheduler.Scheduler"
36     DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
37     SCHEDULER_PERSIST = True
38     REDIS_URL = "redis://:[password]@host:port"
39                 
40 # 注：运行时需通过 -a 参数传入 domain 来指定允许的域（多个域用逗号分隔），例如：scrapy crawl <爬虫名> -a domain=example.com
posted on 2022-11-07 20:14 不是霉蛋阅读(40) 评论(0) 收藏举报