# Installation
pip3 install scrapy_redis
# Source code
https://github.com/rmax/scrapy-redis.git
# Documentation
https://github.com/rmax/scrapy-redis
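Before wiring Scrapy to Redis, it can help to confirm the server is actually reachable. A minimal sketch using the redis-py client (installed as a dependency of scrapy_redis), assuming a default local Redis:

import redis

r = redis.Redis(host='localhost', port=6379)
print(r.ping())  # True when the server answers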
# Settings (see https://github.com/rmax/scrapy-redis/wiki/Usage)
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
# Deduplicate requests across all workers through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Schedule requests from a shared Redis queue
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the queue and dupefilter in Redis so the spider can be paused and resumed
SCHEDULER_PERSIST = True
# Optional queue types (priority queue is the default):
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"  # FIFO
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"  # LIFO

# Push scraped items into a Redis list ('<spider>:items' by default)
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
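With these settings every worker pulls requests from Redis, and RedisPipeline leaves serialized items behind in Redis as well. A minimal sketch of seeding a start-url queue and reading results back with redis-py; the key names assume the spider definitions below and the pipeline's default '<spider>:items' key:

import redis

r = redis.Redis(host='localhost', port=6379)

# Seed the list the RedisSpider below listens on (its redis_key)
r.lpush('myspider:baidu', 'https://www.baidu.com')

# After a crawl, RedisPipeline stores JSON-serialized items here
for raw in r.lrange('baidu:items', 0, -1):
    print(raw.decode('utf-8'))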

# Spider
from scrapy_redis.spiders import RedisSpider


class BaiduSpider(RedisSpider):
    """Spider that reads start urls from the redis list 'myspider:baidu'."""
    name = 'baidu'
    redis_key = 'myspider:baidu'
    # allowed_domains = ['baidu.com']

    def __init__(self, *args, **kwargs):
        # Build the allowed-domains list dynamically, e.g. -a domain=baidu.com
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(BaiduSpider, self).__init__(*args, **kwargs)

    def parse(self, response):
        print(response.text)
        # return {
        #     'name': response.css('title::text').extract_first(),
        #     'url': response.url,
        # }
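To run it, start the spider (it idles, waiting on the Redis list) and push a start URL from another shell; a sketch assuming the settings above:

# Terminal 1: start the spider
scrapy crawl baidu -a domain=baidu.com
# Terminal 2: push a start URL onto the spider's redis_key
redis-cli lpush myspider:baidu https://www.baidu.com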

# CrawlSpider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider


class FanqienovelSpider(RedisCrawlSpider):
    """Crawl spider that reads start urls from the redis list 'mycrawler:fanqienovel'."""
    name = 'fanqienovel'
    redis_key = 'mycrawler:fanqienovel'
    # allowed_domains = ['fanqienovel.com']

    rules = (
        # Follow every link and hand each page to parse_page
        Rule(LinkExtractor(), callback='parse_page', follow=True),
    )

    def __init__(self, *args, **kwargs):
        # Build the allowed-domains list dynamically, e.g. -a domain=fanqienovel.com
        domain = kwargs.pop('domain', '')
        self.allowed_domains = list(filter(None, domain.split(',')))
        super(FanqienovelSpider, self).__init__(*args, **kwargs)

    def parse_page(self, response):
        print(response.text)
        return {
            'name': response.css('title::text').extract_first(),
            'url': response.url,
        }
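Running it follows the same pattern; 'fanqienovel.com' here is only an assumed target chosen to match the spider name:

# Terminal 1: start the crawl spider
scrapy crawl fanqienovel -a domain=fanqienovel.com
# Terminal 2: seed the queue; the Rule then follows links from this page
redis-cli lpush mycrawler:fanqienovel https://fanqienovel.com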