# Role of the scheduler: controls how Request objects are stored and retrieved, and provides filtering of duplicate Requests.
2
class Scheduler(object):
    """Hold the duplicate filter and queue classes used to schedule Requests.

    Instances are normally built through :meth:`from_crawler`, which reads the
    relevant classes and flags from the crawler's settings.
    """

    def __init__(self, dupefilter, jobdir=None, dqclass=None, mqclass=None,
                 logunser=False, stats=None, pqclass=None):
        """Store the scheduler's collaborators and configuration.

        Args:
            dupefilter: Duplicate-filter object, kept as ``self.df``.
            jobdir: Job directory; handed to ``self._dqdir`` to derive
                ``self.dqdir``.
            dqclass: Disk queue class.
            mqclass: In-memory queue class.
            logunser: Flag stored as ``self.logunser`` (populated from the
                ``LOG_UNSERIALIZABLE_REQUESTS`` setting by ``from_crawler``).
            stats: Stats object, kept as ``self.stats``.
            pqclass: Priority queue class.
        """
        self.df = dupefilter
        # NOTE(review): ``_dqdir`` is not defined in the visible part of this
        # class; it is expected to be provided elsewhere on the class.
        self.dqdir = self._dqdir(jobdir)
        self.pqclass = pqclass  # priority queue
        self.dqclass = dqclass  # on-disk serialized queue, used to resume an interrupted crawl
        self.mqclass = mqclass  # in-memory queue
        self.logunser = logunser
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        """Build a scheduler from ``crawler.settings`` and ``crawler.stats``.

        The duplicate filter class and the three queue classes are resolved
        with ``load_object`` from the ``DUPEFILTER_CLASS``,
        ``SCHEDULER_PRIORITY_QUEUE``, ``SCHEDULER_DISK_QUEUE`` and
        ``SCHEDULER_MEMORY_QUEUE`` settings; the duplicate filter instance is
        created via its ``from_settings`` constructor.

        Args:
            crawler: Object exposing ``settings`` and ``stats`` attributes.

        Returns:
            A new instance of ``cls``.
        """
        settings = crawler.settings

        # Duplicate filter: resolve the configured class, then instantiate it.
        dupefilter_cls = load_object(settings['DUPEFILTER_CLASS'])
        dupefilter = dupefilter_cls.from_settings(settings)

        # Queue classes are only resolved here, not instantiated.
        pqclass = load_object(settings['SCHEDULER_PRIORITY_QUEUE'])
        dqclass = load_object(settings['SCHEDULER_DISK_QUEUE'])
        mqclass = load_object(settings['SCHEDULER_MEMORY_QUEUE'])

        # The SCHEDULER_DEBUG value is passed as the second argument, i.e. it
        # serves as the fallback for LOG_UNSERIALIZABLE_REQUESTS.
        logunser = settings.getbool('LOG_UNSERIALIZABLE_REQUESTS',
                                    settings.getbool('SCHEDULER_DEBUG'))

        return cls(dupefilter, jobdir=job_dir(settings), logunser=logunser,
                   stats=crawler.stats, pqclass=pqclass, dqclass=dqclass,
                   mqclass=mqclass)