Scrapy Source Code Analysis

A rough sketch of the flow:
1. Run the command

```
scrapy custom --nolog
```
2. This triggers the /usr/local/bin/scrapy.py entry script
```python
# -*- coding: utf-8 -*-
import re
import sys

from scrapy.cmdline import execute

if __name__ == '__main__':
    # Strip the '-script.pyw' / '.exe' suffix that setuptools wrappers
    # append on Windows, then hand control to scrapy.cmdline.execute()
    sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0])
    sys.exit(execute())
```
Equivalently, the same entry point can be driven from a script by faking sys.argv, which is handy for stepping through the source in a debugger:

```python
import sys

from scrapy.cmdline import execute

if __name__ == '__main__':
    # Simulate typing `scrapy github --nolog` on the command line
    sys.argv = ["scrapy", "github", "--nolog"]
    sys.exit(execute())
```
3. The execute() method runs

This step mainly accomplishes the following:

- Load the settings
- Resolve the command object to run, e.g. view, fetch, a custom command, etc.
- Create the CrawlerProcess object used to crawl the data: cmd.crawler_process = CrawlerProcess(settings)
- Start the crawl: cmd.run()
```python
# scrapy/cmdline.py (excerpt)
import sys
import optparse

from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import UsageError
from scrapy.settings.deprecated import check_deprecated_settings
from scrapy.utils.project import inside_project, get_project_settings


def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # ############ 1. Load the settings ############
    # Legacy path: use the scrapy.conf.settings object as the settings
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # Otherwise use the scrapy.cfg file in the current directory:
    # read its [settings] section and load everything defined in
    # spider1.settings, e.g.
    #   [settings]
    #   default = spider1.settings
    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # ############ 2. Validate the command the user typed, e.g. scrapy fetch ... ############
    inproject = inside_project()
    # ##### Collect every command Scrapy supports: built-in and custom.
    # Custom commands: the COMMANDS_MODULE setting, e.g.
    # COMMANDS_MODULE = 'spider1.cmds', is read and its commands are added.
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    # ############ 3. Create the command object ############
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    # ############ 4. Create the CrawlerProcess object that does the actual crawling ############
    cmd.crawler_process = CrawlerProcess(settings)

    # ############ 5. Call the command object's run method to start the crawl ############
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)


def _run_command(cmd, args, opts):
    if opts.profile:
        _run_command_profiled(cmd, args, opts)
    else:
        cmd.run(args, opts)


def _run_print_help(parser, func, *a, **kw):
    try:
        func(*a, **kw)
    except UsageError as e:
        if str(e):
            parser.error(str(e))
        if e.print_help:
            parser.print_help()
        sys.exit(2)
```
```python
# scrapy/utils/ossignal.py (excerpt)
import signal

from twisted.internet import reactor


def install_shutdown_handlers(function, override_sigint=True):
    reactor._handleSignals()
    # kill -15 (SIGTERM): trigger the given function
    signal.signal(signal.SIGTERM, function)
    # kill -2 (SIGINT, i.e. Ctrl-C): trigger the given function
    if signal.getsignal(signal.SIGINT) == signal.default_int_handler \
            or override_sigint:
        signal.signal(signal.SIGINT, function)
    # Catch Ctrl-Break on Windows
    if hasattr(signal, "SIGBREAK"):
        signal.signal(signal.SIGBREAK, function)


# scrapy/crawler.py (excerpt)
class CrawlerProcess(CrawlerRunner):

    def __init__(self, settings=None):
        super(CrawlerProcess, self).__init__(settings)
        # Register signal handlers: when the process is killed or
        # interrupted with Ctrl-C,
        #   - stop the running crawlers
        #   - stop the twisted event loop (the reactor)
        install_shutdown_handlers(self._signal_shutdown)
        configure_logging(self.settings)
        log_scrapy_info(self.settings)

    def _signal_shutdown(self, signum, _):
        # First signal: escalate the handlers, then stop gracefully
        install_shutdown_handlers(self._signal_kill)
        signame = signal_names[signum]
        logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
                    {'signame': signame})
        reactor.callFromThread(self._graceful_stop_reactor)

    def _signal_kill(self, signum, _):
        # Second signal: ignore further signals, stop the reactor immediately
        install_shutdown_handlers(signal.SIG_IGN)
        signame = signal_names[signum]
        logger.info('Received %(signame)s twice, forcing unclean shutdown',
                    {'signame': signame})
        reactor.callFromThread(self._stop_reactor)

    def stop(self):
        # Stop every crawler currently managed by this process
        return defer.DeferredList([c.stop() for c in list(self.crawlers)])

    def _graceful_stop_reactor(self):
        d = self.stop()
        d.addBoth(self._stop_reactor)
        return d

    def _stop_reactor(self, _=None):
        try:
            reactor.stop()
        except RuntimeError:  # raised if already stopped or in shutdown stage
            pass
```
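The two-stage shutdown is easier to see in isolation. Below is a minimal standalone sketch (plain Python, not Scrapy's code) of the same pattern: the first SIGINT/SIGTERM requests a graceful stop, and a second one forces an exit:

```python
import signal
import sys
import time


def force(signum, _frame):
    # Second signal: give up on graceful shutdown and exit immediately
    print("Received signal %d twice, forcing unclean shutdown" % signum)
    sys.exit(1)


def graceful(signum, _frame):
    # First signal: escalate the handlers, then begin an orderly shutdown
    print("Received signal %d, shutting down gracefully. Send again to force" % signum)
    signal.signal(signal.SIGINT, force)
    signal.signal(signal.SIGTERM, force)
    # ... a real program would start draining work here, the way
    # _graceful_stop_reactor() stops the running crawlers ...


signal.signal(signal.SIGINT, graceful)   # Ctrl-C / kill -2
signal.signal(signal.SIGTERM, graceful)  # kill -15

while True:          # stand-in for the twisted reactor loop
    time.sleep(1)
```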
4. The command object

Define a custom command class so that it can be invoked from the scrapy command line: scrapy custom --nolog

PS: the module that holds the custom command files must be registered in the settings: COMMANDS_MODULE = '<dotted path of the module containing the custom command files>'
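For example, if the custom.py below sits in a spider1/cmds/ package (the same example name used in the execute() comments above), the project settings would contain:

```python
# settings.py (sketch) -- 'spider1.cmds' is an assumed package name;
# point COMMANDS_MODULE at whatever module holds your command classes.
COMMANDS_MODULE = 'spider1.cmds'
```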
```python
# custom.py
from scrapy.commands import ScrapyCommand


class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        # #################### Run the spider named 'cnblogs' ####################
        # self.crawler_process is the CrawlerProcess object created in execute()
        self.crawler_process.crawl('cnblogs', **opts.__dict__)
        self.crawler_process.start()
```
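With the module registered, running scrapy with no command name lists custom alongside the built-in commands (the _print_commands branch of execute() above), and scrapy custom --nolog ends up in Command.run().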
5. Start crawling data via CrawlerProcess
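As a minimal, self-contained sketch of what this step amounts to (the QuotesSpider class and URL are illustrative stand-ins, not part of the notes above):

```python
import scrapy
from scrapy.crawler import CrawlerProcess


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        for text in response.css("div.quote span.text::text").getall():
            yield {"text": text}


process = CrawlerProcess(settings={"LOG_ENABLED": False})  # same effect as --nolog
process.crawl(QuotesSpider)   # schedule the spider, as Command.run() does
process.start()               # start the twisted reactor; blocks until done
```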