When you run scrapy crawl <spider>, a crawl command object is created. Scrapy executes the command through the execute() function in cmdline.py, which attaches a crawler_process attribute to the command object (cmd.crawler_process = CrawlerProcess(settings)). Calling CrawlerProcess.crawl() then ends up invoking the Crawler's crawl() method.
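The same machinery can be driven programmatically. Here is a minimal sketch, roughly what the crawl command does for us; MySpider, its URL and the selector are placeholders, not part of Scrapy itself:

from scrapy import Spider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class MySpider(Spider):
    name = 'example'
    start_urls = ['http://example.com']

    def parse(self, response):
        # yield one item per page; the selector is illustrative only
        yield {'title': response.css('title::text').extract_first()}

process = CrawlerProcess(get_project_settings())
process.crawl(MySpider)   # schedules Crawler.crawl() and returns a Deferred
process.start()           # starts the Twisted reactor and blocks until crawling finishes

Internally, CrawlerProcess (via CrawlerRunner) builds one Crawler per spider class in _create_crawler():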
def _create_crawler(self, spidercls):
    if isinstance(spidercls, six.string_types):
        spidercls = self.spider_loader.load(spidercls)
    return Crawler(spidercls, self.settings)  # build a Crawler for this spider class
class Crawler(object):  # wraps one spider class plus everything needed to run it
    def __init__(self, spidercls, settings=None):
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)
        self.spidercls = spidercls  # the spider class this crawler will run
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)
        self.signals = SignalManager(self)
        self.stats = load_object(self.settings['STATS_CLASS'])(self)
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)
        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)
        self.extensions = ExtensionManager.from_crawler(self)
        self.settings.freeze()
        self.crawling = False  # whether this crawler is currently crawling
        self.spider = None
        self.engine = None  # the engine, which drives the spider, scheduler and downloader
    @property
    def spiders(self):
        if not hasattr(self, '_spiders'):
            warnings.warn("Crawler.spiders is deprecated, use "
                          "CrawlerRunner.spider_loader or instantiate "
                          "scrapy.spiderloader.SpiderLoader with your "
                          "settings.",
                          category=ScrapyDeprecationWarning, stacklevel=2)
            self._spiders = _get_spider_loader(self.settings.frozencopy())
        return self._spiders
    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):  # Crawler.crawl() creates the spider and the engine
        assert not self.crawling, "Crawling already taking place"
        self.crawling = True
        try:
            self.spider = self._create_spider(*args, **kwargs)
            self.engine = self._create_engine()
            start_requests = iter(self.spider.start_requests())
            yield self.engine.open_spider(self.spider, start_requests)  # the engine opens the spider
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # In Python 2 reraising an exception after yield discards
            # the original traceback (see http://bugs.python.org/issue7563),
            # so sys.exc_info() workaround is used.
            # This workaround also works in Python 3, but it is not needed,
            # and it is slower, so in Python 3 we use native `raise`.
            if six.PY2:
                exc_info = sys.exc_info()

            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()

            if six.PY2:
                six.reraise(*exc_info)
            raise
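Because crawl() is decorated with @defer.inlineCallbacks it returns a Deferred instead of blocking. A minimal sketch of consuming that Deferred through CrawlerRunner (reusing the placeholder MySpider from above) could look like this:

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings

runner = CrawlerRunner(get_project_settings())
d = runner.crawl(MySpider)            # the Deferred produced by Crawler.crawl()
d.addBoth(lambda _: reactor.stop())   # stop the reactor once crawling ends, success or failure
reactor.run()                         # block until reactor.stop() is called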
scrapy/core/engine.py
def _next_request_from_scheduler(self, spider):
    slot = self.slot
    request = slot.scheduler.next_request()  # pop the next request from the scheduler
    if not request:
        return
    d = self._download(request, spider)  # start the download, getting back a Deferred
    d.addBoth(self._handle_downloader_output, request, spider)
    d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    d.addBoth(lambda _: slot.remove_request(request))
    d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    d.addBoth(lambda _: slot.nextcall.schedule())
    d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                       exc_info=failure_to_exc_info(f),
                                       extra={'spider': spider}))
    return d
def _handle_downloader_output(self, response, request, spider):
    assert isinstance(response, (Request, Response, Failure)), response
    # downloader middleware can return requests (for example, redirects)
    if isinstance(response, Request):  # if a Request came back, re-schedule it via crawl()
        self.crawl(response, spider)
        return
    # response is a Response or Failure
    d = self.scraper.enqueue_scrape(response, request, spider)  # hand the result to the scraper
    d.addErrback(lambda f: logger.error('Error while enqueuing downloader output',
                                        exc_info=failure_to_exc_info(f),
                                        extra={'spider': spider}))
    return d
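The "downloader middleware can return requests" branch is what makes redirects and retries work: a middleware's process_response() may return a Request instead of a Response, and the engine feeds it back through self.crawl(). A minimal sketch, with purely illustrative retry-on-503 logic:

from scrapy import Request

class RetryOn503Middleware(object):
    # hypothetical middleware name; the hook signature is the standard one
    def process_response(self, request, response, spider):
        if response.status == 503:
            # returning a Request makes the engine schedule it again
            return request.replace(dont_filter=True)
        return response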
Most of Scrapy's asynchronous flow is built on Twisted's inlineCallbacks: each yield suspends the generator until the yielded Deferred fires, so Deferred-based code reads like sequential code.
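A small self-contained illustration of that style, independent of Scrapy (deferLater stands in for an asynchronous operation such as a download):

from twisted.internet import defer, reactor, task

@defer.inlineCallbacks
def fetch_like_flow():
    # yield suspends here until the Deferred from deferLater fires
    result = yield task.deferLater(reactor, 0.1, lambda: 'downloaded')
    print('step 1:', result)
    defer.returnValue(result.upper())   # Python 2 style; a plain return works on Python 3

d = fetch_like_flow()
d.addBoth(lambda _: reactor.stop())
reactor.run()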
scrapy/core/scraper.py
def _scrape2(self, request_result, request, spider):
    """Handle the different cases of request's result being a Response or a Failure"""
    if not isinstance(request_result, Failure):
        return self.spidermw.scrape_response(
            self.call_spider, request_result, request, spider)
    else:
        # FIXME: don't ignore errors in spider middleware
        dfd = self.call_spider(request_result, request, spider)  # the download produced a Failure; pass it straight to call_spider
        return dfd.addErrback(
            self._log_download_errors, request_result, request, spider)

def call_spider(self, result, request, spider):
    result.request = request
    dfd = defer_result(result)
    dfd.addCallbacks(request.callback or spider.parse, request.errback)  # dispatch to the request's callback (or spider.parse by default); failures go to the errback
    return dfd.addCallback(iterate_spider_output)
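From the spider's side, that dispatch is what the callback and errback arguments of Request map onto. A minimal sketch; the spider name, URLs and the parse_item/handle_error methods are illustrative, not part of Scrapy:

import scrapy

class CallbackSpider(scrapy.Spider):
    name = 'callback_demo'
    start_urls = ['http://example.com']

    def parse(self, response):
        # requests without an explicit callback fall back to parse(), as in call_spider()
        yield scrapy.Request('http://example.com/item',
                             callback=self.parse_item,
                             errback=self.handle_error)

    def parse_item(self, response):
        yield {'url': response.url}

    def handle_error(self, failure):
        self.logger.error('request failed: %s', failure)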