Crawler Basics 8 (How the Scrapy Framework Handles Start URLs Internally)

How the Scrapy Framework Handles Start URLs Internally

Source Code Analysis

Location

start_requests is defined on the scrapy.Spider base class, so any spider that subclasses it, like the one below, inherits the default implementation:

class KuaidailiSpider(scrapy.Spider):

The source:

    def start_requests(self):
        cls = self.__class__
        # if the spider still overrides the deprecated
        # make_requests_from_url, fall back to it and warn
        if method_is_overridden(cls, Spider, 'make_requests_from_url'):
            warnings.warn(
                "Spider.make_requests_from_url method is deprecated; it "
                "won't be called in future Scrapy releases. Please "
                "override Spider.start_requests method instead (see %s.%s)." % (
                    cls.__module__, cls.__name__
                ),
            )
            for url in self.start_urls:
                yield self.make_requests_from_url(url)
        else:
            # default path: wrap every start URL in a Request;
            # dont_filter=True exempts start URLs from the duplicate filter
            for url in self.start_urls:
                yield Request(url, dont_filter=True)
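
In short: unless the spider still overrides the long-deprecated make_requests_from_url, the base class simply wraps every entry of start_urls in a Request with dont_filter=True, so start URLs are never dropped by the duplicate filter. A minimal sketch of that default behavior, drained by hand (Scrapy 1.x assumed; the spider name and URLs below are made up for illustration):

import scrapy

class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['https://example.com/a', 'https://example.com/b']

# the inherited start_requests wraps each start URL in a Request
for req in DemoSpider().start_requests():
    print(req.url, req.dont_filter)   # prints each URL with True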

Customizing the start URLs

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request

"""
scrapy引擎来爬虫中取起始url:
    1、调用start_requests并获取返回值
    2、v = iter(返回值)
    3、
        执行 req1 = v.__next__()
        执行 req2 = v.__next__()
        执行 req3 = v.__next__()
        ...
    4、req全部放到调度器中
"""
class KuaidailiSpider(scrapy.Spider):
    name = 'kuaidaili'
    allowed_domains = ['kuaidaili.com']  # domains only, never full URLs
    start_urls = ['https://www.kuaidaili.com/free/']
    cookie_dict = {}  # unused below; a spot to stash cookies between requests

    def start_requests(self):
        # Approach 1: yield the requests one by one
        # (a POST works too: yield Request(url=url, method='POST');
        #  see the POST sketch below)
        for url in self.start_urls:
            yield Request(url=url)
        # Approach 2: build and return a list of requests instead
        # req_list = []
        # for url in self.start_urls:
        #     req_list.append(Request(url=url))
        # return req_list
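
The four steps in the docstring can be imitated by hand. A minimal sketch, run outside of Scrapy and assuming the KuaidailiSpider class above; the real engine and scheduler do far more (filtering, ordering, concurrency):

spider = KuaidailiSpider()
v = iter(spider.start_requests())           # steps 1 and 2
while True:
    try:
        req = v.__next__()                  # step 3
    except StopIteration:
        break
    print('handed to scheduler:', req.url)  # step 4

And since the comment in Approach 1 mentions POST, here is a minimal POST sketch using scrapy.http.FormRequest, which defaults to POST when formdata is given; the login URL and form fields are hypothetical, purely for illustration:

import scrapy
from scrapy.http import FormRequest

class LoginSpider(scrapy.Spider):
    name = 'login_demo'

    def start_requests(self):
        # FormRequest form-encodes formdata and sends it as a POST
        yield FormRequest(
            url='https://example.com/login',     # hypothetical endpoint
            formdata={'user': 'u', 'pwd': 'p'},  # hypothetical fields
            callback=self.after_login,
        )

    def after_login(self, response):
        self.logger.info('login response status: %s', response.status)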

 
