5. Scrapy Framework (5): Request and Response Objects
Request and Response Objects
The Request Object
```python
class Request(object_ref):

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None):

        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        assert isinstance(priority, int), "Request priority not an integer: %r" % priority
        self.priority = priority

        if callback is not None and not callable(callback):
            raise TypeError('callback must be a callable, got %s' % type(callback).__name__)
        if errback is not None and not callable(errback):
            raise TypeError('errback must be a callable, got %s' % type(errback).__name__)
        assert callback or not errback, "Cannot use errback without a callback"
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        self._meta = dict(meta) if meta else None
        self.flags = [] if flags is None else list(flags)
```
The Request object is used when we are writing a spider and, after crawling one page of data, need to send a new request. The class takes a number of parameters; the most commonly used are:
- url: the URL this Request is sent to.
- callback: the callback function executed after the downloader has finished downloading the response data.
- method: the HTTP method. Defaults to GET and can be set to other methods.
- headers: the request headers. Headers that never change can simply be set in settings.py; headers that vary can be specified when the request is sent.
- meta: used very frequently; it passes data between different requests.
- encoding: the encoding, utf-8 by default; the default is normally fine.
- dont_filter: tells the scheduler not to filter this request; mostly used when the same request has to be sent several times.
- errback: the function executed when an error occurs.

The sketch after this list shows how these parameters are typically combined.
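A minimal sketch of issuing a Request from inside a spider callback. The example.com URLs, the callback names, and the meta key are illustrative assumptions, not part of the original text:

```python
import scrapy


class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['http://example.com/list']

    def parse(self, response):
        # Follow a detail page, pass data to the next callback via meta,
        # and skip the scheduler's duplicate filter.
        yield scrapy.Request(
            url='http://example.com/detail/1',       # hypothetical URL
            callback=self.parse_detail,
            method='GET',
            headers={'Referer': response.url},        # a per-request header
            meta={'category': 'books'},               # data for the next callback
            dont_filter=True,
            errback=self.handle_error,
        )

    def parse_detail(self, response):
        # Data passed from the previous request is available on response.meta.
        self.logger.info('category: %s', response.meta['category'])

    def handle_error(self, failure):
        # Called when the request fails (e.g. DNS error, HTTP error).
        self.logger.error('request failed: %s', failure)
```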
The Response Object
Response objects are normally constructed for you by Scrapy, so as a developer you do not need to worry about how to create one, only about how to use it. A Response object has many attributes that can be used to extract data. The main ones are:
- meta: the meta attribute passed on from the originating request; it can be used to keep data linked across multiple requests.
- encoding: the format used to encode and decode the current response text.
- text: the returned data as a unicode string.
- body: the returned data as a bytes string.
- xpath: the XPath selector.
- css: the CSS selector.

The sketch after this list shows these attributes being read inside a callback.
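A minimal sketch of using these Response attributes in a spider callback. The example.com URL and the XPath/CSS expressions are illustrative assumptions:

```python
import scrapy


class DetailSpider(scrapy.Spider):
    name = 'detail_demo'
    start_urls = ['http://example.com/detail/1']   # hypothetical page

    def parse(self, response):
        category = response.meta.get('category')    # meta passed from the originating request, if any
        encoding = response.encoding                 # decoding format, e.g. 'utf-8'
        html_text = response.text                    # body decoded to a unicode string
        raw_bytes = response.body                    # body as bytes
        title = response.xpath('//title/text()').extract_first()    # XPath selector
        first_link = response.css('a::attr(href)').extract_first()  # CSS selector
        yield {'category': category, 'encoding': encoding, 'title': title,
               'link': first_link, 'size': len(raw_bytes), 'preview': html_text[:100]}
```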
Sending POST Requests
Sometimes we want to send a POST request when fetching data. In that case, use FormRequest, a subclass of Request. If you want to send a POST request as soon as the spider starts, override the start_requests(self) method in the spider class and no longer rely on the URLs in start_urls.
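A minimal sketch of this pattern before the concrete case study below; the login URL and form field names are placeholders, not taken from any real site:

```python
import scrapy


class LoginSpider(scrapy.Spider):
    name = 'login_demo'

    def start_requests(self):
        # Because start_requests is overridden, start_urls is never used;
        # the very first request the spider sends is this POST.
        yield scrapy.FormRequest(
            url='http://example.com/login',            # placeholder login URL
            formdata={'user': 'demo', 'pwd': 'demo'},  # placeholder form fields
            callback=self.after_login,
        )

    def after_login(self, response):
        self.logger.info('login response received: %s', response.status)
```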
Simulating Login
Example 1: Simulating login to Renren (renren.com)
- To send a POST request, scrapy.FormRequest is the recommended method; it makes specifying form data convenient.
- To send a POST request as soon as the spider starts, override the start_requests method and send the POST request inside it.
renren.py
```python
# -*- coding: utf-8 -*-
import scrapy


class RenrenSpider(scrapy.Spider):
    name = 'renren'
    allowed_domains = ['renren.com']
    start_urls = ['http://renren.com/']

    def start_requests(self):
        # Send the login POST request as soon as the spider starts,
        # instead of requesting the URLs in start_urls.
        url = 'http://www.renren.com/PLogin.do'
        data = {"email": "renjy185911222@126.com", 'password': 'caonima001'}
        request = scrapy.FormRequest(url, formdata=data, callback=self.parse_page)
        yield request

    def parse_page(self, response):
        # After logging in, the session cookies are reused automatically,
        # so pages that require login can now be requested.
        request = scrapy.Request(url='http://www.renren.com/880151247/profile',
                                 callback=self.parse_profile)
        yield request

    def parse_profile(self, response):
        # Save the profile page locally to verify that the login worked.
        with open('dp.html', 'w', encoding='utf-8') as fp:
            fp.write(response.text)
```
settings.py
```python
# -*- coding: utf-8 -*-

# Scrapy settings for renren_login project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'renren_login'

SPIDER_MODULES = ['renren_login.spiders']
NEWSPIDER_MODULE = 'renren_login.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'renren_login (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'renren_login.middlewares.RenrenLoginSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'renren_login.middlewares.RenrenLoginDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#     'renren_login.pipelines.RenrenLoginPipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
```
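The file above is essentially the template that `scrapy startproject` generates; the substantive changes for this example are `ROBOTSTXT_OBEY = False`, so the login request is not blocked by robots.txt rules, and `DEFAULT_REQUEST_HEADERS`, which adds a browser `User-Agent` so requests look like they come from an ordinary browser.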
Start.py
```python
from scrapy import cmdline

cmdline.execute("scrapy crawl renren".split())
```
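Running this script (for example, `python Start.py` from the project root) is equivalent to typing `scrapy crawl renren` on the command line; it is simply a convenient way to launch the spider from an IDE or debugger.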