The Scrapy Crawler Framework
I. Installing Scrapy
Linux/Mac: simply run pip3 install scrapy
Windows: 1. pip3 install wheel  2. download and install the Twisted .whl package that matches your Python version and architecture  3. install pywin32  4. pip3 install scrapy
II. Basic Scrapy Commands
1. Create a Scrapy project
scrapy startproject <project_name>  # creates a directory with the same name as the project in the current directory
2. Create a spider
Change into the project directory you just created, then run:
scrapy genspider <spider_name> <domain_to_crawl>  # creates a spider file named after the spider, under the spiders directory inside the package that shares the project's name
3. Run a spider
From the project directory, run:
scrapy crawl <spider_name> --nolog  # the --nolog flag suppresses the spider's log output
Note: if you hit encoding errors when running a spider on Windows, add the following at the top of the spider file:
import sys, io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
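For example, with a project named sp1 and a spider named chouti for chouti.com (the names used elsewhere in these notes), the full round trip is:
scrapy startproject sp1
cd sp1
scrapy genspider chouti chouti.com
scrapy crawl chouti --nolog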
III. Project Directory Structure
Inside the project directory:
-sp1
    -spiders         # directory holding the spider files
    -middlewares.py  # middleware
    -items.py        # data formatting (item definitions)
    -pipelines.py    # data persistence
    -settings.py     # detailed configuration file
-scrapy.cfg          # global configuration file
IV. Configuring settings.py
ROBOTSTXT_OBEY = False  # whether to obey the robots exclusion protocol, i.e. the rules in the site's robots.txt
DEPTH_LIMIT = 1  # maximum recursion depth of the crawl; keeping it at 5 or less is recommended
from scrapy.dupefilter import RFPDupeFilter  # the default dedup filter, for reference
DUPEFILTER_CLASS = '<project_name>.<dedup_module>.<dedup_class>'  # use a custom dedup rule
EXTENSIONS = {
    '<project_name>.<extension_module>.<extension_class>': <priority>,
}  # register custom extensions
ITEM_PIPELINES = {
    'sp1.pipelines.Sp1Pipeline': 300,  # persistence (pipeline) configuration
}
SPIDER_MIDDLEWARES = {<path>: <priority>}  # spider middleware
DOWNLOADER_MIDDLEWARES = {}  # downloader middleware
COMMANDS_MODULE = '<project_name>.<directory_name>'  # custom commands
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"  # HTTPS access configuration
DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"  # path to the class that implements HTTPS access
V. Scrapy Selector Rules
from scrapy.selector import Selector
hxs = Selector(response=response)  # get a selector over the document
result = hxs.xpath('<rule>')  # returns all matching selector objects; you can loop over the result and work on each match individually
ele = result.xpath('<rule>').xpath('<rule>')  # selectors can be chained
Available rules:
1. tag_name
2. tag_name[index]
3. tag_name[@attr="value"][@attr="value"]
4. tag_name[contains(@attr, "value")]  # the attribute's value must contain the given value
5. tag_name[starts-with(@attr, "value")]  # the attribute's value must start with the given value
6. tag_name[re:test(@attr, "regex")]  # the attribute's value must match the regular expression
7. tag_name[re:test(@attr, "regex")]/text()  # get the tag's text
8. tag_name[re:test(@attr, "regex")]/@attr  # get the value of the given attribute of the tag
9. tag_name/tag_name/a/@href  # walk down through child tags level by level, finally taking the href attribute of the a tag
10. For the first step of the expression: // searches the whole HTML document, ./ searches anywhere inside the current selector object, and no leading slash searches only the direct children of the current selector object; from the second step on, // searches anywhere inside the current element and / searches only its direct children
11. xpath() calls can be chained
12. selector_object.extract()  # returns the matches as a list of strings
13. selector_object.extract_first()  # returns the first match as a string
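As a quick illustration of rules 5, 9, 12 and 13 (a minimal sketch; the a tags and the id values are assumptions, not taken from a particular page):
# inside a spider's parse(self, response) method
from scrapy.selector import Selector

hxs = Selector(response=response)
# every <a> tag whose id attribute starts with "i"
for item in hxs.xpath('//a[starts-with(@id, "i")]'):
    text = item.xpath('./text()').extract_first()  # text of the current tag
    href = item.xpath('./@href').extract_first()   # href attribute of the current tag
    print(text, href)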
VI. Customizing the Start-URL Function
'''
A spider wraps its start URLs and their callbacks into Request objects and hands them to the scheduler through the start_requests() method of its class. By overriding this method you can choose the callback yourself, and chain several callbacks one after another.
'''
import scrapy
from scrapy.http import Request

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'  # the name attribute identifies a spider
    allowed_domains = ['chouti.com']  # domains the spider is allowed to crawl, so it does not wander onto other sites
    start_urls = ['http://chouti.com/']  # the spider's start URLs

    def start_requests(self):
        '''
        Wrap each URL and its callback into a Request object and put it into the scheduler
        :return:
        '''
        for url in self.start_urls:
            yield Request(url, dont_filter=True, callback=self.parse1)  # yield Request() puts the request into the scheduler for execution

    def parse1(self, response):
        yield Request(...., callback=self.parse2)

    def parse2(self, response):
        pass
VII. Sending POST Requests with Scrapy
from scrapy.http import Request

class ChoutiSpider(scrapy.Spider):
    def parse(self, response):
        yield Request(
            url='',  # URL to request
            method='POST',  # request method
            headers={'Content-Type': '...,charset=UTF-8'},  # request headers
            cookies={},  # cookies
            body='',  # urlencoded form data
            callback=,  # callback function
        )
'''
Use the urllib.parse module to turn a dict into urlencoded data
import urllib.parse
data = urllib.parse.urlencode({})
'''
'''
Use the CookieJar class to extract the cookies from a response
from scrapy.http.cookies import CookieJar
cookie_jar = CookieJar()
cookie_jar.extract_cookies(response, response.request)  # the cookies now live in the CookieJar object
cookie_dict = {}
for k, v in cookie_jar._cookies.items():
    for i, j in v.items():
        for m, n in j.items():
            cookie_dict[m] = n.value  # flatten the cookies into a dict
'''
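Putting these pieces together, a minimal login sketch might look as follows; the URL, form fields and parse_login_result callback are placeholders chosen for illustration, not a real site's API:
import urllib.parse
import scrapy
from scrapy.http import Request
from scrapy.http.cookies import CookieJar

class LoginSpider(scrapy.Spider):
    name = 'login_demo'
    start_urls = ['http://example.com/']

    def parse(self, response):
        # collect the cookies handed out on the first visit
        cookie_jar = CookieJar()
        cookie_jar.extract_cookies(response, response.request)
        self.cookie_dict = {}
        for k, v in cookie_jar._cookies.items():
            for i, j in v.items():
                for m, n in j.items():
                    self.cookie_dict[m] = n.value
        # send the urlencoded login form as a POST body
        body = urllib.parse.urlencode({'user': 'abc', 'password': '123'})
        yield Request(
            url='http://example.com/login',  # placeholder URL
            method='POST',
            headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
            cookies=self.cookie_dict,
            body=body,
            callback=self.parse_login_result,
        )

    def parse_login_result(self, response):
        print(response.text)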
VIII. Persistence in Scrapy
# In the spider's parse method, write code like this
from <path to items.py> import <ItemClass>
yield <ItemClass>(field1=value1, ......)  # formats the data and hands the item to the pipelines; the keyword arguments must exactly match the fields defined in the Item class, no extras and none missing
# In the items.py class, write:
class Sp1Item(scrapy.Item):
    # the fields defined here are the only ones the item accepts
    url = scrapy.Field()
    text = scrapy.Field()
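For instance, a spider's parse method could build Sp1Item objects like this (a minimal sketch; the sp1 package path and the example spider are assumed):
import scrapy
from sp1.items import Sp1Item

class Sp1Spider(scrapy.Spider):
    name = 'sp1_demo'
    start_urls = ['http://example.com/']

    def parse(self, response):
        for a in response.xpath('//a'):
            # each yielded item is handed to every configured pipeline, in priority order
            yield Sp1Item(url=a.xpath('./@href').extract_first(),
                          text=a.xpath('./text()').extract_first())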
# In the pipelines.py classes, write code like the following. You can define several pipeline classes; the priorities in the settings file decide the order in which they run. Apart from process_item, every method is called only once.
from scrapy.exceptions import DropItem  # needed if you raise DropItem below

class Sp3Pipeline(object):
    def __init__(self):
        self.f = None

    def process_item(self, item, spider):
        """
        Called over and over while the spider runs
        :param item: the object yielded from the spider
        :param spider: the spider object, e.g. obj = JianDanSpider()
        :return:
        """
        if spider.name == '':  # do something only for a particular spider
            pass
        print(item)
        self.f.write('....')
        return item  # pass the item on to the next pipeline's process_item
        # raise DropItem()  # alternatively, raise DropItem() to stop later pipelines from processing this item

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called at start-up to create the pipeline object; if this method is missing, __init__ is called directly instead
        :param crawler: carries all the information about the whole crawler
        :return:
        """
        # val = crawler.settings.get('MMMM')
        print('from_crawler of the pipeline: creating the instance')
        return cls()

    def open_spider(self, spider):
        """
        Called when the spider starts
        :param spider:
        :return:
        """
        print('spider opened')
        self.f = open('a.log', 'a+')

    def close_spider(self, spider):
        """
        Called when the spider closes
        :param spider:
        :return:
        """
        self.f.close()
IX. Custom Dedup Rules
# In the package that shares the project's name, create a dedup-rule file with a class like the one below, and point DUPEFILTER_CLASS in the settings file at it
class RepeatUrl:
    def __init__(self):
        self.visited_url = set()  # kept in the memory of the current process

    @classmethod
    def from_settings(cls, settings):
        """
        Called at start-up; gives access to the parameters in the settings file. If this method is missing, __init__ is called directly.
        :param settings:
        :return:
        """
        return cls()

    def request_seen(self, request):
        """
        Check whether the current request has already been visited
        :param request:
        :return: True means already visited; False means not visited yet
        """
        if request.url in self.visited_url:
            return True  # when request_seen returns True the URL is not put into the scheduler again
        self.visited_url.add(request.url)
        return False  # the request goes into the scheduler

    def open(self):
        """
        Called when crawling starts
        :return:
        """
        print('open replication')

    def close(self, reason):
        """
        Called when the crawl finishes
        :param reason:
        :return:
        """
        print('close replication')

    def log(self, request, spider):
        """
        Log duplicate requests
        :param request:
        :param spider:
        :return:
        """
        print('repeat', request.url)
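To switch the rule on, point DUPEFILTER_CLASS at the class; assuming the file above is saved as duplication.py inside the sp1 package (a made-up location for illustration):
# settings.py
DUPEFILTER_CLASS = 'sp1.duplication.RepeatUrl'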
X. Custom Extensions Based on Scrapy Signals
# In the package that shares the project's name, create a file for custom extensions; you can define several classes
from scrapy import signals

class MyExtension(object):
    def __init__(self, value):
        self.value = value

    @classmethod
    def from_crawler(cls, crawler):
        val = crawler.settings.getint('MMMM')
        ext = cls(val)
        # register a handler for the spider_opened signal:
        # connect(<function to run when the signal fires>, <signal that triggers it>)
        crawler.signals.connect(ext.opened, signal=signals.spider_opened)
        # register a handler for the spider_closed signal
        crawler.signals.connect(ext.closed, signal=signals.spider_closed)
        return ext

    def opened(self, spider):
        print('open')

    def closed(self, spider):
        print('close')
'''
Built-in Scrapy signals
engine_started        the engine has started
engine_stopped        the engine has stopped
spider_opened         a spider has been opened
spider_idle           a spider has gone idle
spider_closed         a spider has been closed
spider_error          a spider raised an error
request_scheduled     a request has been scheduled
request_dropped       a request was dropped by the scheduler
response_received     a response has been received
response_downloaded   a response has been downloaded
item_scraped          an item has been scraped (persistence is invoked)
item_dropped          an item was dropped (later process_item calls are skipped)
'''
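To enable the extension, register it in the settings file; assuming the class above lives in extensions.py inside the sp1 package (an assumed path), with the priority value taken from the appendix below:
# settings.py
EXTENSIONS = {
    'sp1.extensions.MyExtension': 500,
}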
XI. Middleware
# Spider middleware
from scrapy import signals

class SpiderMiddleware(object):
    def process_spider_input(self, response, spider):
        """
        Called once the download has finished, before the response is handed to parse
        """
        pass

    def process_spider_output(self, response, result, spider):
        """
        Called when the spider has finished processing and returns its results to the engine
        :return: must return an iterable of Request or Item objects
        """
        return result

    def process_spider_exception(self, response, exception, spider):
        """
        Called on exceptions
        :return: None to let later middleware keep handling the exception; or an iterable
                 of Response or Item objects, which goes to the scheduler or the pipelines
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """
        Called when the spider starts
        :return: an iterable of Request objects
        """
        return start_requests
# Downloader middleware
class DownMiddleware1(object):
    def process_request(self, request, spider):
        """
        Called by every downloader middleware's process_request when a request is about to be
        downloaded; the place to plug in a custom downloader or to set a proxy.
        Return None to let the remaining middleware carry on downloading;
        a Response object to stop further process_request calls and jump to process_response;
        a Request object to stop the middleware chain and put the Request back into the scheduler;
        or raise IgnoreRequest to stop process_request and jump to process_exception.
        """
        """
        from scrapy.http import Request
        # request.method = "POST"
        request.headers['proxy'] = "{'ip_port': '111.11.228.75:80', 'user_pass': ''},"
        return None
        """
        """
        from scrapy.http import Response
        import requests
        v = requests.get('http://www.baidu.com')
        data = Response(url='xxxxxxxx', body=v.content, request=request)
        return data
        """

    def process_response(self, request, response, spider):
        """
        Called when the downloaded response comes back, before it reaches the spider; the place
        to modify the response object.
        Return a Response object to hand it on to the other middlewares' process_response;
        a Request object to stop the middleware chain and have the request rescheduled for download;
        or raise IgnoreRequest to have Request.errback called.
        """
        print('response1')
        # from scrapy.http import Response
        # response.encoding = 'utf-8'
        return response

    def process_exception(self, request, exception, spider):
        """
        Called when the download handler or a downloader middleware's process_request() raises an exception.
        Return None to let later middleware keep handling the exception;
        a Response object to stop further process_exception calls;
        or a Request object to stop the middleware chain and have the request rescheduled for download.
        """
        return None
# A custom downloader middleware that sets a proxy
import random
import base64
import six

def to_bytes(text, encoding=None, errors='strict'):
    if isinstance(text, bytes):
        return text
    if not isinstance(text, six.string_types):
        raise TypeError('to_bytes must receive a unicode, str or bytes '
                        'object, got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)

class ProxyMiddleware(object):
    def process_request(self, request, spider):
        PROXIES = [
            {'ip_port': '111.11.228.75:80', 'user_pass': ''},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
            {'ip_port': '122.96.59.104:80', 'user_pass': ''},
            {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
        ]
        proxy = random.choice(PROXIES)
        if proxy['user_pass'] is not None:
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
            request.headers['Proxy-Authorization'] = to_bytes('Basic ') + encoded_user_pass
            print("**************ProxyMiddleware have pass************" + proxy['ip_port'])
        else:
            print("**************ProxyMiddleware no pass************" + proxy['ip_port'])
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
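These middlewares only run once they are registered in the settings file; assuming they live in middlewares.py inside the sp1 package (an assumed path), the registration would look roughly like this:
# settings.py
SPIDER_MIDDLEWARES = {
    'sp1.middlewares.SpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'sp1.middlewares.DownMiddleware1': 100,
    'sp1.middlewares.ProxyMiddleware': 500,
}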
XII. Custom Commands
'''
Inside the package that shares the project's name, create a directory, and inside it a Python file; the file name becomes the command name. The code below implements a command that runs every spider at once. See the settings snippet after the code for how to register the command.
'''
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings

class Command(ScrapyCommand):
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        from scrapy.crawler import CrawlerProcess
        from scrapy.core.engine import ExecutionEngine
        from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware
        import os
        # os.environ['http_proxy'] = "http://root:woshiniba@192.168.11.11:9999/"
        # os.environ['https_proxy'] = "http://192.168.11.11:9999/"
        # os.environ['xx_proxy'] = "http://192.168.11.11:9999/"
        # list of all spiders in the project
        spider_list = self.crawler_process.spiders.list()
        for name in spider_list:
            # set up each spider
            self.crawler_process.crawl(name, **opts.__dict__)
        # run all the spiders
        self.crawler_process.start()
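Assuming the project is named sp1, the new directory is called commands, and the file above is saved as crawlall.py (names chosen here purely for illustration), registration looks like this:
# settings.py
COMMANDS_MODULE = 'sp1.commands'
After that, scrapy crawlall (i.e. the file name) can be run from the project directory.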
XIII. HTTPS Access in Scrapy
# To use the default behaviour, add the following to the settings file
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"
# To customise it
# settings file
DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"
# create this file
# https.py
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)

class MySSLFactory(ScrapyClientContextFactory):
    def getCertificateOptions(self):
        from OpenSSL import crypto
        v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
        v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
        return CertificateOptions(
            privateKey=v1,  # a PKey object
            certificate=v2,  # an X509 object
            verify=False,
            method=getattr(self, 'method', getattr(self, '_ssl_method', None))
        )
XIV. TinyScrapy
A stripped-down imitation of Scrapy: a Request/Response pair, a spider, a queue playing the role of the scheduler, and a Twisted-based engine that downloads up to five pages concurrently.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import types
from twisted.internet import defer
from twisted.web.client import getPage
from twisted.internet import reactor


class Request(object):
    def __init__(self, url, callback):
        self.url = url
        self.callback = callback
        self.priority = 0


class HttpResponse(object):
    def __init__(self, content, request):
        self.content = content
        self.request = request


class ChouTiSpider(object):
    def start_requests(self):
        url_list = ['http://www.cnblogs.com/', 'http://www.bing.com']
        for url in url_list:
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        print(response.request.url)
        # yield Request(url="http://www.baidu.com", callback=self.parse)


# the queue acts as the scheduler
from queue import Queue
Q = Queue()


class CallLaterOnce(object):
    """Schedule a function to run on the reactor, with at most one pending call at a time."""
    def __init__(self, func, *a, **kw):
        self._func = func
        self._a = a
        self._kw = kw
        self._call = None

    def schedule(self, delay=0):
        if self._call is None:
            self._call = reactor.callLater(delay, self)

    def cancel(self):
        if self._call:
            self._call.cancel()

    def __call__(self):
        self._call = None
        return self._func(*self._a, **self._kw)


class Engine(object):
    def __init__(self):
        self.nextcall = None
        self.crawlling = []   # downloads currently in flight
        self.max = 5          # maximum number of concurrent downloads
        self._closewait = None

    def get_response(self, content, request):
        # wrap the downloaded content, call the spider callback, and queue any new requests it yields
        response = HttpResponse(content, request)
        gen = request.callback(response)
        if isinstance(gen, types.GeneratorType):
            for req in gen:
                req.priority = request.priority + 1
                Q.put(req)

    def rm_crawlling(self, response, d):
        self.crawlling.remove(d)

    def _next_request(self, spider):
        # stop when the queue is empty and nothing is being downloaded
        if Q.qsize() == 0 and len(self.crawlling) == 0:
            self._closewait.callback(None)
        if len(self.crawlling) >= 5:
            return
        while len(self.crawlling) < 5:
            try:
                req = Q.get(block=False)
            except Exception as e:
                req = None
            if not req:
                return
            d = getPage(req.url.encode('utf-8'))
            self.crawlling.append(d)
            d.addCallback(self.get_response, req)
            d.addCallback(self.rm_crawlling, d)
            d.addCallback(lambda _: self.nextcall.schedule())

    @defer.inlineCallbacks
    def crawl(self):
        # seed the queue with the spider's start requests, then keep scheduling _next_request
        spider = ChouTiSpider()
        start_requests = iter(spider.start_requests())
        flag = True
        while flag:
            try:
                req = next(start_requests)
                Q.put(req)
            except StopIteration as e:
                flag = False
        self.nextcall = CallLaterOnce(self._next_request, spider)
        self.nextcall.schedule()
        self._closewait = defer.Deferred()
        yield self._closewait

    @defer.inlineCallbacks
    def pp(self):
        yield self.crawl()


_active = set()
obj = Engine()
d = obj.crawl()
_active.add(d)
li = defer.DeferredList(_active)
li.addBoth(lambda _, *a, **kw: reactor.stop())
reactor.run()
Appendix: The Settings File in Detail
# -*- coding: utf-8 -*-

# Scrapy settings for step8_king project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

# 1. crawler name
BOT_NAME = 'step8_king'

# 2. paths of the spider modules
SPIDER_MODULES = ['step8_king.spiders']
NEWSPIDER_MODULE = 'step8_king.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# 3. client User-Agent request header
# USER_AGENT = 'step8_king (+http://www.yourdomain.com)'

# Obey robots.txt rules
# 4. whether to obey the robots.txt rules
# ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# 5. number of concurrent requests
# CONCURRENT_REQUESTS = 4

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# 6. download delay, in seconds
# DOWNLOAD_DELAY = 2

# The download delay setting will honor only one of:
# 7. concurrent requests per domain; the download delay is also applied per domain
# CONCURRENT_REQUESTS_PER_DOMAIN = 2
# concurrent requests per IP; if set, CONCURRENT_REQUESTS_PER_DOMAIN is ignored
# and the download delay is applied per IP instead
# CONCURRENT_REQUESTS_PER_IP = 3

# Disable cookies (enabled by default)
# 8. whether cookies are enabled; cookies are handled via cookiejar
# COOKIES_ENABLED = True
# COOKIES_DEBUG = True

# Disable Telnet Console (enabled by default)
# 9. the Telnet console can be used to inspect and control the running crawler:
#    connect with `telnet <ip> <port>` and then issue commands
# TELNETCONSOLE_ENABLED = True
# TELNETCONSOLE_HOST = '127.0.0.1'
# TELNETCONSOLE_PORT = [6023,]

# 10. default request headers
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# 11. pipelines that process the yielded items
# ITEM_PIPELINES = {
#    'step8_king.pipelines.JsonPipeline': 700,
#    'step8_king.pipelines.FilePipeline': 500,
# }

# 12. custom extensions, driven by signals
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     # 'step8_king.extensions.MyExtension': 500,
# }

# 13. maximum crawl depth; the current depth can be read from meta; 0 means unlimited
# DEPTH_LIMIT = 3

# 14. crawl order: 0 means depth-first LIFO (the default); 1 means breadth-first FIFO
# last in, first out: depth-first
# DEPTH_PRIORITY = 0
# SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'
# first in, first out: breadth-first
# DEPTH_PRIORITY = 1
# SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.FifoMemoryQueue'

# 15. scheduler queue
# SCHEDULER = 'scrapy.core.scheduler.Scheduler'
# from scrapy.core.scheduler import Scheduler

# 16. URL dedup
# DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl'

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html

"""
17. the auto-throttle algorithm
    from scrapy.contrib.throttle import AutoThrottle
    how the automatic throttling is computed:
    1. read the minimum delay           DOWNLOAD_DELAY
    2. read the maximum delay           AUTOTHROTTLE_MAX_DELAY
    3. set the initial download delay   AUTOTHROTTLE_START_DELAY
    4. when a request finishes downloading, take its "latency", i.e. the time from
       opening the connection to receiving the response headers
    5. combine it with AUTOTHROTTLE_TARGET_CONCURRENCY:
        target_delay = latency / self.target_concurrency
        new_delay = (slot.delay + target_delay) / 2.0  # slot.delay is the previous delay
        new_delay = max(target_delay, new_delay)
        new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
        slot.delay = new_delay
"""

# enable auto-throttling
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 10
# The average number of requests Scrapy should be sending in parallel to each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = True

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
"""
18. enable the HTTP cache
    caches requests/responses that have already been sent so they can be reused later
    from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
    from scrapy.extensions.httpcache import DummyPolicy
    from scrapy.extensions.httpcache import FilesystemCacheStorage
"""
# whether the cache is enabled
# HTTPCACHE_ENABLED = True

# cache policy: cache every request; identical requests are then served straight from the cache
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
# cache policy: cache according to HTTP response headers such as Cache-Control and Last-Modified
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"

# cache expiry, in seconds
# HTTPCACHE_EXPIRATION_SECS = 0

# cache directory
# HTTPCACHE_DIR = 'httpcache'

# HTTP status codes that are never cached
# HTTPCACHE_IGNORE_HTTP_CODES = []

# cache storage backend
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


"""
19. proxies; they are picked up from environment variables
    from scrapy.contrib.downloadermiddleware.httpproxy import HttpProxyMiddleware

    option one: use the built-in middleware
        os.environ
        {
            http_proxy:http://root:woshiniba@192.168.11.11:9999/
            https_proxy:http://192.168.11.11:9999/
        }
    option two: use a custom downloader middleware
        def to_bytes(text, encoding=None, errors='strict'):
            if isinstance(text, bytes):
                return text
            if not isinstance(text, six.string_types):
                raise TypeError('to_bytes must receive a unicode, str or bytes '
                                'object, got %s' % type(text).__name__)
            if encoding is None:
                encoding = 'utf-8'
            return text.encode(encoding, errors)

        class ProxyMiddleware(object):
            def process_request(self, request, spider):
                PROXIES = [
                    {'ip_port': '111.11.228.75:80', 'user_pass': ''},
                    {'ip_port': '120.198.243.22:80', 'user_pass': ''},
                    {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
                    {'ip_port': '101.71.27.120:80', 'user_pass': ''},
                    {'ip_port': '122.96.59.104:80', 'user_pass': ''},
                    {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
                ]
                proxy = random.choice(PROXIES)
                if proxy['user_pass'] is not None:
                    request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
                    encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass']))
                    request.headers['Proxy-Authorization'] = to_bytes('Basic ') + encoded_user_pass
                    print("**************ProxyMiddleware have pass************" + proxy['ip_port'])
                else:
                    print("**************ProxyMiddleware no pass************" + proxy['ip_port'])
                    request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])

        DOWNLOADER_MIDDLEWARES = {
           'step8_king.middlewares.ProxyMiddleware': 500,
        }
"""

"""
20. HTTPS access
    there are two cases:
    1. the target site uses a trusted certificate (supported by default)
        DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
        DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"
    2. the target site uses a custom certificate
        DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
        DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"

        # https.py
        from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
        from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)

        class MySSLFactory(ScrapyClientContextFactory):
            def getCertificateOptions(self):
                from OpenSSL import crypto
                v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
                v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
                return CertificateOptions(
                    privateKey=v1,  # a PKey object
                    certificate=v2,  # an X509 object
                    verify=False,
                    method=getattr(self, 'method', getattr(self, '_ssl_method', None))
                )
    other:
        related classes
            scrapy.core.downloader.handlers.http.HttpDownloadHandler
            scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
            scrapy.core.downloader.contextfactory.ScrapyClientContextFactory
        related settings
            DOWNLOADER_HTTPCLIENTFACTORY
            DOWNLOADER_CLIENTCONTEXTFACTORY
"""

"""
21. spider middleware
    class SpiderMiddleware(object):

        def process_spider_input(self, response, spider):
            '''
            called once the download has finished, before the response is handed to parse
            :param response:
            :param spider:
            :return:
            '''
            pass

        def process_spider_output(self, response, result, spider):
            '''
            called when the spider has finished processing and returns its results
            :param response:
            :param result:
            :param spider:
            :return: must return an iterable of Request or Item objects
            '''
            return result

        def process_spider_exception(self, response, exception, spider):
            '''
            called on exceptions
            :param response:
            :param exception:
            :param spider:
            :return: None to let later middleware keep handling the exception;
                     or an iterable of Response or Item objects, which goes to the scheduler or the pipelines
            '''
            return None

        def process_start_requests(self, start_requests, spider):
            '''
            called when the spider starts
            :param start_requests:
            :param spider:
            :return: an iterable of Request objects
            '''
            return start_requests

    built-in spider middleware:
        'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
        'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
        'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
        'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
        'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': 900,
"""
# from scrapy.contrib.spidermiddleware.referer import RefererMiddleware
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
   # 'step8_king.middlewares.SpiderMiddleware': 543,
}

"""
22. downloader middleware
    class DownMiddleware1(object):
        def process_request(self, request, spider):
            '''
            called by every downloader middleware's process_request when a request is about to be downloaded
            :param request:
            :param spider:
            :return:
                None: let the remaining middleware carry on downloading
                Response object: stop further process_request calls and jump to process_response
                Request object: stop the middleware chain and put the Request back into the scheduler
                raise IgnoreRequest: stop process_request and jump to process_exception
            '''
            pass

        def process_response(self, request, response, spider):
            '''
            called when the downloaded response comes back, before it reaches the spider
            :param response:
            :param result:
            :param spider:
            :return:
                Response object: hand it on to the other middlewares' process_response
                Request object: stop the middleware chain; the request is rescheduled for download
                raise IgnoreRequest: Request.errback is called
            '''
            print('response1')
            return response

        def process_exception(self, request, exception, spider):
            '''
            called when the download handler or a downloader middleware's process_request() raises an exception
            :param response:
            :param exception:
            :param spider:
            :return:
                None: let later middleware keep handling the exception
                Response object: stop further process_exception calls
                Request object: stop the middleware chain; the request is rescheduled for download
            '''
            return None

    default downloader middleware
    {
        'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
        'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
        'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
        'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
        'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
        'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
        'scrapy.contrib.downloadermiddleware.redirect.MetaRefreshMiddleware': 580,
        'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 590,
        'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
        'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
        'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
        'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
        'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
        'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
    }
"""
# from scrapy.contrib.downloadermiddleware.httpauth import HttpAuthMiddleware
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'step8_king.middlewares.DownMiddleware1': 100,
#    'step8_king.middlewares.DownMiddleware2': 500,
# }
