1. Installation (CentOS 7: https://www.jb51.net/article/136478.htm; on Windows:)
a. pip3 install wheel
b. pip3 install Twisted
c. pip3 install pywin32
d. pip3 install scrapy
Verify that the installation succeeded:
C:\Users\12164>scrapy
Scrapy 1.8.0 - no active project
Usage:
scrapy <command> [options] [args]
Available commands:
bench Run quick benchmark test
fetch Fetch a URL using the Scrapy downloader
genspider Generate new spider using pre-defined templates
runspider Run a self-contained spider (without creating a project)
settings Get settings values
shell Interactive scraping console
startproject Create new project # create a new scraping project
version Print Scrapy version
view Open URL in browser, as seen by Scrapy
crawl # runs a spider; only available when executed from inside a project directory
[ more ] More commands available when run from project directory
Use "scrapy <command> -h" to see more info about a command
2. Creating a project
a. Open cmd, change to the directory where the project should live, and run: scrapy startproject chouti
New Scrapy project 'chouti', using template directory 'd:\python37\lib\site-packages\scrapy\templates\project', created in:
E:\python\chouti
You can start your first spider with:
cd chouti
scrapy genspider example example.com
b. cd chouti
c. scrapy genspider get_chouti chouti.com # "Cannot create a spider with the same name as your project" -- the spider name must not repeat the project name
Created spider 'get_chouti' using template 'basic' in module:
chouti.spiders.get_chouti
d. Find the spider created under that name and edit it
chouti
|____ chouti
|     |____ spiders
|     |     |____ __pycache__
|     |     |____ __init__.py
|     |     |____ get_chouti.py      # the spider we just generated
|     |____ __pycache__
|     |____ __init__.py
|     |____ items.py                 # Item definitions that structure the scraped data
|     |____ middlewares.py
|     |____ pipelines.py             # persistence: items received from the spider are saved locally here
|     |____ settings.py              # configuration file
|____ scrapy.cfg
e. scrapy crawl <name> --nolog (--nolog suppresses the log output)
#### If the run prints nothing even though the spider contains print statements, check the log: a 403 error means the site rejected the request -- open settings.py and configure USER_AGENT.
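For example, the following line in settings.py usually fixes the 403 (the UA string is just one browser user agent; it is the same one used in the spider below):
    # settings.py -- send a browser-like User-Agent instead of the default Scrapy one
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'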
Example: scrape the first 25 books of the Douban Books Top 250
get_chouti.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from scrapy.http import Request
from ..items import ChoutiItem
import time
class GetChoutiSpider(scrapy.Spider):
    name = 'get_chouti'
    allowed_domains = ['douban.com']  # the allowed top-level domain; every later request must stay inside it (set dont_filter=True on a Request to bypass the filter)
    start_urls = ['https://book.douban.com/top250?icn=index-book250-all']

    # def start_requests(self):
    #     for url in self.start_urls:
    #         yield Request(url, callback=self.parse)

    def parse(self, response):  # parse is the callback used by default; override start_requests to change it
        table_hxs = Selector(response).xpath('//div[@class="article"]/div[@class="indent"]/table')  # selector
        # extract the title, author information, etc.
        for one_table in table_hxs:
            book_item = ChoutiItem()
            title = one_table.xpath('.//a/@title').extract()[0]
            info = one_table.xpath('.//p[@class="pl"]/text()').extract()[0]
            # print(title)
            book_item["title"] = title
            book_item["info"] = info
            # fetch the detail page of the book
            time.sleep(10)  # Douban throttles frequent requests (a blocking sleep works, but DOWNLOAD_DELAY in settings is the usual way)
            url = one_table.xpath('.//a[re:test(@href, "https://book.douban.com/subject/\d+")]/@href').extract()
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
            # hand the link to the scheduler
            yield Request(url=url[0], method="GET", headers=headers, callback=self.get_book_msg, meta={"book_item": book_item})
    def get_book_msg(self, response, *args, **kwargs):
        book_item_used = response.meta["book_item"]
        # extract the book introduction and the author introduction
        info_hxs = Selector(response).xpath("//div[@class='related_info']//div[re:test(@class,'indent*')]")
        content_hxs = info_hxs[0].xpath(".//div[@class='intro']/p/text()").extract()
        content = ''
        for i in content_hxs:
            if i not in content:   # skip paragraphs that are repeated on the page
                content += i
        book_item_used['introduction'] = content
        auth_hxs = info_hxs[1].xpath(".//div[@class='intro']/p/text()").extract()
        auth_info = ''
        for i in auth_hxs:
            if i not in auth_info:
                auth_info += i
        book_item_used['author'] = auth_info
        yield book_item_used
        # yielding the item hands it to the pipeline; the pipeline has to be registered in settings.py:
        # ITEM_PIPELINES = {
        #     'chouti.pipelines.ChoutiPipeline': 300,   # 300 is the priority
        # }
        print(response)  # debug output
items.py
import scrapy
class ChoutiItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    info = scrapy.Field()
    introduction = scrapy.Field()
    author = scrapy.Field()
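Item instances behave like dicts, which is how the spider above fills them and how the pipeline below reads them; a quick standalone illustration (the values are made up):
    item = ChoutiItem()
    item['title'] = 'Example Title'        # unknown field names raise KeyError
    item['info'] = 'Author / Publisher / 2020'
    print(dict(item))                      # {'title': 'Example Title', 'info': 'Author / Publisher / 2020'}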
pipelines.py
class ChoutiPipeline(object):
    def process_item(self, item, spider):
        book = "Title: %s\r\nInfo: %s\r\nIntroduction:\r\n%s\r\nAuthor:\r\n%s\r\n\r\n" % (item["title"], item["info"], item["introduction"], item["author"])
        # print(book)
        with open("new.json", "a", encoding="utf-8") as f:
            f.write(book)
        return item
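Note that despite the .json file name, what gets written above is plain text. If actual JSON output is wanted, a minimal sketch (the class and file names are made up; it would also need to be registered in ITEM_PIPELINES) could serialize one object per line:
    import json

    class JsonLinesPipeline(object):
        def process_item(self, item, spider):
            # one JSON object per line; ensure_ascii=False keeps Chinese text readable
            with open("books.jsonl", "a", encoding="utf-8") as f:
                f.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
            return item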
# Custom commands
"""
a. Create a directory named commands next to the spiders directory (give it an empty __init__.py so it is importable)
b. Inside it create crawlall.py -- the file name becomes the command name
from scrapy.commands import ScrapyCommand
from scrapy.utils.project import get_project_settings

class Command(ScrapyCommand):
    '''Run every spider in the project'''
    requires_project = True

    def syntax(self):
        return '[options]'

    def short_desc(self):
        return 'Runs all of the spiders'

    def run(self, args, opts):
        spider_list = self.crawler_process.spiders.list()
        for name in spider_list:
            self.crawler_process.crawl(name, **opts.__dict__)
        self.crawler_process.start()
c. In settings configure COMMANDS_MODULE = '<project name>.<directory name>'
"""
# Pipelines
"""
The pipeline above opens and closes the file every time an item is persisted; instead, open the file once when the spider starts and close it when the spider finishes.
class ChoutiPipeline(object):
    def __init__(self, crawler):
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):  # crawler.settings.get() can read any variable from the settings file; the names must be upper-case
        return cls(crawler)

    def open_spider(self, spider):
        self.f = open(name, mode)

    def close_spider(self, spider):
        self.f.close()

    def process_item(self, item, spider):
        self.f.write(data)
        # return item -> passes the item on to the pipeline with the next priority after this one
        # raise DropItem() (from scrapy.exceptions import DropItem) -> raise DropItem if the item should not be passed any further
"""
# HTTPS access
"""
There are two cases when crawling HTTPS sites:
1. The site uses a certificate from a trusted CA (supported by default): the server buys a certificate whose CA is already shipped with the client, and the returned certificate is verified against it.
    DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
    DOWNLOADER_CLIENTCONTEXTFACTORY = "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"
2. The site uses a custom / self-signed certificate:
    DOWNLOADER_HTTPCLIENTFACTORY = "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
    DOWNLOADER_CLIENTCONTEXTFACTORY = "step8_king.https.MySSLFactory"
    # https.py
    from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
    from twisted.internet.ssl import (optionsForClientTLS, CertificateOptions, PrivateCertificate)

    class MySSLFactory(ScrapyClientContextFactory):
        def getCertificateOptions(self):
            from OpenSSL import crypto
            v1 = crypto.load_privatekey(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.key.unsecure', mode='r').read())
            v2 = crypto.load_certificate(crypto.FILETYPE_PEM, open('/Users/wupeiqi/client.pem', mode='r').read())
            return CertificateOptions(
                privateKey=v1,   # PKey object
                certificate=v2,  # X509 object
                verify=False,
                method=getattr(self, 'method', getattr(self, '_ssl_method', None))
            )
Related classes:
    scrapy.core.downloader.handlers.http.HttpDownloadHandler
    scrapy.core.downloader.webclient.ScrapyHTTPClientFactory
    scrapy.core.downloader.contextfactory.ScrapyClientContextFactory
Related settings:
    DOWNLOADER_HTTPCLIENTFACTORY
    DOWNLOADER_CLIENTCONTEXTFACTORY
"""
# Selectors
"""
//                   # anywhere among descendants
.//                  # among the descendants of the current node
/                    # direct children
/div                 # div tags among the children
/div[@id="i1"]       # div children whose id is i1
obj.extract()        # converts every object in the list to a string => a list
obj.extract_first()  # converts to strings and returns the first element of the list
//div/text()         # the text of a tag
//a[@id='top']/@href              # the a tag with id top, then the value of its href attribute
//div[re:test(@class,'indent*')]  # match with a regular expression
# from scrapy.selector import Selector                                                  # import the module
# hxs = Selector(response)                                                              # wrap the response in a Selector object
# hxs = Selector(response=response).xpath('//a')                                        # all a tags among the descendants
# hxs = Selector(response=response).xpath('//a[2]')                                     # the second a tag (XPath indexes start at 1)
# hxs = Selector(response=response).xpath('//a[@id]')                                   # a tags that have an id attribute
# hxs = Selector(response=response).xpath('//a[@id="i1"]')                              # a tags whose id is i1
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')           # match several attributes at once
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')               # a tags whose href contains "link"
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')            # a tags whose href starts with "link"
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]')                  # regex match on the id attribute
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/text()').extract() # convert the matched objects to strings
# hxs = Selector(response=response).xpath('//a[re:test(@id, "i\d+")]/@href').extract()  # /@href takes the attribute value
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()       # take the first element of the list
"""
# settings.py configuration
"""
DEPTH_LIMIT = 2   # maximum crawl depth
DUPEFILTER_CLASS = "chouti.dupefilter.Repeatfilter"   # use a custom duplicate-filter class
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'   # pretend to be a browser
ITEM_PIPELINES = {
    'chouti.pipelines.ChoutiPipeline': 300,   # 300 is the priority; an item passes through every registered pipeline
}   # the priority decides the order in which the pipelines run; whether an item keeps going can be controlled (return item / raise DropItem)
EXTENSIONS = {   # hook extra behaviour onto signals
    'scrapy.extensions.telnet.TelnetConsole': None,
}
#===> Part 1: basic settings <===
#1. Project name; the default USER_AGENT is built from it and it is also used as the log name
BOT_NAME = 'Amazon'
#2. Where the spider modules live
SPIDER_MODULES = ['Amazon.spiders']
NEWSPIDER_MODULE = 'Amazon.spiders'
#3. Client User-Agent request header
#USER_AGENT = 'Amazon (+http://www.yourdomain.com)'
#4. Whether to obey robots.txt rules
ROBOTSTXT_OBEY = False
#5. Whether cookies are enabled (handled via cookiejar); enabled by default
#COOKIES_ENABLED = False
#6. The Telnet console lets you inspect and control the running crawler: telnet <ip> <port>, then issue commands
#TELNETCONSOLE_ENABLED = False
#TELNETCONSOLE_HOST = '127.0.0.1'
#TELNETCONSOLE_PORT = [6023,]
#7. Default headers Scrapy sends with its HTTP requests
#DEFAULT_REQUEST_HEADERS = {
#  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#  'Accept-Language': 'en',
#}
#===> Part 2: concurrency and delays <===
#1. Maximum number of concurrent requests the downloader handles in total; default 16
#CONCURRENT_REQUESTS = 32
#2. Maximum number of concurrent requests per domain; default 8
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#3. Maximum number of concurrent requests per IP; default 0, which means no limit. Two things to note:
#   I.  If non-zero, CONCURRENT_REQUESTS_PER_DOMAIN is ignored -- concurrency is limited per IP rather than per domain
#   II. It also affects DOWNLOAD_DELAY: if non-zero, the delay is applied per IP rather than per domain
#CONCURRENT_REQUESTS_PER_IP = 16
#4. With auto-throttling disabled this is a fixed value: the number of seconds to wait between requests to the same site
#DOWNLOAD_DELAY = 3
#===> Part 3: auto-throttling (the AutoThrottle extension) <===
# I. What it is
#    (implemented by scrapy.extensions.throttle.AutoThrottle; docs: http://scrapy.readthedocs.io/en/latest/topics/autothrottle.html#topics-autothrottle)
#    Goals:
#    1. Be nicer to sites than the fixed default download delay
#    2. Automatically adjust Scrapy to the optimal crawl speed, so the user never has to tune the delay by hand;
#       the user only defines the maximum allowed concurrency and the extension does the rest
# II. How does it work?
#    Scrapy measures download latency as the time between establishing the TCP connection and receiving the HTTP headers.
#    Note that in a cooperative multitasking environment these latencies are hard to measure accurately, because Scrapy may be
#    busy running spider callbacks or simply unable to download; still, they are a reasonable indication of how busy Scrapy
#    (and the server) is, and the extension is built on that premise.
# III. The throttling algorithm adjusts the download delay according to the following rules:
#    1. Spiders start with a download delay of AUTOTHROTTLE_START_DELAY
#    2. When a response arrives, the target delay for the site = latency of that response / AUTOTHROTTLE_TARGET_CONCURRENCY
#    3. The download delay for the next request is set to the average of the target delay and the previous delay
#    4. Latencies of non-200 responses are not allowed to decrease the delay
#    5. The delay can never drop below DOWNLOAD_DELAY or rise above AUTOTHROTTLE_MAX_DELAY
# IV. Configuration
# enable it (True; default is False)
AUTOTHROTTLE_ENABLED = True
# initial delay
AUTOTHROTTLE_START_DELAY = 5
# minimum delay
DOWNLOAD_DELAY = 3
# maximum delay
AUTOTHROTTLE_MAX_DELAY = 10
# Average number of requests to send in parallel; must not exceed CONCURRENT_REQUESTS_PER_DOMAIN or CONCURRENT_REQUESTS_PER_IP.
# Raising it increases throughput and hammers the target site harder; lowering it is more "polite".
# At any given moment the actual concurrency may be above or below this value; it is a target the crawler tries to reach, not a hard limit.
AUTOTHROTTLE_TARGET_CONCURRENCY = 16.0
# debugging output
AUTOTHROTTLE_DEBUG = True
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16
#===> Part 4: crawl depth and crawl order <===
#1. Maximum depth the crawler may reach; the current depth is available via response.meta['depth']; 0 means no limit
# DEPTH_LIMIT = 3
#2. Crawl order: 0 means depth-first / LIFO (the default); 1 means breadth-first / FIFO
# last in, first out -- depth-first
# DEPTH_PRIORITY = 0
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
# first in, first out -- breadth-first
# DEPTH_PRIORITY = 1
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
#3. The scheduler
# SCHEDULER = 'scrapy.core.scheduler.Scheduler'
# from scrapy.core.scheduler import Scheduler
#4. URL de-duplication
# DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl'
#===> Part 5: middlewares, pipelines, extensions <===
#1. Enable or disable spider middlewares
#   See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'Amazon.middlewares.AmazonSpiderMiddleware': 543,
#}
#2. Enable or disable downloader middlewares
#   See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'Amazon.middlewares.DownMiddleware1': 543,
}
#3. Enable or disable extensions
#   See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}
#4. Configure item pipelines
#   See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'Amazon.pipelines.CustomPipeline': 200,
}
#===> Part 6: HTTP cache <===
"""
1. Enabling the cache
   Caches requests and responses that have already been sent, so they can be served from the cache later.
   from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
   from scrapy.extensions.httpcache import DummyPolicy
   from scrapy.extensions.httpcache import FilesystemCacheStorage
"""
# Enable the HTTP cache
# HTTPCACHE_ENABLED = True
# Cache policy: cache every request; repeated requests are answered straight from the cache
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
# Cache policy: cache according to HTTP response headers such as Cache-Control and Last-Modified
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"
# Cache expiration time in seconds (0 = cached responses never expire)
# HTTPCACHE_EXPIRATION_SECS = 0
# Cache directory
# HTTPCACHE_DIR = 'httpcache'
# HTTP status codes that are not cached
# HTTPCACHE_IGNORE_HTTP_CODES = []
# Cache storage backend
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
"""
# Cookies
"""
# import
from scrapy.http.cookies import CookieJar
# usage (typically inside a spider callback)
cj = CookieJar()
cj.extract_cookies(response, response.request)
print(cj._cookies)  # the cookies that were extracted
"""
# Extensions
"""
Every stage of the crawl can be hooked into: for example run something when the crawler starts, or every time yield Request() hands work to the scheduler.
a. Create a .py file and define a class in it (the class name is up to you). Scrapy's own TelnetConsole extension (excerpted, imports omitted) shows the pattern:
from scrapy import signals

class TelnetConsole(protocol.ServerFactory):
    def __init__(self, crawler):
        if not crawler.settings.getbool('TELNETCONSOLE_ENABLED'):
            raise NotConfigured
        if not TWISTED_CONCH_AVAILABLE:
            raise NotConfigured(
                'TELNETCONSOLE_ENABLED setting is True but required twisted '
                'modules failed to import:\n' + _TWISTED_CONCH_TRACEBACK)
        self.crawler = crawler
        self.noisy = False
        self.portrange = [int(x) for x in crawler.settings.getlist('TELNETCONSOLE_PORT')]
        self.host = crawler.settings['TELNETCONSOLE_HOST']
        self.username = crawler.settings['TELNETCONSOLE_USERNAME']
        self.password = crawler.settings['TELNETCONSOLE_PASSWORD']
        if not self.password:
            self.password = binascii.hexlify(os.urandom(8)).decode('utf8')
            logger.info('Telnet Password: %s', self.password)
        self.crawler.signals.connect(self.start_listening, signals.engine_started)
        # ^ this is how a handler gets registered on a signal: the first argument is the callable to run, the second is the signal

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def start_listening(self):
        pass

The available signals (contents of scrapy/signals.py):
    engine_started = object()
    engine_stopped = object()
    spider_opened = object()
    spider_idle = object()
    spider_closed = object()
    spider_error = object()
    request_scheduled = object()
    request_dropped = object()
    request_reached_downloader = object()
    response_received = object()
    response_downloaded = object()
    item_scraped = object()
    item_dropped = object()
    item_error = object()
    # for backwards compatibility
    stats_spider_opened = spider_opened
    stats_spider_closing = spider_closed
    stats_spider_closed = spider_closed
    item_passed = item_scraped
    request_received = request_scheduled

b. Register the class under EXTENSIONS in settings.py (see the sketch after this block).
"""
# Proxies
"""
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
Option 1: use the built-in HttpProxyMiddleware, which reads the proxy from environment variables, e.g.
    import os
    os.environ['http_proxy'] = 'http://root:woshiniba@192.168.11.11:9999/'
    os.environ['https_proxy'] = 'http://192.168.11.11:9999/'
Option 2: use a custom downloader middleware
import base64
import random
import six

def to_bytes(text, encoding=None, errors='strict'):
    if isinstance(text, bytes):
        return text
    if not isinstance(text, six.string_types):
        raise TypeError('to_bytes must receive a unicode, str or bytes '
                        'object, got %s' % type(text).__name__)
    if encoding is None:
        encoding = 'utf-8'
    return text.encode(encoding, errors)

class ProxyMiddleware(object):
    def process_request(self, request, spider):
        PROXIES = [
            {'ip_port': '111.11.228.75:80', 'user_pass': ''},
            {'ip_port': '120.198.243.22:80', 'user_pass': ''},
            {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
            {'ip_port': '101.71.27.120:80', 'user_pass': ''},
            {'ip_port': '122.96.59.104:80', 'user_pass': ''},
            {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
        ]
        proxy = random.choice(PROXIES)
        if proxy['user_pass']:
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
            encoded_user_pass = base64.b64encode(to_bytes(proxy['user_pass'])).decode('ascii')
            request.headers['Proxy-Authorization'] = to_bytes('Basic ' + encoded_user_pass)
            print("**************ProxyMiddleware have pass************" + proxy['ip_port'])
        else:
            print("**************ProxyMiddleware no pass************" + proxy['ip_port'])
            request.meta['proxy'] = to_bytes("http://%s" % proxy['ip_port'])
# register it in settings:
DOWNLOADER_MIDDLEWARES = {
    'step8_king.middlewares.ProxyMiddleware': 500,
}
"""
# URL de-duplication
"""
First configure it in settings:
DUPEFILTER_CLASS = "chouti.dupefilter.Repeatfilter"   # use a custom filter class
Then create a .py file with the custom filter class; request_seen decides whether a URL gets crawled: returning False means the request is new and goes through, returning True means it has been seen and is dropped.
class Repeatfilter(object):
    @classmethod
    def from_settings(cls, settings):   # Scrapy instantiates the class through this hook
        return cls()

    def request_seen(self, request):    # called for every request; filter on request.url here
        return False

    def open(self):                     # can return a deferred
        pass

    def close(self, reason):            # can return a deferred
        pass

    def log(self, request, spider):     # log that a request has been filtered
        pass
"""