from twisted.web.client import getPage   # returns a Deferred for the download; the "socket" is removed from the loop automatically once the response arrives
from twisted.internet import reactor     # the event loop; it keeps running until every registered "socket" has been removed
from twisted.internet import defer       # defer.Deferred: a special "socket" that sends no request and has to be removed (fired) manually
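# A minimal, Scrapy-style crawl engine built on Twisted: Request/HttpResponse/Scheduler/
# ExecutionEngine/Crawler/CrawlerProcess roughly mirror the Scrapy components of the same names.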
from queue import Queue
class Request(object):
"""
用于封装用户请求相关信息
"""
def __init__(self,url,callback):
self.url = url
self.callback = callback
class HttpResponse(object):
    # Bundle the downloaded content with its Request so that a callback can reach everything via response.xxx
def __init__(self,content,request):
self.content = content
self.request = request
class Scheduler(object):
"""
任务调度器
"""
def __init__(self):
self.q = Queue()
def open(self):
pass
def next_request(self):
        try:
            req = self.q.get(block=False)
        except Exception:  # queue.Empty: nothing left to schedule right now
            req = None
return req
def enqueue_request(self,req):
self.q.put(req)
def size(self):
return self.q.qsize()
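# The scheduler above enqueues every request it is given. As a sketch of how URL de-duplication
# could be layered on top (an illustrative addition, not wired into the engine below):
class DedupScheduler(Scheduler):
    """Scheduler variant that silently drops requests whose url was already enqueued."""
    def __init__(self):
        super().__init__()
        self.seen = set()  # urls that have been enqueued at least once

    def enqueue_request(self, req):
        if req.url in self.seen:  # duplicate: ignore it instead of queueing it again
            return
        self.seen.add(req.url)
        super().enqueue_request(req)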
class ExecutionEngine(object):
"""
引擎:所有的调度
"""
def __init__(self):
self._close = None
self.scheduler = None
        self.max = 5         # maximum number of concurrent downloads
        self.crawlling = []  # requests currently being downloaded
    def get_response_callback(self, content, request):
        """Download callback: build the response, call the spider's parse, and enqueue any requests it yields."""
        self.crawlling.remove(request)  # download finished, free a concurrency slot
        response = HttpResponse(content, request)
        result = request.callback(response)  # i.e. calls spider.parse(response)
        import types
        if isinstance(result, types.GeneratorType):  # parse may yield further Request objects
            for req in result:
                self.scheduler.enqueue_request(req)
    def _next_request(self):
        """Pull requests from the scheduler until the concurrency limit is reached."""
        if self.scheduler.size() == 0 and len(self.crawlling) == 0:
            # nothing queued and nothing in flight: fire _close so start() can finish
            self._close.callback(None)
            return
        while len(self.crawlling) < self.max:
            req = self.scheduler.next_request()
            if not req:  # queue is empty for now
                return
            self.crawlling.append(req)
            # getPage is deprecated in newer Twisted; an Agent-based sketch follows this class
            d = getPage(req.url.encode('utf-8'))
            d.addCallback(self.get_response_callback, req)
            # if the download fails, still drop the request so the engine can shut down cleanly
            d.addErrback(lambda failure, r=req: self.crawlling.remove(r) if r in self.crawlling else None)
            d.addBoth(lambda _: reactor.callLater(0, self._next_request))
    @defer.inlineCallbacks
    def open_spider(self, start_requests):
        """Put the spider's initial requests into the scheduler, then start pulling from it."""
        self.scheduler = Scheduler()
        while True:
            try:
                req = next(start_requests)
                self.scheduler.enqueue_request(req)
            except StopIteration:
                break
        # the inlineCallbacks decorator requires at least one yield, otherwise Twisted raises an
        # error; Scheduler.open() returns None, so this yield has no effect beyond calling the method
        yield self.scheduler.open()
        reactor.callLater(0, self._next_request)
    @defer.inlineCallbacks
    def start(self):
        """Create the Deferred that keeps the engine alive until _next_request fires it."""
        self._close = defer.Deferred()
        yield self._close
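# getPage is deprecated and eventually removed in newer Twisted releases. Below is a minimal
# sketch of the same download step using the supported Agent/readBody API; it is not wired into
# the engine above, and swapping it in would only replace the getPage call inside _next_request.
from twisted.web.client import Agent, readBody

def download_with_agent(url):
    """Return a Deferred that fires with the response body bytes, like getPage does."""
    agent = Agent(reactor)
    d = agent.request(b'GET', url.encode('utf-8'))
    d.addCallback(readBody)  # readBody itself returns a Deferred that fires with the body
    return d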
class Crawler(object):
    """
    Wraps the scheduler and the engine for the user: builds both and hands the spider's
    initial requests to the engine for scheduling.
    """
    def _create_engine(self):
        """Create the engine object."""
        return ExecutionEngine()
    def _create_spider(self, spider_cls_path):
        """
        Create a spider instance from its dotted path.
        :param spider_cls_path: e.g. 'spider.chouti.ChoutiSpider'
        :return: spider instance
        """
        module_path, cls_name = spider_cls_path.rsplit('.', maxsplit=1)  # module path, class name
        import importlib  # import the module by name, then fetch the class by reflection
        m = importlib.import_module(module_path)
        cls = getattr(m, cls_name)
        return cls()
    @defer.inlineCallbacks
    def crawl(self, spider_cls_path):
        """Create the engine and spider, hand the initial requests to engine.open_spider, then yield the Deferred self._close."""
        engine = self._create_engine()
        spider = self._create_spider(spider_cls_path)
        start_requests = iter(spider.start_requests())
        yield engine.open_spider(start_requests)  # equivalent to inlining open_spider's yield here
        yield engine.start()                      # yields engine._close
class CrawlerProcess(object):
    """
    Uses Crawler to create the spiders, then starts the event loop.
    """
def __init__(self):
self._active = set()
    def crawl(self, spider_cls_path):
        # build one Crawler object per spider, so creating crawlers and running the loop stay separate
        crawler = Crawler()
        d = crawler.crawl(spider_cls_path)
        self._active.add(d)
    def start(self):
        """
        Start the reactor exactly once, no matter how many spiders were registered.
        :return:
        """
        dd = defer.DeferredList(self._active)
        dd.addBoth(lambda _: reactor.stop())  # stop the loop once every crawl Deferred has fired
        reactor.run()
class Command(object):
def run(self):
crawl_process = CrawlerProcess()
spider_cls_path_list = ['spider.chouti.ChoutiSpider',]
for spider_cls_path in spider_cls_path_list:
crawl_process.crawl(spider_cls_path)
crawl_process.start()
if __name__ == '__main__':
cmd = Command()
cmd.run()
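# The class below belongs to a separate module (something like spider/chouti.py, judging from
# 'spider.chouti.ChoutiSpider' above); the dotted path there and the relative import below have
# to agree with the actual package layout.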
from ..engine import Request
class ChoutiSpider(object):
name = 'chouti'
def start_requests(self):
start_url = ['https://www.baidu.com','https://www.bing.com',]
for url in start_url:
yield Request(url,self.parse)
def parse(self,response):
print(response)
# yield Request('https://www.baidu.com', callback=self.parse)
# After each download:
# 1. remove the request from crawlling
# 2. collect the values yielded by parse (new requests) and enqueue them
# 3. pull the next request from the queue