"""
Simulate the Scrapy scheduling loop with Twisted.
(原注: twisted 模拟 scrapy 调度循环)
"""
from ori_test import pr_type
import logging
import time
from twisted.internet import defer, task, reactor
from scrapy.utils.reactor import CallLaterOnce
log = logging.getLogger(__name__)
logger_m = log
class Slot(object):
    """Per-spider scheduling state.

    Bundles the pending start-request iterator, the "call me soon"
    trigger, and a heartbeat timer that periodically re-arms it.
    """

    def __init__(self, start_requests, nextcall):
        # Periodic safety net: keep re-scheduling nextcall so the loop
        # cannot stall permanently.
        self.heartbeat = task.LoopingCall(nextcall.schedule)
        self.nextcall = nextcall
        # Normalize to an iterator so items are consumed lazily,
        # one per scheduler tick.
        self.start_requests = iter(start_requests)
class Engine():
    """Minimal stand-in for Scrapy's ExecutionEngine.

    Pulls items from the spider's start_requests iterator one at a
    time, driven by a CallLaterOnce trigger plus a periodic heartbeat.
    """

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests):
        """Attach *spider*, then kick off the scheduling loop.

        Mirrors ExecutionEngine.open_spider: build the "next request"
        trigger, wrap the state in a Slot, schedule the first tick and
        start a 5-second heartbeat.
        """
        nextcall = CallLaterOnce(self._next_request, spider)
        slot = Slot(start_requests, nextcall)
        self.slot = slot
        self.spider = spider
        # inlineCallbacks requires at least one yield; yielding a plain
        # (non-Deferred) value resumes immediately — placeholder for
        # real async setup work.
        yield 8
        slot.nextcall.schedule()
        slot.heartbeat.start(5)

    def _next_request(self, spider):
        """Consume one item from start_requests and chain the next tick.

        NOTE(review): *spider* is None when invoked via the Deferred
        callback chain below (the preceding callback returns None);
        it is currently unused.
        """
        print('next_request')
        slot = self.slot
        if not slot:
            return
        try:
            v = next(slot.start_requests)
        except StopIteration:
            # Iterator exhausted: mark the slot drained.
            slot.start_requests = None
        except Exception:
            # A broken start_requests generator: log it rather than
            # swallowing silently, then stop consuming it.
            log.exception('error while consuming start_requests')
            slot.start_requests = None
        else:
            def task_print(k):
                # Hand the value back through the reactor so the chain
                # stays asynchronous (avoids unbounded recursion).
                d = defer.Deferred()
                reactor.callLater(0, d.callback, k)
                return d
            d = task_print(v)
            d.addCallback(lambda x: print('d is done', x))
            d.addErrback(lambda x: print('d is not done'))
            # Schedules the next tick once this item has been "processed".
            d.addCallback(self._next_request)
            d.addErrback(lambda x: print('d is error'))
class Spider():
    """Toy spider: emits integers as stand-ins for scrapy Requests and
    periodically announces liveness via a twisted LoopingCall."""

    def __init__(self):
        self._before_start()

    def start_requests(self):
        """Yield fake requests: the integers 0..999, then a sentinel 8888."""
        logger_m.info('start spider the first page!')
        for x in range(1000):
            print('from spider yield', x)
            yield x
        yield 8888

    def _before_start(self):
        """Register the spider: start a periodic task that declares it
        alive every 5 seconds (fires once immediately)."""
        def spider_alive():
            print('from looptask:spider alive')
        looptask = task.LoopingCall(spider_alive)
        looptask.start(5, now=True)
# --- Script entry: wire spider and engine together, then spin the reactor ---
spider = Spider()
s_r = spider.start_requests()
pr_type(s_r)  # debug helper from ori_test — presumably prints the object's type; verify
s_r = iter(s_r)  # no-op for a generator (iter(gen) is gen); kept from original
pr_type(s_r)
a = Engine()
a.open_spider(spider, s_r)
# Blocks here; all scheduling above runs inside the reactor loop.
reactor.run()
 
                    
                     
                    
                 
                    
                
 
                
            
         