scrapy爬行乌云网公开漏洞程序的分析

公告

Posted on 2018-02-25 23:04 王将军之武库阅读(258) 评论(0) 收藏举报

# -*- coding: utf-8 -*-
from datetime import datetime
import pymongo
import scrapy
from wooyun.items import WooyunItem
from scrapy.conf import settings


class WooyunSpider(scrapy.Spider):
    name = "wooyun"#蜘蛛名字，运行命令为：scrapy crawl wooyun
    allowed_domains = ["wooyun.org"]
    start_urls = [
        'http://wooyun.org/bugs/new_public/'
    ]#spider类会遍历该类变量for url in self.start_urls:yield Request(url, dont_filter=True)爬行的起点
                
    def __init__(self,page_max=settings['PAGE_MAX_DEFAULT'],local_store=settings['LOCAL_STORE_DEFAULT'],\
            update=settings['UPDATE_DEFAULT'],*args, **kwargs):
        self.page_max = int(page_max)
        self.local_store = 'true' == local_store.lower()
        self.update = 'true' == update.lower()

        self.connection_string = "mongodb://%s:%d" % (settings['MONGODB_SERVER'],settings['MONGODB_PORT'])
        self.client = pymongo.MongoClient(self.connection_string)
        self.db = self.client[settings['MONGODB_DB']]
        self.collection = self.db[settings['MONGODB_COLLECTION']]

    def closed(self,reason):
        self.client.close()

    def parse(self, response):#当爬行返回第一个响应时会调用这个函数
        total_pages = response.xpath("//p[@class='page']/text()").re('\d+')[1]
        if self.page_max == 0:
            end_page = int(total_pages)
        else:
            end_page = self.page_max
        for n in range(1,end_page + 1):
            page = "/bugs/new_public/page/%d" %n#乌云公开漏洞列表的一页
            url = response.urljoin(page)
            yield scrapy.Request(url, self.parse_list)#分析一页的漏洞列表

    def parse_list(self,response):#取得一页列表的链接
        links = response.xpath('//tbody/tr/td/a/@href').extract()
        for url in links:
            wooyun_id = url.split('/')[2]
            if self.update == True or self.__search_mongodb(wooyun_id) == False:
                url = response.urljoin(url)
                yield scrapy.Request(url, self.parse_detail)

    def parse_detail(self,response):#对每一个漏洞的内容的提取
        item = WooyunItem()
        item['wooyun_id'] = response.xpath('//*[@id="bugDetail"]/div[5]/h3[1]/a/@href').extract()[0].split('/')[2]
        item['title'] = response.xpath('//title/text()').extract()[0].split("|")[0]
        item['bug_type'] = response.xpath("//h3[@class='wybug_type']/text()").extract()[0].split(u'：')[1].strip()
        #item['bug_type'] = response.xpath('//*[@id="bugDetail"]/div[5]/h3[7]/text()').extract()[0].split(u'：')[1].strip()
        #some author not text,for examp:
        #http://wooyun.org/bugs/wooyun-2010-01010
        #there will be error while parse author,so do this
        try:
            #item['author'] = response.xpath("//h3[@class='wybug_author']/a/text()").extract()[0]
            item['author'] = response.xpath('//*[@id="bugDetail"]/div[5]/h3[4]/a/text()').extract()[0]
        except:
            item['author'] ='<Parse Error>'
        #the response.body type is str,so we need to convert to utf-8
        #if not utf-8,saving to mongodb may have some troubles
        item['html'] = response.body.decode('utf-8','ignore')
        #dt = response.xpath("//h3[@class='wybug_date']/text()").re("[\d+]{4}-[\d+]{2}-[\d+]{2}")[0].split('-')
        dt = response.xpath('//*[@id="bugDetail"]/div[5]/h3[5]/text()').re("[\d+]{4}-[\d+]{2}-[\d+]{2}")[0].split('-')
        item['datetime'] = datetime(int(dt[0]),int(dt[1]),int(dt[2]))
        #dt = response.xpath("//h3[@class='wybug_open_date']/text()").re("[\d+]{4}-[\d+]{2}-[\d+]{2}")[0].split('-')
        dt = response.xpath('//*[@id="bugDetail"]/div[5]/h3[6]/text()').re("[\d+]{4}-[\d+]{2}-[\d+]{2}")[0].split('-')
        item['datetime_open'] = datetime(int(dt[0]),int(dt[1]),int(dt[2]))
        #images url for download
        item['image_urls']=[]
        if self.local_store:
            #乌云图片目前发两种格式，一种是http://static.wooyun.org/wooyun/upload/,另一格式是/upload/...
            #因此，对后一种在爬取时，增加http://www.wooyun.org，以形成完整的url地址
            #同时，在piplines.py存放时，作相应的反向处理
            image_urls = response.xpath("//img[contains(@src, '/upload/')]/@src").extract()
            for u in image_urls:
                if self.__check_ingnored_image(u):
                    continue
                if u.startswith('/'):
                    u = 'http://www.wooyun.org' + u
                item['image_urls'].append(u)
        return item #产生一个item项目

    def __check_ingnored_image(self,image_url):
        for ignored_url in settings['IMAGE_DOWLOAD_IGNORED']:
            if ignored_url in image_url:
                return True

        return False
        
    def __search_mongodb(self,wooyun_id):
        #
        wooyun_id_exsist = True if self.collection.find({'wooyun_id':wooyun_id}).count()>0 else False
        #
        return wooyun_id_exsist

scrapy.Request(url, self.parse_detail) Request对象有回调函数。Request对象放到scheduler的队列中，engine会从schedule中取得request。

scrapy的crawl命令调用的函数

def run(self, args, opts):
        if len(args) < 1:
            raise UsageError()
        elif len(args) > 1:
            raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
        spname = args[0]

        self.crawler_process.crawl(spname, **opts.spargs)#crawler_process爬行进程应该是ExecutionEngine实例
        self.crawler_process.start()

C:\Python27\Lib\site-packages\scrapy\cmdline.py

def execute(argv=None, settings=None):
    ......
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)#为命令对象添加爬行进程
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)

CrawlerProcess(settings)[C:\Python27\Lib\site-packages\scrapy\cawler.py]

class CrawlerProcess(CrawlerRunner):
    """
    A class to run multiple scrapy crawlers in a process simultaneously.

    This class extends :class:`~scrapy.crawler.CrawlerRunner` by adding support
    for starting a Twisted `reactor`_ and handling shutdown signals, like the
    keyboard interrupt command Ctrl-C. It also configures top-level logging.

    This utility should be a better fit than
    :class:`~scrapy.crawler.CrawlerRunner` if you aren't running another
    Twisted `reactor`_ within your application.

    The CrawlerProcess object must be instantiated with a
    :class:`~scrapy.settings.Settings` object.

    This class shouldn't be needed (since Scrapy is responsible of using it
    accordingly) unless writing scripts that manually handle the crawling
    process. See :ref:`run-from-script` for an example.
    """

    def __init__(self, settings=None):
        super(CrawlerProcess, self).__init__(settings)
        install_shutdown_handlers(self._signal_shutdown)
        configure_logging(self.settings)
        log_scrapy_info(self.settings)

    def _signal_shutdown(self, signum, _):
        install_shutdown_handlers(self._signal_kill)
        signame = signal_names[signum]
        logger.info("Received %(signame)s, shutting down gracefully. Send again to force ",
                    {'signame': signame})
        reactor.callFromThread(self._graceful_stop_reactor)

    def _signal_kill(self, signum, _):
        install_shutdown_handlers(signal.SIG_IGN)
        signame = signal_names[signum]
        logger.info('Received %(signame)s twice, forcing unclean shutdown',
                    {'signame': signame})
        reactor.callFromThread(self._stop_reactor)

    def start(self, stop_after_crawl=True):
        """
        This method starts a Twisted `reactor`_, adjusts its pool size to
        :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache based
        on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.

        If `stop_after_crawl` is True, the reactor will be stopped after all
        crawlers have finished, using :meth:`join`.

        :param boolean stop_after_crawl: stop or not the reactor when all
            crawlers have finished
        """
        if stop_after_crawl:
            d = self.join()
            # Don't start the reactor if the deferreds are already fired
            if d.called:
                return
            d.addBoth(self._stop_reactor)

        reactor.installResolver(self._get_dns_resolver())
        tp = reactor.getThreadPool()
        tp.adjustPoolsize(maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
        reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
        reactor.run(installSignalHandlers=False)  # blocking call

    def _get_dns_resolver(self):
        if self.settings.getbool('DNSCACHE_ENABLED'):
            cache_size = self.settings.getint('DNSCACHE_SIZE')
        else:
            cache_size = 0
        return CachingThreadedResolver(
            reactor=reactor,
            cache_size=cache_size,
            timeout=self.settings.getfloat('DNS_TIMEOUT')
        )

    def _graceful_stop_reactor(self):
        d = self.stop()
        d.addBoth(self._stop_reactor)
        return d

    def _stop_reactor(self, _=None):
        try:
            reactor.stop()
        except RuntimeError:  # raised if already stopped or in shutdown stage
            pass

刷新页面返回顶部