Crawling Open-Source Project Documentation with Scrapy

import scrapy
import re
import os

class MySpider(scrapy.Spider):
  name = "deeplearning4j-api"
  start_urls = ["https://deeplearning4j.org/api/latest/index-files/index-1.html"]

  # Links that have already been scheduled (dict used as a set)
  crawledLinks = {}

  def parse(self, response):

    # Write the raw HTML of a page to disk, mirroring the URL path
    # inside the output folder.
    def output2html(htmlcontent, filepath, folder='./document-deeplearning4j/'):
      filepath = folder + filepath
      if not os.path.exists(os.path.split(filepath)[0]):
        os.makedirs(os.path.split(filepath)[0])
      with open(filepath, 'wb') as f:
        f.write(htmlcontent)
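
    # For example (illustrative path): output2html(body, "/api/latest/foo.html")
    # writes the page to ./document-deeplearning4j//api/latest/foo.html
    # (the doubled slash is harmless on common filesystems).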

    # Normalize a link to a uniform form: strip the site prefix and drop
    # everything after ".html" (anchors, query strings). endStr is currently unused.
    def htmlprocess(url, preStr="https://deeplearning4j.org", endStr=".html"):
      url = url.replace(preStr, '')
      pattern = re.compile(r'\.html.*$')
      url = pattern.sub('.html', url)
      return url
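
    # For example (illustrative input):
    #   htmlprocess("https://deeplearning4j.org/api/latest/foo.html#section")
    #   -> "/api/latest/foo.html"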
    
    # Save the current page under a path derived from its URL
    url = htmlprocess(response.url)
    output2html(response.body, url)

    # Extract every link on the page
    links = response.xpath('//a/@href').extract()
    # Only follow absolute links under the API documentation section of the site
    linkPattern = re.compile(r"^https://deeplearning4j\.org/api/")
 
    for link in links:
      link = response.urljoin(link)  # resolve relative hrefs into absolute URLs
      link = htmlprocess(link, preStr='', endStr=".html")  # keep the domain, only strip anchors/queries
      if linkPattern.match(link) and link not in self.crawledLinks:
        self.crawledLinks[link] = 1
        yield scrapy.Request(link, callback=self.parse)

        item = {}
        item["link"] = link
        yield item
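
To try the spider without setting up a full Scrapy project, it can be driven programmatically with CrawlerProcess (alternatively, the scrapy runspider command works on a standalone file). A minimal sketch, assuming the class above is defined in the same file; the settings values are illustrative, not tuned:

from scrapy.crawler import CrawlerProcess

if __name__ == "__main__":
  process = CrawlerProcess(settings={
    "LOG_LEVEL": "INFO",       # illustrative: keep console output readable
    "DOWNLOAD_DELAY": 0.5,     # illustrative: be polite to the docs server
  })
  process.crawl(MySpider)      # schedule the spider defined above
  process.start()              # blocks until the crawl finishes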

 
