Python爬虫 #021 Scrapy综合案例

读书网是一个非常适合用来练习爬虫的网站,没有严格的反爬手段。当然,我们应当以学习为目的进行练习,而不应对网站进行恶意攻击。
读书网网址:https://www.dushu.com/

1. 传统方法

  • 建立项目:

  • 设置中间件 middlewares.py:默认即可

  • 修改 settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for book project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    # Project identity: used in logs and as the default User-Agent prefix.
    BOT_NAME = 'book'
    
    # Where Scrapy looks for spider classes.
    SPIDER_MODULES = ['book.spiders']
    NEWSPIDER_MODULE = 'book.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'book (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    # NOTE(review): robots.txt checking is disabled for this practice project;
    # prefer True when crawling sites you do not control.
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    # A desktop-browser User-Agent is sent with every request so the site does
    # not see the default "Scrapy/x.y" identification.
    DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en',
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3754.400 QQBrowser/10.5.3991.400'
    }
    
    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'book.middlewares.BookSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'book.middlewares.BookDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    # Every yielded item is routed through BookPipeline (pipelines.py), which
    # writes it to books.txt; 300 is the pipeline priority (lower runs first).
    ITEM_PIPELINES = {
       'book.pipelines.BookPipeline': 300,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
  • 明确目标 items.py:

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class BookItem(scrapy.Item):
        """Container for one book scraped from a dushu.com list page."""
        # Book title (text of the h3/a link, filled in by the spider).
        title = scrapy.Field()
        # Author line (first <p> of the list entry).
        author = scrapy.Field()
        # Short description (second <p> of the list entry).
        content = scrapy.Field()
    
  • 编写爬虫 dushu.py

    # -*- coding: utf-8 -*-
    import scrapy
    from book.items import BookItem
    
    class DushuSpider(scrapy.Spider):
        """Crawl dushu.com category pages and yield one BookItem per book.

        ``parse`` collects the category links from the home page; each list
        page is handled by ``parse_detail``, which also follows pagination.
        """
        name = 'dushu'
        allowed_domains = ['dushu.com']
        start_urls = ['https://www.dushu.com/']

        def parse(self, response):
            """Yield one request per book-category link on the home page."""
            book_list = response.xpath('//div[@class="class-nav"]/a')
            for books in book_list:
                url = books.xpath('./@href').get()
                # There is more than one div[@class="class-nav"]; the bare
                # '/book/' link marks the start of the blocks we do not want,
                # so stop iterating when we reach it.
                if url == '/book/':
                    break
                yield scrapy.Request(url=response.urljoin(url), callback=self.parse_detail)

        def parse_detail(self, response):
            """Extract every book on a list page, then follow pagination."""
            books = response.xpath('.//div[@class="bookslist"]/ul//li')
            for book in books:
                item = BookItem()
                item['title'] = book.xpath('.//h3/a/text()').get()
                item['author'] = book.xpath('./div/p[1]/text()').get()
                item['content'] = book.xpath('./div/p[2]/text()').get()
                yield item

            # Page 1 renders a single a[@class="disabled"] (the "next page"
            # link); later pages render two (previous and next), so prefer the
            # second one when it exists.
            pages = response.xpath('.//div[@class="pages"]/a[@class="disabled"]')
            if len(pages) >= 2:
                next_page = pages[1].xpath('./@href').get()
            else:
                next_page = pages.xpath('./@href').get()
            # BUG FIX: the original always yielded, so on the last page
            # response.urljoin(None) raised TypeError; only follow a link
            # that actually exists.
            if next_page:
                yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse_detail)
    
  • 编写管道 pipelines.py:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    class BookPipeline(object):
        """Append every scraped BookItem to books.txt as plain text."""

        def __init__(self):
            # Opened once per crawl; closed in close_spider.
            self.fp = open('books.txt', mode='w', encoding='utf-8')

        def open_spider(self, spider):
            print('爬虫开始了----------------------------------------')

        def process_item(self, item, spider):
            # BUG FIX: a field extracted as None (missing node in the page)
            # made the original string concatenation raise TypeError and lose
            # the item; fall back to '' so one sparse entry cannot kill the run.
            title = item['title'] or ''
            author = item['author'] or ''
            content = item['content'] or ''
            self.fp.write(title + '    作者:' + author + '\n' + content + '\n' + '\n')
            return item

        def close_spider(self, spider):
            print('爬虫结束了-----------------------------------------')
            self.fp.close()
    
  • 最终效果,在当前目录中生成books.txt文件:


2. CrawlSpider

  • 建立项目:

  • 设置中间件 middlewares.py:默认即可

  • 修改 settings.py:

  # -*- coding: utf-8 -*-

  # Scrapy settings for dushu project
  #
  # For simplicity, this file contains only settings considered important or
  # commonly used. You can find more settings consulting the documentation:
  #
  #     https://docs.scrapy.org/en/latest/topics/settings.html
  #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
  #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

  # Project identity: used in logs and as the default User-Agent prefix.
  BOT_NAME = 'dushu'

  # Where Scrapy looks for spider classes.
  SPIDER_MODULES = ['dushu.spiders']
  NEWSPIDER_MODULE = 'dushu.spiders'


  # Crawl responsibly by identifying yourself (and your website) on the user-agent
  #USER_AGENT = 'dushu (+http://www.yourdomain.com)'

  # Obey robots.txt rules
  # NOTE(review): robots.txt checking is disabled for this practice project;
  # prefer True when crawling sites you do not control.
  ROBOTSTXT_OBEY = False

  # Configure maximum concurrent requests performed by Scrapy (default: 16)
  #CONCURRENT_REQUESTS = 32

  # Configure a delay for requests for the same website (default: 0)
  # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
  # See also autothrottle settings and docs
  # Wait 3 seconds between requests to the same site to keep the load polite.
  DOWNLOAD_DELAY = 3
  # The download delay setting will honor only one of:
  #CONCURRENT_REQUESTS_PER_DOMAIN = 16
  #CONCURRENT_REQUESTS_PER_IP = 16

  # Disable cookies (enabled by default)
  #COOKIES_ENABLED = False

  # Disable Telnet Console (enabled by default)
  #TELNETCONSOLE_ENABLED = False

  # Override the default request headers:
  # A desktop-browser User-Agent is sent with every request so the site does
  # not see the default "Scrapy/x.y" identification.
  DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4039.400'
  }

  # Enable or disable spider middlewares
  # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
  #SPIDER_MIDDLEWARES = {
  #    'dushu.middlewares.DushuSpiderMiddleware': 543,
  #}

  # Enable or disable downloader middlewares
  # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
  #DOWNLOADER_MIDDLEWARES = {
  #    'dushu.middlewares.DushuDownloaderMiddleware': 543,
  #}

  # Enable or disable extensions
  # See https://docs.scrapy.org/en/latest/topics/extensions.html
  #EXTENSIONS = {
  #    'scrapy.extensions.telnet.TelnetConsole': None,
  #}

  # Configure item pipelines
  # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
  # Both pipelines (pipelines.py) run for every item, in priority order:
  # the text writer first (100), then the JSON-lines exporter (200).
  ITEM_PIPELINES = {
     'dushu.pipelines.DushuTextPipeline': 100,
     'dushu.pipelines.DushuJsonPipeline': 200,
  }

  # Enable and configure the AutoThrottle extension (disabled by default)
  # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
  #AUTOTHROTTLE_ENABLED = True
  # The initial download delay
  #AUTOTHROTTLE_START_DELAY = 5
  # The maximum download delay to be set in case of high latencies
  #AUTOTHROTTLE_MAX_DELAY = 60
  # The average number of requests Scrapy should be sending in parallel to
  # each remote server
  #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
  # Enable showing throttling stats for every response received:
  #AUTOTHROTTLE_DEBUG = False

  # Enable and configure HTTP caching (disabled by default)
  # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
  #HTTPCACHE_ENABLED = True
  #HTTPCACHE_EXPIRATION_SECS = 0
  #HTTPCACHE_DIR = 'httpcache'
  #HTTPCACHE_IGNORE_HTTP_CODES = []
  #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
  • 明确目标 items.py
  # -*- coding: utf-8 -*-

  # Define here the models for your scraped items
  #
  # See documentation in:
  # https://docs.scrapy.org/en/latest/topics/items.html

  import scrapy


  class DushuItem(scrapy.Item):
      """Container for one book scraped from a dushu.com list page."""
      # Book title (text of the h3/a link, filled in by the spider).
      title = scrapy.Field()
      # Author line (first <p> of the list entry).
      author = scrapy.Field()
      # Short description (second <p> of the list entry).
      content = scrapy.Field()
      # Availability/purchase state (span inside the third <p>).
      state = scrapy.Field()
  • 编写爬虫 dsw.py:
  # -*- coding: utf-8 -*-
  import scrapy
  from scrapy.linkextractors import LinkExtractor
  from scrapy.spiders import CrawlSpider, Rule
  from dushu.items import DushuItem

  class DswSpider(CrawlSpider):
      """CrawlSpider variant: a Rule/LinkExtractor pair discovers list pages."""
      name = 'dsw'
      allowed_domains = ['dushu.com']
      # Start from the site root; the rule below finds the list pages.
      start_urls = ['https://www.dushu.com/']

      # Follow every link matching .../book/....html and parse it with
      # parse_detail; follow=True keeps extracting the same pattern from each
      # new page, which covers pagination automatically.
      rules = (
          Rule(LinkExtractor(allow=r'.+/book/.+\.html'), callback='parse_detail', follow=True),
      )

      def parse_detail(self, response):
          """Yield one DushuItem per book found on a category list page."""
          # Enable to check which pages are actually being visited:
          # print(response.url)
          book_list = response.xpath('//div[@class="bookslist"]/ul/li')
          for book in book_list:
              try:
                  # BUG FIX: the original used the absolute path '//h3/a/text()',
                  # which matches the FIRST h3 of the whole page for every book;
                  # './/h3' scopes the lookup to the current <li>.
                  title = book.xpath('.//h3/a/text()').get()
                  author = book.xpath('./div/p[1]/text()').get()
                  content = book.xpath('./div/p[2]/text()').get()
                  # Availability/purchase state badge.
                  state = book.xpath('./div/p[3]/span/text()').get()

                  yield DushuItem(
                      title = title,
                      author = author,
                      content = content,
                      state = state
                  )
              except Exception as result:
                  # Best effort: report the entry that failed and keep going
                  # rather than aborting the whole page.
                  print(result)
                  continue
  • 编写管道 pipelines.py:
  # -*- coding: utf-8 -*-

  # Define your item pipelines here
  #
  # Don't forget to add your pipeline to the ITEM_PIPELINES setting
  # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

  # 方法一:保存为文本数据
  class DushuTextPipeline(object):
      """Write each scraped book to books.txt as plain text."""

      def __init__(self):
          # Opened once per crawl; closed in close_spider.
          self.fp = open('books.txt', mode='w', encoding='utf-8')

      def open_spider(self, spider):
          print('爬虫开始了--------------------------------------TEXT--------------------------------------')

      def process_item(self, item, spider):
          # BUG FIX: a field extracted as None (missing node in the page) made
          # the original string concatenation raise TypeError and lose the
          # item; fall back to '' so one sparse entry cannot kill the run.
          title = item['title'] or ''
          author = item['author'] or ''
          content = item['content'] or ''
          state = item['state'] or ''
          self.fp.write(title + "作者:" + author + '\n' + content + '\n' + '状态:' + state + '\n' + '\n')
          return item

      def close_spider(self, spider):
          self.fp.close()
          print('爬虫结束了-----------------------------------------TEXT-----------------------------------')


  # 方法二:保存为json数据
  from scrapy.exporters import JsonLinesItemExporter

  class DushuJsonPipeline(object):
      """Export each scraped book as one JSON object per line in books.json."""

      def __init__(self):
          # JsonLinesItemExporter requires a binary file handle ('wb');
          # ensure_ascii=False keeps the Chinese text readable in the output.
          self.books_fp = open('books.json',mode='wb')
          self.books_exporter = JsonLinesItemExporter(self.books_fp, ensure_ascii=False)

      def open_spider(self, spider):
          print('爬虫开始了-----------------------------JSON-----------------------------------------------')

      def process_item(self, item, spider):
          # Serialize the item, then pass it on unchanged for later pipelines.
          self.books_exporter.export_item(item)
          return item

      def close_spider(self, spider):
          self.books_fp.close()
          print('爬虫结束了-------------------------------JSON---------------------------------------------')
  • 运行效果图:在管道中编写了两种储存数据的方法


3. 中间件之selenium

  • 创建项目:

  • 设置中间件 middlewares.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your spider middleware
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    from scrapy import signals
    from selenium import webdriver
    from scrapy.http.response.html import HtmlResponse
    
    # Register this class in settings.py under DOWNLOADER_MIDDLEWARES.
    class SeleniumDownloadMiddleware(object):
        """Downloader middleware that fetches pages with a real Chrome browser,
        so JavaScript-rendered content is present in the response body."""

        def __init__(self):
            self.driver = webdriver.Chrome()

        @classmethod
        def from_crawler(cls, crawler):
            # BUG FIX: the original never quit the browser, leaking a Chrome
            # process after every crawl; hook spider_closed to shut it down.
            middleware = cls()
            crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
            return middleware

        def spider_closed(self):
            self.driver.quit()

        # Intercept every Request issued by Scrapy and fetch it through Chrome.
        def process_request(self, request, spider):
            self.driver.get(request.url)
            # Give dynamically inserted content up to 10 seconds to appear.
            self.driver.implicitly_wait(10)
            source = self.driver.page_source
            # Wrap the rendered HTML in an HtmlResponse so downstream parsing
            # is unchanged; current_url reflects any redirects Chrome followed.
            response = HtmlResponse(url=self.driver.current_url, body=source, request=request,encoding='utf-8')
            return response
    
  • 修改 settings.py:

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for dushu project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    # Project identity: used in logs and as the default User-Agent prefix.
    BOT_NAME = 'dushu'
    
    # Where Scrapy looks for spider classes.
    SPIDER_MODULES = ['dushu.spiders']
    NEWSPIDER_MODULE = 'dushu.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'dushu (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    # NOTE(review): robots.txt checking is disabled for this practice project;
    # prefer True when crawling sites you do not control.
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # Wait 3 seconds between requests to the same site to keep the load polite.
    DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    # A desktop-browser User-Agent is sent with every request so the site does
    # not see the default "Scrapy/x.y" identification.
    DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en',
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4039.400'
    }
    
    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'dushu.middlewares.DushuSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    DOWNLOADER_MIDDLEWARES = {
        # The generated default middleware is replaced by the selenium-based
        # one defined in middlewares.py (SeleniumDownloadMiddleware).
       # 'dushu.middlewares.DushuDownloaderMiddleware': 543,
       'dushu.middlewares.SeleniumDownloadMiddleware': 543,
    }
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       # The generated default pipeline is replaced by the text writer
       # defined in pipelines.py.
       # 'dushu.pipelines.DushuPipeline': 300,
       'dushu.pipelines.DushuTextPipeline': 300,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
  • 明确目标 items.py:

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class DushuItem(scrapy.Item):
        """Container for one book scraped from a dushu.com list page."""
        # Book category (declared but not populated by the spider shown here —
        # NOTE(review): verify it is filled in elsewhere or remove it).
        book_class = scrapy.Field()
        # Book title (text of the h3/a link).
        title = scrapy.Field()
        # Author line (first <p> of the list entry).
        author = scrapy.Field()
        # Short description (second <p> of the list entry).
        content = scrapy.Field()
        # Availability/purchase state (span inside the third <p>).
        state = scrapy.Field()
    
  • 编写 dsw.py:

    # -*- coding: utf-8 -*-
    import scrapy
    from dushu.items import DushuItem
    
    class DswSpider(scrapy.Spider):
        """Scrape dushu.com book lists; pages are fetched through the selenium
        downloader middleware configured in settings.py.

        ``parse`` collects the category links from the home page; each list
        page is handled by ``parse_detail``, which also follows pagination.
        """
        name = 'dsw'
        allowed_domains = ['dushu.com']
        # Entry point: the home page, whose category nav links are followed.
        start_urls = ['https://www.dushu.com/']

        def parse(self, response):
            """Yield one request per book-category link on the home page."""
            book_list = response.xpath('//div[@class="class-nav"]/a')
            for books in book_list:
                url = books.xpath('./@href').get()
                # Several class-nav blocks exist; the bare '/book/' link marks
                # the start of the ones we do not want, so stop iterating there.
                if url == '/book/':
                    break
                yield scrapy.Request(url=response.urljoin(url), callback=self.parse_detail)

        def parse_detail(self, response):
            """Extract every book on a list page, then follow pagination."""
            books = response.xpath('.//div[@class="bookslist"]/ul//li')
            for book in books:
                item = DushuItem()
                item['title'] = book.xpath('.//h3/a/text()').get()
                item['author'] = book.xpath('./div/p[1]/text()').get()
                item['content'] = book.xpath('./div/p[2]/text()').get()
                item['state'] = book.xpath('./div/p[3]/span/text()').get()
                yield item

            # Page 1 renders a single a[@class="disabled"] (the "next page"
            # link); later pages render two (previous and next), so prefer the
            # second one when it exists.
            pages = response.xpath('.//div[@class="pages"]/a[@class="disabled"]')
            if len(pages) >= 2:
                next_page = pages[1].xpath('./@href').get()
            else:
                next_page = pages.xpath('./@href').get()
            # BUG FIX: the original always yielded, so on the last page
            # response.urljoin(None) raised TypeError; only follow a link
            # that actually exists.
            if next_page:
                yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse_detail)
    
  • 编写管道 pipelines.py:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    # 方法一:保存为文本数据
    class DushuTextPipeline(object):
    
        def __init__(self):
            self.fp = open('books.txt',mode='w',encoding='utf-8')
    
        def open_spider(self, spider):
            print('爬虫开始了--------------------------------------TEXT--------------------------------------')
    
        def process_item(self, item, spider):
            self.fp.write(item['title'] + "作者:" + item['author'] + '\n' + item['content'] + '\n' + '状态:' + item['state'] + '\n' + '\n')
    
            return item
    
        def close_spider(self, spider):
            self.fp.close()
            print('爬虫结束了-----------------------------------------TEXT-----------------------------------')
    
  • 运行效果:

posted @ 2023-06-28 23:02  枫_Null  阅读(11)  评论(0)    收藏  举报