Python爬虫 #021 Scrapy综合案例

读书网是一个非常适合用来练习爬虫的网站,没有严格的反爬手段。当然,我们应当以学习为目的进行练习,而不应对网站进行恶意攻击。
读书网网址:https://www.dushu.com/

1. 传统方法

  • 建立项目:

  • 设置中间件 middlewares.py:默认即可

  • 修改 settings.py

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for book project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    # Project identity: used in logs and as the default User-Agent prefix.
    BOT_NAME = 'book'
    
    # Where Scrapy looks for spider classes.
    SPIDER_MODULES = ['book.spiders']
    NEWSPIDER_MODULE = 'book.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'book (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    # NOTE(review): robots.txt checking is disabled for this practice project;
    # prefer True when crawling sites you do not control.
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    # A desktop-browser User-Agent is sent with every request so the site does
    # not see the default "Scrapy/x.y" identification.
    DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en',
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3754.400 QQBrowser/10.5.3991.400'
    }
    
    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'book.middlewares.BookSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #DOWNLOADER_MIDDLEWARES = {
    #    'book.middlewares.BookDownloaderMiddleware': 543,
    #}
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    # Every yielded item is routed through BookPipeline (pipelines.py), which
    # writes it to books.txt; 300 is the pipeline priority (lower runs first).
    ITEM_PIPELINES = {
       'book.pipelines.BookPipeline': 300,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
  • 明确目标 items.py:

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class BookItem(scrapy.Item):
        """Container for one book scraped from a dushu.com list page."""
        # Book title (text of the h3/a link, filled in by the spider).
        title = scrapy.Field()
        # Author line (first <p> of the list entry).
        author = scrapy.Field()
        # Short description (second <p> of the list entry).
        content = scrapy.Field()
    
  • 编写爬虫 dushu.py

    # -*- coding: utf-8 -*-
    import scrapy
    from book.items import BookItem
    
    class DushuSpider(scrapy.Spider):
        """Crawl dushu.com category pages and yield one BookItem per book.

        ``parse`` collects the category links from the home page; each list
        page is handled by ``parse_detail``, which also follows pagination.
        """
        name = 'dushu'
        allowed_domains = ['dushu.com']
        start_urls = ['https://www.dushu.com/']

        def parse(self, response):
            """Yield one request per book-category link on the home page."""
            book_list = response.xpath('//div[@class="class-nav"]/a')
            for books in book_list:
                url = books.xpath('./@href').get()
                # There is more than one div[@class="class-nav"]; the bare
                # '/book/' link marks the start of the blocks we do not want,
                # so stop iterating when we reach it.
                if url == '/book/':
                    break
                yield scrapy.Request(url=response.urljoin(url), callback=self.parse_detail)

        def parse_detail(self, response):
            """Extract every book on a list page, then follow pagination."""
            books = response.xpath('.//div[@class="bookslist"]/ul//li')
            for book in books:
                item = BookItem()
                item['title'] = book.xpath('.//h3/a/text()').get()
                item['author'] = book.xpath('./div/p[1]/text()').get()
                item['content'] = book.xpath('./div/p[2]/text()').get()
                yield item

            # Page 1 renders a single a[@class="disabled"] (the "next page"
            # link); later pages render two (previous and next), so prefer the
            # second one when it exists.
            pages = response.xpath('.//div[@class="pages"]/a[@class="disabled"]')
            if len(pages) >= 2:
                next_page = pages[1].xpath('./@href').get()
            else:
                next_page = pages.xpath('./@href').get()
            # BUG FIX: the original always yielded, so on the last page
            # response.urljoin(None) raised TypeError; only follow a link
            # that actually exists.
            if next_page:
                yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse_detail)
    
  • 编写管道 pipelines.py:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    class BookPipeline(object):
        """Append every scraped BookItem to books.txt as plain text."""

        def __init__(self):
            # Opened once per crawl; closed in close_spider.
            self.fp = open('books.txt', mode='w', encoding='utf-8')

        def open_spider(self, spider):
            print('爬虫开始了----------------------------------------')

        def process_item(self, item, spider):
            # BUG FIX: a field extracted as None (missing node in the page)
            # made the original string concatenation raise TypeError and lose
            # the item; fall back to '' so one sparse entry cannot kill the run.
            title = item['title'] or ''
            author = item['author'] or ''
            content = item['content'] or ''
            self.fp.write(title + '    作者:' + author + '\n' + content + '\n' + '\n')
            return item

        def close_spider(self, spider):
            print('爬虫结束了-----------------------------------------')
            self.fp.close()
    
  • 最终效果,在当前目录中生成books.txt文件:


2. CrawlSpider

  • 建立项目:

  • 设置中间件 middlewares.py:默认即可

  • 修改 settings.py:

  # -*- coding: utf-8 -*-

  # Scrapy settings for dushu project
  #
  # For simplicity, this file contains only settings considered important or
  # commonly used. You can find more settings consulting the documentation:
  #
  #     https://docs.scrapy.org/en/latest/topics/settings.html
  #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
  #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

  # Project identity: used in logs and as the default User-Agent prefix.
  BOT_NAME = 'dushu'

  # Where Scrapy looks for spider classes.
  SPIDER_MODULES = ['dushu.spiders']
  NEWSPIDER_MODULE = 'dushu.spiders'


  # Crawl responsibly by identifying yourself (and your website) on the user-agent
  #USER_AGENT = 'dushu (+http://www.yourdomain.com)'

  # Obey robots.txt rules
  # NOTE(review): robots.txt checking is disabled for this practice project;
  # prefer True when crawling sites you do not control.
  ROBOTSTXT_OBEY = False

  # Configure maximum concurrent requests performed by Scrapy (default: 16)
  #CONCURRENT_REQUESTS = 32

  # Configure a delay for requests for the same website (default: 0)
  # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
  # See also autothrottle settings and docs
  # Wait 3 seconds between requests to the same site to keep the load polite.
  DOWNLOAD_DELAY = 3
  # The download delay setting will honor only one of:
  #CONCURRENT_REQUESTS_PER_DOMAIN = 16
  #CONCURRENT_REQUESTS_PER_IP = 16

  # Disable cookies (enabled by default)
  #COOKIES_ENABLED = False

  # Disable Telnet Console (enabled by default)
  #TELNETCONSOLE_ENABLED = False

  # Override the default request headers:
  # A desktop-browser User-Agent is sent with every request so the site does
  # not see the default "Scrapy/x.y" identification.
  DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4039.400'
  }

  # Enable or disable spider middlewares
  # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
  #SPIDER_MIDDLEWARES = {
  #    'dushu.middlewares.DushuSpiderMiddleware': 543,
  #}

  # Enable or disable downloader middlewares
  # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
  #DOWNLOADER_MIDDLEWARES = {
  #    'dushu.middlewares.DushuDownloaderMiddleware': 543,
  #}

  # Enable or disable extensions
  # See https://docs.scrapy.org/en/latest/topics/extensions.html
  #EXTENSIONS = {
  #    'scrapy.extensions.telnet.TelnetConsole': None,
  #}

  # Configure item pipelines
  # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
  # Both pipelines (pipelines.py) run for every item, in priority order:
  # the text writer first (100), then the JSON-lines exporter (200).
  ITEM_PIPELINES = {
     'dushu.pipelines.DushuTextPipeline': 100,
     'dushu.pipelines.DushuJsonPipeline': 200,
  }

  # Enable and configure the AutoThrottle extension (disabled by default)
  # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
  #AUTOTHROTTLE_ENABLED = True
  # The initial download delay
  #AUTOTHROTTLE_START_DELAY = 5
  # The maximum download delay to be set in case of high latencies
  #AUTOTHROTTLE_MAX_DELAY = 60
  # The average number of requests Scrapy should be sending in parallel to
  # each remote server
  #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
  # Enable showing throttling stats for every response received:
  #AUTOTHROTTLE_DEBUG = False

  # Enable and configure HTTP caching (disabled by default)
  # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
  #HTTPCACHE_ENABLED = True
  #HTTPCACHE_EXPIRATION_SECS = 0
  #HTTPCACHE_DIR = 'httpcache'
  #HTTPCACHE_IGNORE_HTTP_CODES = []
  #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
  • 明确目标 items.py
  # -*- coding: utf-8 -*-

  # Define here the models for your scraped items
  #
  # See documentation in:
  # https://docs.scrapy.org/en/latest/topics/items.html

  import scrapy


  class DushuItem(scrapy.Item):
      """Container for one book scraped from a dushu.com list page."""
      # Book title (text of the h3/a link, filled in by the spider).
      title = scrapy.Field()
      # Author line (first <p> of the list entry).
      author = scrapy.Field()
      # Short description (second <p> of the list entry).
      content = scrapy.Field()
      # Availability/purchase state (span inside the third <p>).
      state = scrapy.Field()
  • 编写爬虫 dsw.py:
  # -*- coding: utf-8 -*-
  import scrapy
  from scrapy.linkextractors import LinkExtractor
  from scrapy.spiders import CrawlSpider, Rule
  from dushu.items import DushuItem

  class DswSpider(CrawlSpider):
      """CrawlSpider variant: a Rule/LinkExtractor pair discovers list pages."""
      name = 'dsw'
      allowed_domains = ['dushu.com']
      # Start from the site root; the rule below finds the list pages.
      start_urls = ['https://www.dushu.com/']

      # Follow every link matching .../book/....html and parse it with
      # parse_detail; follow=True keeps extracting the same pattern from each
      # new page, which covers pagination automatically.
      rules = (
          Rule(LinkExtractor(allow=r'.+/book/.+\.html'), callback='parse_detail', follow=True),
      )

      def parse_detail(self, response):
          """Yield one DushuItem per book found on a category list page."""
          # Enable to check which pages are actually being visited:
          # print(response.url)
          book_list = response.xpath('//div[@class="bookslist"]/ul/li')
          for book in book_list:
              try:
                  # BUG FIX: the original used the absolute path '//h3/a/text()',
                  # which matches the FIRST h3 of the whole page for every book;
                  # './/h3' scopes the lookup to the current <li>.
                  title = book.xpath('.//h3/a/text()').get()
                  author = book.xpath('./div/p[1]/text()').get()
                  content = book.xpath('./div/p[2]/text()').get()
                  # Availability/purchase state badge.
                  state = book.xpath('./div/p[3]/span/text()').get()

                  yield DushuItem(
                      title = title,
                      author = author,
                      content = content,
                      state = state
                  )
              except Exception as result:
                  # Best effort: report the entry that failed and keep going
                  # rather than aborting the whole page.
                  print(result)
                  continue
  • 编写管道 pipelines.py:
  # -*- coding: utf-8 -*-

  # Define your item pipelines here
  #
  # Don't forget to add your pipeline to the ITEM_PIPELINES setting
  # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

  # 方法一:保存为文本数据
  class DushuTextPipeline(object):
      """Write each scraped book to books.txt as plain text."""

      def __init__(self):
          # Opened once per crawl; closed in close_spider.
          self.fp = open('books.txt', mode='w', encoding='utf-8')

      def open_spider(self, spider):
          print('爬虫开始了--------------------------------------TEXT--------------------------------------')

      def process_item(self, item, spider):
          # BUG FIX: a field extracted as None (missing node in the page) made
          # the original string concatenation raise TypeError and lose the
          # item; fall back to '' so one sparse entry cannot kill the run.
          title = item['title'] or ''
          author = item['author'] or ''
          content = item['content'] or ''
          state = item['state'] or ''
          self.fp.write(title + "作者:" + author + '\n' + content + '\n' + '状态:' + state + '\n' + '\n')
          return item

      def close_spider(self, spider):
          self.fp.close()
          print('爬虫结束了-----------------------------------------TEXT-----------------------------------')


  # 方法二:保存为json数据
  from scrapy.exporters import JsonLinesItemExporter

  class DushuJsonPipeline(object):
      """Export each scraped book as one JSON object per line in books.json."""

      def __init__(self):
          # JsonLinesItemExporter requires a binary file handle ('wb');
          # ensure_ascii=False keeps the Chinese text readable in the output.
          self.books_fp = open('books.json',mode='wb')
          self.books_exporter = JsonLinesItemExporter(self.books_fp, ensure_ascii=False)

      def open_spider(self, spider):
          print('爬虫开始了-----------------------------JSON-----------------------------------------------')

      def process_item(self, item, spider):
          # Serialize the item, then pass it on unchanged for later pipelines.
          self.books_exporter.export_item(item)
          return item

      def close_spider(self, spider):
          self.books_fp.close()
          print('爬虫结束了-------------------------------JSON---------------------------------------------')
  • 运行效果图:在管道中编写了两种储存数据的方法


3. 中间件之selenium

  • 创建项目:

  • 设置中间件 middlewares.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your spider middleware
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    from scrapy import signals
    from selenium import webdriver
    from scrapy.http.response.html import HtmlResponse
    
    # Register this class in settings.py under DOWNLOADER_MIDDLEWARES.
    class SeleniumDownloadMiddleware(object):
        """Downloader middleware that fetches pages with a real Chrome browser,
        so JavaScript-rendered content is present in the response body."""

        def __init__(self):
            self.driver = webdriver.Chrome()

        @classmethod
        def from_crawler(cls, crawler):
            # BUG FIX: the original never quit the browser, leaking a Chrome
            # process after every crawl; hook spider_closed to shut it down.
            middleware = cls()
            crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
            return middleware

        def spider_closed(self):
            self.driver.quit()

        # Intercept every Request issued by Scrapy and fetch it through Chrome.
        def process_request(self, request, spider):
            self.driver.get(request.url)
            # Give dynamically inserted content up to 10 seconds to appear.
            self.driver.implicitly_wait(10)
            source = self.driver.page_source
            # Wrap the rendered HTML in an HtmlResponse so downstream parsing
            # is unchanged; current_url reflects any redirects Chrome followed.
            response = HtmlResponse(url=self.driver.current_url, body=source, request=request,encoding='utf-8')
            return response
    
  • 修改 settings.py:

    # -*- coding: utf-8 -*-
    
    # Scrapy settings for dushu project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    
    # Project identity: used in logs and as the default User-Agent prefix.
    BOT_NAME = 'dushu'
    
    # Where Scrapy looks for spider classes.
    SPIDER_MODULES = ['dushu.spiders']
    NEWSPIDER_MODULE = 'dushu.spiders'
    
    
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'dushu (+http://www.yourdomain.com)'
    
    # Obey robots.txt rules
    # NOTE(review): robots.txt checking is disabled for this practice project;
    # prefer True when crawling sites you do not control.
    ROBOTSTXT_OBEY = False
    
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32
    
    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # Wait 3 seconds between requests to the same site to keep the load polite.
    DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16
    
    # Disable cookies (enabled by default)
    #COOKIES_ENABLED = False
    
    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False
    
    # Override the default request headers:
    # A desktop-browser User-Agent is sent with every request so the site does
    # not see the default "Scrapy/x.y" identification.
    DEFAULT_REQUEST_HEADERS = {
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'en',
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4039.400'
    }
    
    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    #SPIDER_MIDDLEWARES = {
    #    'dushu.middlewares.DushuSpiderMiddleware': 543,
    #}
    
    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    DOWNLOADER_MIDDLEWARES = {
        # The generated default middleware is replaced by the selenium-based
        # one defined in middlewares.py (SeleniumDownloadMiddleware).
       # 'dushu.middlewares.DushuDownloaderMiddleware': 543,
       'dushu.middlewares.SeleniumDownloadMiddleware': 543,
    }
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}
    
    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       # The generated default pipeline is replaced by the text writer
       # defined in pipelines.py.
       # 'dushu.pipelines.DushuPipeline': 300,
       'dushu.pipelines.DushuTextPipeline': 300,
    }
    
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False
    
    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
    
  • 明确目标 items.py:

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class DushuItem(scrapy.Item):
        """Container for one book scraped from a dushu.com list page."""
        # Book category (declared but not populated by the spider shown here —
        # NOTE(review): verify it is filled in elsewhere or remove it).
        book_class = scrapy.Field()
        # Book title (text of the h3/a link).
        title = scrapy.Field()
        # Author line (first <p> of the list entry).
        author = scrapy.Field()
        # Short description (second <p> of the list entry).
        content = scrapy.Field()
        # Availability/purchase state (span inside the third <p>).
        state = scrapy.Field()
    
  • 编写 dsw.py:

    # -*- coding: utf-8 -*-
    import scrapy
    from dushu.items import DushuItem
    
    class DswSpider(scrapy.Spider):
        """Scrape dushu.com book lists; pages are fetched through the selenium
        downloader middleware configured in settings.py.

        ``parse`` collects the category links from the home page; each list
        page is handled by ``parse_detail``, which also follows pagination.
        """
        name = 'dsw'
        allowed_domains = ['dushu.com']
        # Entry point: the home page, whose category nav links are followed.
        start_urls = ['https://www.dushu.com/']

        def parse(self, response):
            """Yield one request per book-category link on the home page."""
            book_list = response.xpath('//div[@class="class-nav"]/a')
            for books in book_list:
                url = books.xpath('./@href').get()
                # Several class-nav blocks exist; the bare '/book/' link marks
                # the start of the ones we do not want, so stop iterating there.
                if url == '/book/':
                    break
                yield scrapy.Request(url=response.urljoin(url), callback=self.parse_detail)

        def parse_detail(self, response):
            """Extract every book on a list page, then follow pagination."""
            books = response.xpath('.//div[@class="bookslist"]/ul//li')
            for book in books:
                item = DushuItem()
                item['title'] = book.xpath('.//h3/a/text()').get()
                item['author'] = book.xpath('./div/p[1]/text()').get()
                item['content'] = book.xpath('./div/p[2]/text()').get()
                item['state'] = book.xpath('./div/p[3]/span/text()').get()
                yield item

            # Page 1 renders a single a[@class="disabled"] (the "next page"
            # link); later pages render two (previous and next), so prefer the
            # second one when it exists.
            pages = response.xpath('.//div[@class="pages"]/a[@class="disabled"]')
            if len(pages) >= 2:
                next_page = pages[1].xpath('./@href').get()
            else:
                next_page = pages.xpath('./@href').get()
            # BUG FIX: the original always yielded, so on the last page
            # response.urljoin(None) raised TypeError; only follow a link
            # that actually exists.
            if next_page:
                yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse_detail)
    
  • 编写管道 pipelines.py:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    # 方法一:保存为文本数据
    class DushuTextPipeline(object):
    
        def __init__(self):
            self.fp = open('books.txt',mode='w',encoding='utf-8')
    
        def open_spider(self, spider):
            print('爬虫开始了--------------------------------------TEXT--------------------------------------')
    
        def process_item(self, item, spider):
            self.fp.write(item['title'] + "作者:" + item['author'] + '\n' + item['content'] + '\n' + '状态:' + item['state'] + '\n' + '\n')
    
            return item
    
        def close_spider(self, spider):
            self.fp.close()
            print('爬虫结束了-----------------------------------------TEXT-----------------------------------')
    
  • 运行效果:

posted @ 2023-06-28 23:02  枫_Null  阅读(11)  评论(0)    收藏  举报