Python爬虫 #021 Scrapy综合案例
读书网是一个非常适合用来练习爬虫的网站,没有严格的反爬手段;当然,我们应当以学习为目的去练习,而不应恶意攻击网站
读书网网址:https://www.dushu.com/
1. 传统方法
-
建立项目:

-
设置中间件
middlewares.py:默认即可 -
修改
setting.py:# -*- coding: utf-8 -*- # Scrapy settings for book project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'book' SPIDER_MODULES = ['book.spiders'] NEWSPIDER_MODULE = 'book.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'book (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs #DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: DEFAULT_REQUEST_HEADERS = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3754.400 QQBrowser/10.5.3991.400' } # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'book.middlewares.BookSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html #DOWNLOADER_MIDDLEWARES = { # 'book.middlewares.BookDownloaderMiddleware': 543, #} # Enable or disable 
extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { 'book.pipelines.BookPipeline': 300, } # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' -
明确目标
item.py:# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class BookItem(scrapy.Item): title = scrapy.Field() author = scrapy.Field() content = scrapy.Field() -
编写爬虫
dushu.py:# -*- coding: utf-8 -*- import scrapy from book.items import BookItem class DushuSpider(scrapy.Spider): name = 'dushu' allowed_domains = ['dushu.com'] start_urls = ['https://www.dushu.com/'] def parse(self, response): book_list = response.xpath('//div[@class="class-nav"]/a') for books in book_list: url = books.xpath('./@href').get() # 不止一个div[@class="class-nav"],当遇到/book/结束循环下面的就会被过滤 if url == '/book/': break else: url = response.urljoin(url) yield scrapy.Request(url=url, callback=self.parse_detail) def parse_detail(self, response): books = response.xpath('.//div[@class="bookslist"]/ul//li') # print(books) for book in books: title = book.xpath('.//h3/a/text()').get() print(title) author = book.xpath('./div/p[1]/text()').get() print(author) content = book.xpath('./div/p[2]/text()').get() print(content) item = BookItem() item['title'] = title item['author'] = author item['content'] = content yield item # 第一页只有一个a[@class="disabled"], 即下一页, # 第二页有两个a[@class="disabled"],第一个为上一页,第二个为下一页 next_page = response.xpath('.//div[@class="pages"]/a[@class="disabled"][2]') if next_page == []: next_page = response.xpath('.//div[@class="pages"]/a[@class="disabled"]/@href').get() else: next_page = response.xpath('.//div[@class="pages"]/a[@class="disabled"][2]/@href').get() print(next_page) # 获取下一页的链接,再一次请求,调用函数parse yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse_detail) -
编写管道
pipeline.py:# -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html class BookPipeline(object): def __init__(self): self.fp = open('books.txt', mode='w', encoding='utf-8') def open_spider(self, spider): print('爬虫开始了----------------------------------------') def process_item(self, item, spider): self.fp.write(item['title'] + ' 作者:' + item['author'] + '\n' + item['content'] + '\n' + '\n') return item def close_spider(self, spider): print('爬虫结束了-----------------------------------------') self.fp.close() -
最终效果,在当前目录中生成books.txt文件:

2. CrawlSpider
-
建立项目:

-
设置中间件
middlewares.py:默认即可 -
修改
settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for dushu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'dushu'
SPIDER_MODULES = ['dushu.spiders']
NEWSPIDER_MODULE = 'dushu.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dushu (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4039.400'
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'dushu.middlewares.DushuSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'dushu.middlewares.DushuDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'dushu.pipelines.DushuTextPipeline': 100,
'dushu.pipelines.DushuJsonPipeline': 200,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
- 明确目标
items.py:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class DushuItem(scrapy.Item):
title = scrapy.Field()
author = scrapy.Field()
content = scrapy.Field()
state = scrapy.Field()
- 编写爬虫
dsw.py:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dushu.items import DushuItem
class DswSpider(CrawlSpider):
name = 'dsw'
allowed_domains = ['dushu.com']
# 改为开始的网址
start_urls = ['https://www.dushu.com/']
rules = (
Rule(LinkExtractor(allow=r'.+/book/.+\.html'), callback='parse_detail', follow=True),
)
def parse_detail(self, response):
# 可查访问的网页,判断是否规范
# print(response.url)
book_list = response.xpath('//div[@class="bookslist"]/ul/li')
for book in book_list:
try:
title = book.xpath('//h3/a/text()').get()
author = book.xpath('./div/p[1]/text()').get()
content = book.xpath('./div/p[2]/text()').get()
# 可购状态
state = book.xpath('./div/p[3]/span/text()').get()
item = DushuItem(
title = title,
author = author,
content = content,
state = state
)
yield item
except Exception as result:
print(result)
continue
- 编写管道
pipelines.py:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# 方法一:保存为文本数据
class DushuTextPipeline(object):
def __init__(self):
self.fp = open('books.txt',mode='w',encoding='utf-8')
def open_spider(self, spider):
print('爬虫开始了--------------------------------------TEXT--------------------------------------')
def process_item(self, item, spider):
self.fp.write(item['title'] + "作者:" + item['author'] + '\n' + item['content'] + '\n' + '状态:' + item['state'] + '\n' + '\n')
return item
def close_spider(self, spider):
self.fp.close()
print('爬虫结束了-----------------------------------------TEXT-----------------------------------')
# 方法二:保存为json数据
from scrapy.exporters import JsonLinesItemExporter
class DushuJsonPipeline(object):
def __init__(self):
self.books_fp = open('books.json',mode='wb')
self.books_exporter = JsonLinesItemExporter(self.books_fp, ensure_ascii=False)
def open_spider(self, spider):
print('爬虫开始了-----------------------------JSON-----------------------------------------------')
def process_item(self, item, spider):
self.books_exporter.export_item(item)
return item
def close_spider(self, spider):
self.books_fp.close()
print('爬虫结束了-------------------------------JSON---------------------------------------------')
-
运行效果图:在管道中编写了两种储存数据的方法

3. 中间件之selenium
-
创建项目:

-
设置中间件
middlewares.py:# -*- coding: utf-8 -*- # Define here the models for your spider middleware # # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html from scrapy import signals from selenium import webdriver from scrapy.http.response.html import HtmlResponse # setting中改为自定义的中间件 class SeleniumDownloadMiddleware(object): def __init__(self): self.driver = webdriver.Chrome() # 截获scrapy发出的request,通过Chrome浏览器发出 def process_request(self, request, spider): self.driver.get(request.url) self.driver.implicitly_wait(10) source = self.driver.page_source # 把网页源代码封装成response对象,返回给爬虫 # current_url即当前访问的url response = HtmlResponse(url=self.driver.current_url, body=source, request=request,encoding='utf-8') return response -
修改
setting.py:# -*- coding: utf-8 -*- # Scrapy settings for dushu project # # For simplicity, this file contains only settings considered important or # commonly used. You can find more settings consulting the documentation: # # https://docs.scrapy.org/en/latest/topics/settings.html # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html # https://docs.scrapy.org/en/latest/topics/spider-middleware.html BOT_NAME = 'dushu' SPIDER_MODULES = ['dushu.spiders'] NEWSPIDER_MODULE = 'dushu.spiders' # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'dushu (+http://www.yourdomain.com)' # Obey robots.txt rules ROBOTSTXT_OBEY = False # Configure maximum concurrent requests performed by Scrapy (default: 16) #CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0) # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay # See also autothrottle settings and docs DOWNLOAD_DELAY = 3 # The download delay setting will honor only one of: #CONCURRENT_REQUESTS_PER_DOMAIN = 16 #CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default) #COOKIES_ENABLED = False # Disable Telnet Console (enabled by default) #TELNETCONSOLE_ENABLED = False # Override the default request headers: DEFAULT_REQUEST_HEADERS = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4039.400' } # Enable or disable spider middlewares # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html #SPIDER_MIDDLEWARES = { # 'dushu.middlewares.DushuSpiderMiddleware': 543, #} # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html DOWNLOADER_MIDDLEWARES = { # 取消原来默认的 # 'dushu.middlewares.DushuDownloaderMiddleware': 543, 
'dushu.middlewares.SeleniumDownloadMiddleware': 543, } # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html #EXTENSIONS = { # 'scrapy.extensions.telnet.TelnetConsole': None, #} # Configure item pipelines # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html ITEM_PIPELINES = { # 注释原来的 # 'dushu.pipelines.DushuPipeline': 300, 'dushu.pipelines.DushuTextPipeline': 300, } # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True # The initial download delay #AUTOTHROTTLE_START_DELAY = 5 # The maximum download delay to be set in case of high latencies #AUTOTHROTTLE_MAX_DELAY = 60 # The average number of requests Scrapy should be sending in parallel to # each remote server #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 # Enable showing throttling stats for every response received: #AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default) # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings #HTTPCACHE_ENABLED = True #HTTPCACHE_EXPIRATION_SECS = 0 #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' -
明确目标
items.py:# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class DushuItem(scrapy.Item): book_class = scrapy.Field() title = scrapy.Field() author = scrapy.Field() content = scrapy.Field() state = scrapy.Field() -
编写
dsw.py:# -*- coding: utf-8 -*- import scrapy from dushu.items import DushuItem class DswSpider(scrapy.Spider): name = 'dsw' allowed_domains = ['dushu.com'] # 改开始的网址 start_urls = ['https://www.dushu.com/'] def parse(self, response): book_list = response.xpath('//div[@class="class-nav"]/a') for books in book_list: url = books.xpath('./@href').get() # 不止一个div[@class="class-nav"],当遇到/book/结束循环下面的就会被过滤 if url == '/book/': break else: url = response.urljoin(url) yield scrapy.Request(url=url, callback=self.parse_detail) def parse_detail(self, response): books = response.xpath('.//div[@class="bookslist"]/ul//li') # print(books) for book in books: title = book.xpath('.//h3/a/text()').get() # print(title) author = book.xpath('./div/p[1]/text()').get() # print(author) content = book.xpath('./div/p[2]/text()').get() # print(content) state = book.xpath('./div/p[3]/span/text()').get() item = DushuItem() item['title'] = title item['author'] = author item['content'] = content item['state'] = state yield item # 第一页只有一个a[@class="disabled"], 即下一页, # 第二页有两个a[@class="disabled"],第一个为上一页,第二个为下一页 next_page = response.xpath('.//div[@class="pages"]/a[@class="disabled"][2]') if next_page == []: next_page = response.xpath('.//div[@class="pages"]/a[@class="disabled"]/@href').get() else: next_page = response.xpath('.//div[@class="pages"]/a[@class="disabled"][2]/@href').get() # print(next_page) # print(self.base_url + next_page) # 获取下一页的链接,再一次请求,调用函数parse yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse_detail) -
编写管道
pipelines.py:# -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html # 方法一:保存为文本数据 class DushuTextPipeline(object): def __init__(self): self.fp = open('books.txt',mode='w',encoding='utf-8') def open_spider(self, spider): print('爬虫开始了--------------------------------------TEXT--------------------------------------') def process_item(self, item, spider): self.fp.write(item['title'] + "作者:" + item['author'] + '\n' + item['content'] + '\n' + '状态:' + item['state'] + '\n' + '\n') return item def close_spider(self, spider): self.fp.close() print('爬虫结束了-----------------------------------------TEXT-----------------------------------') -
运行效果:

本文来自博客园,作者:{枫_Null},转载请注明原文链接:https://www.cnblogs.com/fengNull/articles/16663567.html

浙公网安备 33010602011771号