Scraping an Entire Site with Scrapy
I. Scraping Books the Basic Way
1.1 book.py
import scrapy


class BookSpider(scrapy.Spider):
    name = "book"
    allowed_domains = ["shicimingju.com"]
    start_urls = ["https://www.shicimingju.com/book/"]

    def parse(self, resp, **kwargs):
        # response of the start page: the list of books
        a_list = resp.xpath("//div[@class='card booknark_card']/ul/li/a")
        for a in a_list:
            href = a.xpath("./@href").extract_first()
            text = a.xpath("./text()").extract_first()
            href = resp.urljoin(href)  # turn the relative href into an absolute url
            # print(text, href)
            yield scrapy.Request(url=href, callback=self.parse_title)

    def parse_title(self, resp):
        # book page: title plus the table of contents
        title = resp.xpath("//h1/text()").extract_first()
        a_list = resp.xpath("//div[@class='book-mulu']/ul/li/a")
        for a in a_list:
            href = a.xpath("./@href").extract_first()
            text = a.xpath("./text()").extract_first()
            # print(title, text, href)
            href = resp.urljoin(href)
            yield scrapy.Request(url=href, callback=self.parse_detail, meta={
                "book-name": title
            })

    def parse_detail(self, resp):
        # chapter page: pick the book name back up from meta
        book_name = resp.meta['book-name']
        title = resp.xpath("//h1/text()").extract_first()
        content = "".join(resp.xpath("//div[@class='chapter_content']//text()").extract())
        yield {
            "book_name": book_name,
            "title": title,
            "content": content
        }
book2.py: using a link extractor
import scrapy
# the link extractor
from scrapy.linkextractors import LinkExtractor


class Book2Spider(scrapy.Spider):
    name = "book2"
    allowed_domains = ["shicimingju.com"]
    start_urls = ["https://www.shicimingju.com/book/"]

    def parse(self, resp, **kwargs):
        # extract the detail-page urls
        tiquqi = LinkExtractor(restrict_xpaths=("//div[@class='card booknark_card']/ul/li",))
        # to extract the links on a page, just hand it the response object
        links = tiquqi.extract_links(resp)
        for link in links:
            # each link carries the url and the anchor text
            # print(link.url, link.text)
            yield scrapy.Request(url=link.url, callback=self.parse_title)

    def parse_title(self, resp):
        title = resp.xpath("//h1/text()").extract_first()
        tiquqi = LinkExtractor(restrict_xpaths=("//div[@class='book-mulu']/ul/li",))
        links = tiquqi.extract_links(resp)
        for link in links:
            yield scrapy.Request(url=link.url, callback=self.parse_detail, meta={
                "book-name": title
            })

    def parse_detail(self, resp):
        book_name = resp.meta['book-name']
        title = resp.xpath("//h1/text()").extract_first()
        content = "".join(resp.xpath("//div[@class='chapter_content']//text()").extract())
        yield {
            "book_name": book_name,
            "title": title,
            "content": content
        }
book3.py: link extraction plus target data (search-engine style)
import scrapy
from scrapy.linkextractors import LinkExtractor


class Book3Spider(scrapy.Spider):
    name = "book3"
    allowed_domains = ["shicimingju.com"]
    start_urls = ["https://shicimingju.com"]

    def parse(self, resp):
        # don't actually run this one: it mimics the logic of a search-engine crawler
        html_url = resp.url
        html_content = resp.text
        html_text = "".join(resp.xpath("//body//text()").extract())
        # extract the target data; these nodes only exist on the detail pages
        title = resp.xpath("//h1[@id='zs_title']/text()").extract_first()
        zuo = resp.xpath("//div[@class='niandai_zuozhe']//text()").extract()
        content = resp.xpath("//div[@class='item_content']//text()").extract()
        if title and zuo and content:
            yield {
                "title": title,
                "zuo": zuo,
                "content": content
            }
        # then follow every link on the page and run this same parse on it
        tiquqi = LinkExtractor()
        links = tiquqi.extract_links(resp)
        for link in links:
            link_url = link.url
            print("==>", link_url)
            yield scrapy.Request(link_url, callback=self.parse)
book4.py: using CrawlSpider
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


# Inherits from CrawlSpider, and CrawlSpider itself inherits from scrapy.Spider,
# so everything we already know about spiders still applies here.
class Book4Spider(CrawlSpider):
    name = "book4"
    allowed_domains = ["shicimingju.com"]
    start_urls = ["https://www.shicimingju.com/cate?cate_id=5"]

    # Rules: the response coming back from start_urls is automatically matched
    # against these rules and links are extracted accordingly.
    # Roughly equivalent to writing this parse() yourself:
    """
    def parse(self, resp):
        lk = LinkExtractor(allow=r"Items/")
        links = lk.extract_links(resp)
        for link in links:
            yield scrapy.Request(url=link.url, callback=self.parse_detail)
    """
    rules = (
        # the links pulled out by the link extractor are requested automatically;
        # when a response comes back, the method named in callback is invoked.
        # In effect this writes the parse layer for you: it grabs the
        # detail-page links and sends the requests.
        Rule(LinkExtractor(restrict_xpaths=("//div[@class='shici_list_main']",)), callback="parse_detail"),
    )

    def parse_detail(self, response):
        # parse the detail page
        shangxi_content = response.xpath("//div[@class='shangxi_content']/text()").extract()
        print(shangxi_content)
pipelines.py
from itemadapter import ItemAdapter
import os


class ShuPipeline:
    def process_item(self, item, spider):
        # print(item)
        # clean the fields up so they can be used as folder / file names
        book_name = item['book_name'].replace("《", "").replace("》", "")
        title = item['title'].replace("·", "").replace(" ", "").replace("\n", "")
        content = item['content'].replace("·", "").replace(" ", "").replace("\n", "").replace("\xa0", "")
        save_dir = "书籍内容/" + book_name
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)  # create the folder
        f = open(save_dir + "/" + title, mode="w", encoding="utf-8")
        f.write(content)
        f.close()
        print("file saved", book_name, title)
        return item
settings.py
# Scrapy settings for shu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "shu"
SPIDER_MODULES = ["shu.spiders"]
NEWSPIDER_MODULE = "shu.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "shu (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = "WARNING"
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-encoding": "gzip, deflate, br, zstd",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "cache-control": "no-cache",
    # looking at the cookie values, they are all analytics (Baidu) stuff; nothing to worry about
    "cookie": "Hm_lvt_649f268280b553df1f778477ee743752=1725014889,1725623759; HMACCOUNT=0426BA43E4FD28B1; Hm_lpvt_649f268280b553df1f778477ee743752=1725624603",
    "dnt": "1",
    "pragma": "no-cache",
    "priority": "u=0, i",
    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "shu.middlewares.ShuSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "shu.middlewares.ShuDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "shu.pipelines.ShuPipeline": 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
runner.py
from scrapy.cmdline import execute

if __name__ == '__main__':
    execute("scrapy crawl book4".split())
II. Scraping Used-Car Data
1. Using a regular Spider
Note: keep the request rate under control, otherwise the site will start throwing captchas (an AutoThrottle alternative is sketched right after the snippet below).
settings.py
DOWNLOAD_DELAY = 3
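Besides a fixed DOWNLOAD_DELAY, Scrapy's AutoThrottle extension (listed, commented out, further down in settings.py) can adjust the delay automatically based on server latency. A minimal sketch, with the numbers chosen arbitrarily for illustration:

# settings.py: an alternative to a hard-coded delay (the values are just examples)
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 3           # initial delay in seconds
AUTOTHROTTLE_MAX_DELAY = 10            # upper bound when the server responds slowly
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # aim for roughly one request in flight per server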
che168.py
import scrapy
from scrapy.linkextractors import LinkExtractor


class ErshouSpider(scrapy.Spider):
    name = 'ershou'
    allowed_domains = ['che168.com']
    start_urls = ['https://www.che168.com/china/a0_0msdgscncgpi1ltocsp1exx0/']

    def parse(self, resp, **kwargs):
        # print(resp.text)
        # link extractor for the detail pages; deny_domains drops the ad domain
        le = LinkExtractor(restrict_xpaths=("//ul[@class='viewlist_ul']/li/a",), deny_domains=("topicm.che168.com",))
        links = le.extract_links(resp)
        for link in links:
            yield scrapy.Request(
                url=link.url,
                callback=self.parse_detail
            )
        # pagination: extract the page links and feed them back into parse
        le2 = LinkExtractor(restrict_xpaths=("//div[@id='listpagination']/a",))
        pages = le2.extract_links(resp)
        for page in pages:
            yield scrapy.Request(url=page.url, callback=self.parse)

    def parse_detail(self, resp, **kwargs):
        title = resp.xpath('/html/body/div[5]/div[2]/h3/text()').extract_first()
        print(title)
1.2 The link extractor
LinkExtractor makes it very easy to pull URL links out of a response page; all we have to do is define the rules up front (a small example follows the parameter list).
Parameters:
allow: takes one or more regular expressions; links matching them are extracted
deny: takes one or more regular expressions; links matching them are dropped
allow_domains: takes a set of domains; only links under those domains are extracted
deny_domains: takes a set of domains; links under those domains are dropped
restrict_xpaths: takes XPath expressions; links are only extracted from the matching regions of the page
restrict_css: takes CSS selectors; same idea as restrict_xpaths
tags: takes tag names to pull links from; defaults to a and area
attrs: takes attribute names to read the link from; defaults to href
Note that ==the extracted URLs do contain duplicates, but we don't have to care: Scrapy automatically filters out duplicate URL requests for us.==
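To make the parameters concrete, here is a small sketch that combines several of them in one extractor; the allow pattern is made up purely for illustration and may not match the site's real URL scheme:

from scrapy.linkextractors import LinkExtractor

def extract_links_demo(resp):
    # resp is any scrapy Response, e.g. the one handed to parse()
    le = LinkExtractor(
        allow=(r"/book/.*?\.html",),                           # hypothetical pattern, illustration only
        deny_domains=("topicm.che168.com",),                   # never follow links into this domain
        restrict_xpaths=("//div[@class='book-mulu']/ul/li",),  # only search inside this region
        tags=("a",),                                           # the defaults, written out explicitly
        attrs=("href",),
    )
    for link in le.extract_links(resp):
        # link.url is already absolute, link.text is the anchor text
        yield link.url, link.text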
III. Using CrawlSpider
Scrapy ships with CrawlSpider specifically for whole-site crawling.
1. Create the project
scrapy startproject qichezhijia
2. Enter the project directory
cd qichezhijia
3. Create the spider (CrawlSpider template)
scrapy genspider -t crawl ershouche che168.com
che168.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class Che168Spider(CrawlSpider):
    name = "che168"
    allowed_domains = ["che168.com"]
    start_urls = ["https://www.che168.com/china/list/#pvareaid=105575"]

    # every label that may show up on a detail page, mapped to the field name we want
    temp = {
        "表显里程": "li_cheng",
        "上牌时间": "start_time",
        "挡位/排量": "pai_liang",
        "车辆所在地": "location",
        "查看限迁地": "guo_biao",
    }

    rules = (
        # go straight to the detail pages; deny_domains drops the ad domain
        Rule(LinkExtractor(restrict_xpaths=("//ul[@class='viewlist_ul']/li",), deny_domains=("topicm.che168.com",)), callback="parse_item"),
        # pagination links: follow=True means that once a listing page comes back,
        # this extractor keeps pulling new links out of it and matches them
        # against all the rules again
        Rule(LinkExtractor(restrict_xpaths=("//div[@id='listpagination']",)), follow=True)
    )

    def parse_item(self, resp):
        """parse the detail page"""
        print(resp.url)
        name = resp.xpath("//h3[@class='car-brand-name']/text()").extract_first()
        li_list = resp.xpath("//ul[@class='brand-unit-item fn-clear']/li")
        # initialize every field up front so the output format is always complete
        item = {
            "li_cheng": "未知",
            "start_time": "未知",
            "pai_liang": "未知",
            "location": "未知",
            "guo_biao": "未知",
        }
        for li in li_list:
            sm_title = "".join(li.xpath("./p//text()").extract()).replace(" ", "")  # e.g. 上牌时间, 挡位/排量
            sm_value = li.xpath("./h4/text()").extract_first().replace(" ", "")
            key = Che168Spider.temp.get(sm_title)  # map the on-page label to our field name
            item[key] = sm_value
        print(item)
        # # to track down a page that is missing data:
        # if item['li_cheng'] == '未知':
        #     print(resp.url)
settings.py
# Scrapy settings for che project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "che"
SPIDER_MODULES = ["che.spiders"]
NEWSPIDER_MODULE = "che.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "che (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = "WARNING"
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Requesting too fast easily triggers the slider captcha, so lower the request rate here:
# this is the time interval between requests.
# If the captcha does appear, a temporary workaround is to open the page in a browser,
# solve the slider by hand, and copy a fresh cookie into the headers below.
# It does not get in the way of learning Scrapy.
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# disabled so that the cookie set in the headers below takes effect
COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-encoding": "gzip, deflate, br, zstd",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "cache-control": "no-cache",
    "cookie": "fvlid=1716458604075AU1zWGd5kDp9; sessionid=20f5e708-1a28-4787-ad6e-4236c4db2118; che_sessionid=58CF9E8D-9877-4F42-8A12-6AE0D52F95FD%7C%7C2024-05-23+18%3A03%3A24.561%7C%7Cwww.autohome.com.cn; UsedCarBrowseHistory=0%3A47354093; href=https%3A%2F%2Fwww.che168.com%2F; accessId=7a783820-ec84-11ec-b95f-79694d4df285; pageViewNum=1; area=119999; Hm_lvt_d381ec2f88158113b9b76f14c497ed48=1724676865,1725015038,1725621637; HMACCOUNT=0426BA43E4FD28B1; userarea=0; listuserarea=0; sessionip=123.125.53.102; ahpvno=7; Hm_lpvt_d381ec2f88158113b9b76f14c497ed48=1725630775; ahuuid=3016D6BE-E855-42E3-B9DA-C5C59288E270; sessionvisit=5f712ac5-36ed-4af9-b2d5-b629fbac123a; sessionvisitInfo=20f5e708-1a28-4787-ad6e-4236c4db2118|www.che168.com|105575; v_no=17; visit_info_ad=58CF9E8D-9877-4F42-8A12-6AE0D52F95FD||770DE0CE-30B8-4AEC-A464-94C767498A4C||-1||-1||17; che_ref=www.autohome.com.cn%7C0%7C100533%7C0%7C2024-09-06+21%3A52%3A55.600%7C2024-05-23+18%3A03%3A24.561; che_sessionvid=770DE0CE-30B8-4AEC-A464-94C767498A4C; showNum=16; sessionuid=20f5e708-1a28-4787-ad6e-4236c4db2118",
    "dnt": "1",
    "pragma": "no-cache",
    "priority": "u=0, i",
    "referer": "https://www.che168.com/",
    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "che.middlewares.CheSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "che.middlewares.CheDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# "che.pipelines.ChePipeline": 300,
#}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
How CrawlSpider works
The early stages are the same as with a regular spider. Once the first response comes back, it is automatically run through the rules defined in rules to extract links, and each rule's callback is then invoked on the responses of those links. If follow is True, the same rules keep being applied to extract links from the new responses as well, which is equivalent to yielding scrapy.Request(xxx, callback=self.parse) inside parse.
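As a compact recap of that flow, here is a minimal hypothetical sketch (the XPaths are reused from the che168 example above). A rule with a callback and no follow only parses the pages it matches, while a rule with follow=True and no callback keeps re-applying all rules to every new listing page:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class DemoSpider(CrawlSpider):
    name = "demo"
    allowed_domains = ["che168.com"]
    start_urls = ["https://www.che168.com/china/list/"]

    rules = (
        # callback only: request the matched links and parse them with parse_item;
        # follow defaults to False when a callback is given
        Rule(LinkExtractor(restrict_xpaths=("//ul[@class='viewlist_ul']/li",)),
             callback="parse_item"),
        # follow=True, no callback: keep matching all rules against every new
        # listing page, replacing a hand-written
        # yield scrapy.Request(page_url, callback=self.parse)
        Rule(LinkExtractor(restrict_xpaths=("//div[@id='listpagination']",)),
             follow=True),
    )

    def parse_item(self, response):
        yield {"url": response.url}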
