Crawling an Entire Site with Scrapy

1. Crawling Books the Basic Way

1.1 book.py

import scrapy


class BookSpider(scrapy.Spider):
    name = "book"
    allowed_domains = ["shicimingju.com"]
    start_urls = ["https://www.shicimingju.com/book/"]

    def parse(self, resp, **kwargs):
        # response for the start page
        a_list = resp.xpath("//div[@class='card booknark_card']/ul/li/a")
        for a in a_list:
            href = a.xpath("./@href").extract_first()
            text = a.xpath("./text()").extract_first()

            href = resp.urljoin(href)  # turn the relative href into an absolute URL
            # print(text, href)
            yield scrapy.Request(url=href, callback=self.parse_title)

    def parse_title(self, resp):
        title = resp.xpath("//h1/text()").extract_first()
        a_list = resp.xpath("//div[@class='book-mulu']/ul/li/a")
        for a in a_list:
            href = a.xpath("./@href").extract_first()
            text = a.xpath("./text()").extract_first()
            # print(title, text, href)
            href = resp.urljoin(href)

            yield scrapy.Request(url=href, callback=self.parse_detail, meta={
                "book-name": title
            })

    def parse_detail(self, resp):
        book_name = resp.meta['book-name']
        title = resp.xpath("//h1/text()").extract_first()
        content = "".join(resp.xpath("//div[@class='chapter_content']//text()").extract())
        yield {
            "book_name": book_name,
            "title": title,
            "content": content
        }
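Side note: instead of meta, newer Scrapy versions (1.7+) can pass data to the callback through cb_kwargs. Below is a minimal sketch of what the hand-off between parse_title and parse_detail could look like with that approach; the selectors are the same ones used above, only the way the book title travels changes.

    # Sketch only: drop-in variant of the two methods above, using cb_kwargs
    # (available since Scrapy 1.7) instead of meta.
    def parse_title(self, resp):
        title = resp.xpath("//h1/text()").extract_first()
        for a in resp.xpath("//div[@class='book-mulu']/ul/li/a"):
            href = resp.urljoin(a.xpath("./@href").extract_first())
            yield scrapy.Request(
                url=href,
                callback=self.parse_detail,
                cb_kwargs={"book_name": title},  # arrives as a keyword argument
            )

    def parse_detail(self, resp, book_name):
        # book_name comes in directly as a parameter instead of via resp.meta
        yield {
            "book_name": book_name,
            "title": resp.xpath("//h1/text()").extract_first(),
            "content": "".join(resp.xpath("//div[@class='chapter_content']//text()").extract()),
        }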

1.2 book2.py: the link extractor

import scrapy
# the link extractor
from scrapy.linkextractors import LinkExtractor


class Book2Spider(scrapy.Spider):
    name = "book2"
    allowed_domains = ["shicimingju.com"]
    start_urls = ["https://www.shicimingju.com/book/"]

    def parse(self, resp, **kwargs):
        # extract the URLs of the detail pages
        tiquqi = LinkExtractor(restrict_xpaths=("//div[@class='card booknark_card']/ul/li",))
        # extract the links from the page:
        # just hand it the response object and it does the rest
        links = tiquqi.extract_links(resp)
        for link in links:
            # each link object carries the URL and the anchor text
            # print(link.url, link.text)
            yield scrapy.Request(url=link.url, callback=self.parse_title)

    def parse_title(self, resp):
        title = resp.xpath("//h1/text()").extract_first()
        tiquqi = LinkExtractor(restrict_xpaths=("//div[@class='book-mulu']/ul/li",))
        links = tiquqi.extract_links(resp)
        for link in links:
            yield scrapy.Request(url=link.url, callback=self.parse_detail, meta={
                "book-name": title
            })

    def parse_detail(self, resp):
        book_name = resp.meta['book-name']
        title = resp.xpath("//h1/text()").extract_first()
        content = "".join(resp.xpath("//div[@class='chapter_content']//text()").extract())
        yield {
            "book_name": book_name,
            "title": title,
            "content": content
        }

1.3 book3.py: extracting target data while following links

import scrapy
from scrapy.linkextractors import LinkExtractor


class Book3Spider(scrapy.Spider):
    name = "book3"
    allowed_domains = ["shicimingju.com"]
    start_urls = ["https://shicimingju.com"]

    def parse(self, resp):
        # Do not actually run this one.
        # This is the logic a search engine crawler follows:
        html_url = resp.url
        html_content = resp.text
        html_text = "".join(resp.xpath("//body//text()").extract())

        # extract the target data
        title = resp.xpath("//h1[@id='zs_title']/text()").extract_first()
        zuo = resp.xpath("//div[@class='niandai_zuozhe']//text()").extract()
        content = resp.xpath("//div[@class='item_content']//text()").extract()
        if title and zuo and content:
            # this page is a poem detail page: emit the extracted fields
            yield {"title": title, "zuo": zuo, "content": content}

        # no restrictions: extract every link on the page
        tiquqi = LinkExtractor()
        links = tiquqi.extract_links(resp)
        for link in links:
            link_url = link.url
            print("==>",link_url)
            yield scrapy.Request(link_url, callback=self.parse)
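A crawl like this one, which follows every link it sees, can easily run without end. If you do want to experiment with it, Scrapy's standard settings can keep it bounded; a hedged sketch with arbitrary example values:

# Optional safety limits for an open-ended crawl (example values, tune as needed)
DEPTH_LIMIT = 3                # stop following links more than 3 hops away from start_urls
CLOSESPIDER_PAGECOUNT = 500    # the CloseSpider extension stops the crawl after ~500 responses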

1.4 book4.py: using CrawlSpider

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


# Book4Spider inherits from CrawlSpider, which itself inherits from scrapy.Spider; the overall logic here is the same as before.
class Book4Spider(CrawlSpider):
    name = "book4"
    allowed_domains = ["shicimingju.com"]
    start_urls = ["https://www.shicimingju.com/cate?cate_id=5"]

    # Rules:
    # whatever comes back for the start_urls is automatically matched against the rules below;
    # for every link a rule extracts, a request is sent and the rule's callback handles the response.

    """
    def parse(self, resp):
        lk = LinkExtractor(allow=r"Items/")
        links = lk.extract_links(resp)
        for link in links:
            yield scrapy.Request(url=link.url, callback=self.parse_detail)
    """

    rules = (
        # links pulled out by the LinkExtractor are requested automatically;
        # when each response comes back, the method named in callback handles it.
        # In effect the rule writes the parse layer for you:
        # it grabs the detail-page links and sends the requests on its own.
        Rule(LinkExtractor(restrict_xpaths=("//div[@class='shici_list_main']",)), callback="parse_detail"),
    )

    def parse_detail(self, response):
        print("哈哈")
        # 详情页的内容解析
        shangxi_content = response.xpath("//div[@class='shangxi_content']/text()").extract()
        print(shangxi_content)

pipelines.py

from itemadapter import ItemAdapter
import os


class ShuPipeline:
    def process_item(self, item, spider):
        # print(item)
        book_name = item['book_name'].replace("《", "").replace("》", "")
        title = item['title'].replace("·", "").replace(" ", "").replace("\n", "")
        content = item['content'].replace("·", "").replace(" ", "").replace("\n", "").replace("\xa0", "")

        save_dir = "书籍内容/" + book_name
        os.makedirs(save_dir, exist_ok=True)  # create the folder if it does not exist yet

        with open(save_dir + "/" + title, mode="w", encoding="utf-8") as f:
            f.write(content)
        print("file saved successfully:", book_name, title)
        return item
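One thing the pipeline above does not handle: a chapter title may contain characters that are illegal in file names (such as "/" or "?"), which would make open() fail. A small hypothetical helper, not part of the original code, that could replace the chain of .replace() calls:

import re

def sanitize(name: str) -> str:
    # strip characters that are unsafe in file names, plus whitespace and the separators removed above
    return re.sub(r'[\\/:*?"<>|\s·\xa0]', "", name)

# illustrative usage inside process_item:
#   title = sanitize(item['title'])
#   book_name = sanitize(item['book_name'])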

settings.py

# Scrapy settings for shu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "shu"

SPIDER_MODULES = ["shu.spiders"]
NEWSPIDER_MODULE = "shu.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "shu (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = "WARNING"

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-encoding": "gzip, deflate, br, zstd",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "cache-control": "no-cache",
    # looking at the cookie, the values are all Baidu analytics cookies, so there is nothing to worry about
    "cookie": "Hm_lvt_649f268280b553df1f778477ee743752=1725014889,1725623759; HMACCOUNT=0426BA43E4FD28B1; Hm_lpvt_649f268280b553df1f778477ee743752=1725624603",
    "dnt": "1",
    "pragma": "no-cache",
    "priority": "u=0, i",
    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "shu.middlewares.ShuSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "shu.middlewares.ShuDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   "shu.pipelines.ShuPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

runner.py

from scrapy.cmdline import execute

if __name__ == '__main__':
    execute("scrapy crawl book4".split())
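runner.py simply hands the command line over to Scrapy. If you prefer a pure-API entry point, CrawlerProcess does the same job; a minimal sketch (same spider name as above):

# Sketch: running the spider through CrawlerProcess instead of scrapy.cmdline
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())  # loads the project's settings.py
    process.crawl("book4")                            # spider name, as in `scrapy crawl book4`
    process.start()                                   # blocks until the crawl is finished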

2. Crawling Used-Car Data

2.1 Using a regular Spider

Note: keep the request rate under control, otherwise the site will throw a verification challenge (slider CAPTCHA) at you.
settings.py

DOWNLOAD_DELAY = 3
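If a fixed delay feels too blunt, Scrapy's built-in AutoThrottle extension (disabled by default) can adjust the delay based on how the server responds. A hedged alternative sketch with example values:

# Alternative to a fixed DOWNLOAD_DELAY: let AutoThrottle adapt the delay (example values)
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 3           # initial delay in seconds
AUTOTHROTTLE_MAX_DELAY = 10            # upper bound when the server slows down
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # aim for roughly one request in flight at a time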

che168.py

import scrapy
from scrapy.linkextractors import LinkExtractor


class ErshouSpider(scrapy.Spider):
    name = 'ershou'
    allowed_domains = ['che168.com']
    start_urls = ['https://www.che168.com/china/a0_0msdgscncgpi1ltocsp1exx0/']

    def parse(self, resp, **kwargs):
        # print(resp.text)
        # link extractor for the car detail pages; deny_domains drops the ad domain
        le = LinkExtractor(restrict_xpaths=("//ul[@class='viewlist_ul']/li/a",), deny_domains=("topicm.che168.com",))
        links = le.extract_links(resp)
        for link in links:
            yield scrapy.Request(
                url=link.url,
                callback=self.parse_detail
            )
        # pagination: follow the pager links and re-run parse on them
        le2 = LinkExtractor(restrict_xpaths=("//div[@id='listpagination']/a",))
        pages = le2.extract_links(resp)
        for page in pages:
            yield scrapy.Request(url=page.url, callback=self.parse)

    def parse_detail(self, resp, **kwargs):
        title = resp.xpath('/html/body/div[5]/div[2]/h3/text()').extract_first()
        print(title)

2.2 The LinkExtractor

LinkExtractor is a link extractor: it makes it very easy to pull URL links out of a response page. All we have to do is define the extraction rules up front.

Parameters:

    allow: takes one or more regular expressions; links matching them are extracted
    deny: takes one or more regular expressions; links matching them are excluded
    allow_domains: takes one or more domains; only links under those domains are extracted
    deny_domains: takes one or more domains; links under those domains are excluded
    restrict_xpaths: takes one or more XPath expressions; only links inside the matching regions are extracted
    restrict_css: takes one or more CSS selectors; only links inside the matching regions are extracted
    tags: takes one or more tag names to pull links from; defaults to a and area
    attrs: takes one or more attribute names to pull links from; defaults to href

Note that the extracted URLs do contain duplicates, but we do not need to care: Scrapy automatically filters out duplicate URL requests for us. A small standalone sketch of the extractor follows.
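To see the extractor in isolation, here is a self-contained sketch run against a hand-built response; the HTML and every URL in it are made up purely for illustration:

# LinkExtractor on its own, against a hand-built response (all URLs are fictitious)
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

html = b"""
<div class='list'>
  <a href='/detail/1.html'>item 1</a>
  <a href='/detail/1.html'>item 1 (duplicate)</a>
  <a href='https://ads.example.com/x'>an ad</a>
</div>
"""
resp = HtmlResponse(url="https://www.example.com/list/", body=html, encoding="utf-8")

le = LinkExtractor(
    allow=(r"/detail/\d+\.html",),        # only detail pages
    deny_domains=("ads.example.com",),    # drop links on the ad domain
    restrict_xpaths=("//div[@class='list']",),
)
for link in le.extract_links(resp):
    print(link.url, link.text)            # duplicates inside one page are already collapsed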

3. Using CrawlSpider

Scrapy provides CrawlSpider specifically for whole-site crawling.

1. Create the project

scrapy startproject qichezhijia

2. Enter the project directory

cd qichezhijia

3. Generate the crawler (CrawlSpider template)

scrapy genspider -t crawl ershouche che168.com
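The -t crawl template gives you a skeleton that looks roughly like the following (the allow pattern and the start URL are placeholders you are meant to replace; the exact boilerplate may differ slightly between Scrapy versions):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ErshoucheSpider(CrawlSpider):
    name = "ershouche"
    allowed_domains = ["che168.com"]
    start_urls = ["https://che168.com"]

    # placeholder rule from the template; adjust allow / callback / follow for the real site
    rules = (Rule(LinkExtractor(allow=r"Items/"), callback="parse_item", follow=True),)

    def parse_item(self, response):
        item = {}
        return item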

che168.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class Che168Spider(CrawlSpider):
    name = "che168"
    allowed_domains = ["che168.com"]
    start_urls = ["https://www.che168.com/china/list/#pvareaid=105575"]

    # every label that can appear on the page, recorded up front and mapped to our own field names
    temp = {
        "表显里程": "li_cheng",
        "上牌时间": "start_time",
        "挡位/排量": "pai_liang",
        "车辆所在地": "location",
        "查看限迁地": "guo_biao",
    }

    rules = (
        # one step straight to each detail page;
        # deny_domains weeds out the ad domain
        Rule(LinkExtractor(restrict_xpaths=("//ul[@class='viewlist_ul']/li",), deny_domains=("topicm.che168.com",)), callback="parse_item"),
        # handle the pagination links: no callback is given, the links are simply requested,
        # and follow=True means that when those responses come back, link extraction runs on
        # them again, so newly found links keep being matched against all the rules
        Rule(LinkExtractor(restrict_xpaths=("//div[@id='listpagination']",)), follow=True)
    )

    def parse_item(self, resp):
        print(resp.url)
        """
        负责, 详情页的解析
        """
        name = resp.xpath("//h3[@class='car-brand-name']/text()").extract_first()
        li_list = resp.xpath("//ul[@class='brand-unit-item fn-clear']/li")

        # pre-initialize so the item always has the same set of keys, even if a field is missing from the page
        item = {
            "li_cheng": "未知",
            "start_time": "未知",
            "pai_liang": "未知",
            "location": "未知",
            "guo_biao": "未知",
        }

        for li in li_list:
            sm_title = "".join(li.xpath("./p//text()").extract()).replace(" ", "")  # label text, e.g. 上牌时间, 挡位/排量
            sm_value = (li.xpath("./h4/text()").extract_first() or "").replace(" ", "")

            key = Che168Spider.temp.get(sm_title)  # map the on-page label to our own field name
            if key:  # ignore labels we did not record in temp (otherwise item[None] would be set)
                item[key] = sm_value

        print(item)
        # # find the record that is missing data
        # if item['li_cheng'] == '未知':
        #     print(resp.url)

settings.py

# Scrapy settings for che project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "che"

SPIDER_MODULES = ["che.spiders"]
NEWSPIDER_MODULE = "che.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "che (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = "WARNING"

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs

# Requesting too fast tends to trigger the slider CAPTCHA, so lower the crawl rate here.
# This is the delay (in seconds) between requests to the same site.
# If the slider does show up, a temporary workaround: open the page in a browser,
# solve the slider by hand, and copy a fresh cookie. Good enough for learning Scrapy.
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# disabled so that the cookie set in the request headers below actually takes effect
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-encoding": "gzip, deflate, br, zstd",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
    "cache-control": "no-cache",
    "cookie": "fvlid=1716458604075AU1zWGd5kDp9; sessionid=20f5e708-1a28-4787-ad6e-4236c4db2118; che_sessionid=58CF9E8D-9877-4F42-8A12-6AE0D52F95FD%7C%7C2024-05-23+18%3A03%3A24.561%7C%7Cwww.autohome.com.cn; UsedCarBrowseHistory=0%3A47354093; href=https%3A%2F%2Fwww.che168.com%2F; accessId=7a783820-ec84-11ec-b95f-79694d4df285; pageViewNum=1; area=119999; Hm_lvt_d381ec2f88158113b9b76f14c497ed48=1724676865,1725015038,1725621637; HMACCOUNT=0426BA43E4FD28B1; userarea=0; listuserarea=0; sessionip=123.125.53.102; ahpvno=7; Hm_lpvt_d381ec2f88158113b9b76f14c497ed48=1725630775; ahuuid=3016D6BE-E855-42E3-B9DA-C5C59288E270; sessionvisit=5f712ac5-36ed-4af9-b2d5-b629fbac123a; sessionvisitInfo=20f5e708-1a28-4787-ad6e-4236c4db2118|www.che168.com|105575; v_no=17; visit_info_ad=58CF9E8D-9877-4F42-8A12-6AE0D52F95FD||770DE0CE-30B8-4AEC-A464-94C767498A4C||-1||-1||17; che_ref=www.autohome.com.cn%7C0%7C100533%7C0%7C2024-09-06+21%3A52%3A55.600%7C2024-05-23+18%3A03%3A24.561; che_sessionvid=770DE0CE-30B8-4AEC-A464-94C767498A4C; showNum=16; sessionuid=20f5e708-1a28-4787-ad6e-4236c4db2118",
    "dnt": "1",
    "pragma": "no-cache",
    "priority": "u=0, i",
    "referer": "https://www.che168.com/",
    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "che.middlewares.CheSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    "che.middlewares.CheDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    "che.pipelines.ChePipeline": 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

How CrawlSpider works

The early stages are the same as for an ordinary Spider. Once the first response comes back, the links in it are automatically extracted according to the rules defined in rules, and the callback named in each rule is invoked for the resulting responses. If follow is True, the same rules are applied to those responses as well to extract further links. It is the equivalent of writing scrapy.Request(xxx, callback=self.parse) inside parse yourself; a sketch of that equivalence follows.
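To make the follow behaviour concrete: a Rule(LinkExtractor(...), callback="parse_item", follow=True) behaves roughly like the hand-written Spider below. This is only an illustrative comparison, not the real CrawlSpider implementation; the spider name and URL are made up.

# Sketch: the hand-written equivalent of a single Rule with callback and follow=True
import scrapy
from scrapy.linkextractors import LinkExtractor


class ManualFollowSpider(scrapy.Spider):
    name = "manual_follow"                      # illustrative name
    start_urls = ["https://www.example.com/"]   # placeholder URL

    le = LinkExtractor()                        # the extractor a Rule would carry

    def parse(self, resp, **kwargs):
        # what the rule does with the start_urls response: extract links and request them
        for link in self.le.extract_links(resp):
            yield scrapy.Request(link.url, callback=self.parse_item)

    def parse_item(self, resp):
        # the callback part: handle the page itself
        yield {"url": resp.url}
        # the follow=True part: run the same extractor on this response too
        for link in self.le.extract_links(resp):
            yield scrapy.Request(link.url, callback=self.parse_item)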
