Transformers Source Code Analysis (Part 107)

.\models\stablelm\__init__.py

# Copyright and license notice
# Copyright 2024 Stability AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Import TYPE_CHECKING for type-checking-only imports
from typing import TYPE_CHECKING

# Import the OptionalDependencyNotAvailable exception, the _LazyModule class, and the is_torch_available helper
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
)

# Define the module's import structure
_import_structure = {
    "configuration_stablelm": ["STABLELM_PRETRAINED_CONFIG_ARCHIVE_MAP", "StableLmConfig"],
}

# Check whether torch is available; if it is not, raise OptionalDependencyNotAvailable
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If torch is available, add the modeling module to the import structure
    _import_structure["modeling_stablelm"] = [
        "StableLmForCausalLM",
        "StableLmModel",
        "StableLmPreTrainedModel",
        "StableLmForSequenceClassification",
    ]

# When running under a type checker
if TYPE_CHECKING:
    # Import the public names from configuration_stablelm
    from .configuration_stablelm import STABLELM_PRETRAINED_CONFIG_ARCHIVE_MAP, StableLmConfig

    # Check again whether torch is available; if it is not, raise OptionalDependencyNotAvailable
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # If torch is available, import the public names from modeling_stablelm
        from .modeling_stablelm import (
            StableLmForCausalLM,
            StableLmForSequenceClassification,
            StableLmModel,
            StableLmPreTrainedModel,
        )

# At runtime (not type checking)
else:
    import sys

    # Replace the current module with a _LazyModule instance so that submodules are loaded lazily
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
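
The lazy-import pattern above means that importing the package only registers names; the torch-backed modeling classes are materialized on first access. A minimal usage sketch (not part of the file; the tiny hyper-parameters are my own choice so the randomly initialized model stays small):

```python
from transformers import StableLmConfig, StableLmForCausalLM

# Building the configuration never touches torch-only code paths.
tiny_config = StableLmConfig(
    vocab_size=1024,
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=4,
)

# Accessing the modeling class is what triggers the actual import of modeling_stablelm (and therefore torch);
# if torch were missing, the OptionalDependencyNotAvailable path above would surface at this point instead.
model = StableLmForCausalLM(tiny_config)
print(model.config.num_hidden_layers)  # 2
```
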

.\models\starcoder2\configuration_starcoder2.py

# Starcoder2Config stores the configuration of a Starcoder2 model
class Starcoder2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a
    Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b_16k](https://huggingface.co/bigcode/starcoder2-7b_16k) model.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    ```
    >>> from transformers import Starcoder2Model, Starcoder2Config

    >>> # Initializing a Starcoder2 7B style configuration
    >>> configuration = Starcoder2Config()

    >>> # Initializing a model from the Starcoder2 7B style configuration
    >>> model = Starcoder2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "starcoder2"  # model type identifier
    keys_to_ignore_at_inference = ["past_key_values"]  # keys dropped from the outputs at inference time

    def __init__(
        self,
        vocab_size=49152,  # vocabulary size
        hidden_size=3072,  # hidden size
        intermediate_size=12288,  # intermediate (MLP) size
        num_hidden_layers=30,  # number of hidden layers
        num_attention_heads=24,  # number of attention heads
        num_key_value_heads=2,  # number of key/value heads (grouped-query attention)
        hidden_act="gelu_pytorch_tanh",  # hidden-layer activation function
        max_position_embeddings=4096,  # maximum number of position embeddings
        initializer_range=0.018042,  # std of the weight initializer
        norm_epsilon=1e-5,  # epsilon used by the layer norms
        use_cache=True,  # whether to use the KV cache
        bos_token_id=50256,  # beginning-of-sequence token id
        eos_token_id=50256,  # end-of-sequence token id
        rope_theta=10000.0,  # RoPE (rotary position embedding) theta
        sliding_window=None,  # sliding-window size; None disables sliding-window attention
        attention_dropout=0.0,  # dropout applied to the attention probabilities
        residual_dropout=0.0,  # dropout applied to residual connections
        embedding_dropout=0.0,  # dropout applied to the embeddings
        use_bias=True,  # whether linear layers use a bias
        **kwargs,  # remaining keyword arguments forwarded to PretrainedConfig
    ):
        # Store the Transformer hyper-parameters on the configuration object
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window
        self.use_bias = use_bias
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.norm_epsilon = norm_epsilon
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.embedding_dropout = embedding_dropout

        # Call the parent PretrainedConfig constructor once, passing the special token ids
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
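
As a usage sketch (not in the original file), the configuration can be instantiated with scaled-down values; note how `num_attention_heads` and `num_key_value_heads` together define the grouped-query-attention ratio used later by the attention layers:

```python
from transformers import Starcoder2Config, Starcoder2Model

tiny_config = Starcoder2Config(
    vocab_size=1024,
    hidden_size=256,
    intermediate_size=1024,
    num_hidden_layers=2,
    num_attention_heads=8,
    num_key_value_heads=2,   # 8 query heads share 2 key/value heads -> each KV head serves 4 query heads
    max_position_embeddings=512,
    sliding_window=128,      # enable sliding-window attention for this toy model
)
model = Starcoder2Model(tiny_config)
print(sum(p.numel() for p in model.parameters()))  # parameter count of the toy model
```
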

.\models\starcoder2\modeling_starcoder2.py

# File encoding: UTF-8
# Copyright notice: this code is based on EleutherAI's GPT-NeoX library and has been modified to accommodate
# architectural differences compared to the models trained by the Meta AI team.
# Licensed under the Apache License, Version 2.0; you may not use this file except in compliance with the License.
# A copy of the License is available at http://www.apache.org/licenses/LICENSE-2.0
#
# This file defines the PyTorch Starcoder2 model.

import inspect  # 导入 inspect 模块用于获取对象信息
import math  # 导入 math 模块提供的数学函数
import warnings  # 导入 warnings 模块用于处理警告
from typing import List, Optional, Tuple, Union  # 导入类型提示相关的模块

import torch  # 导入 PyTorch
import torch.nn.functional as F  # 导入 PyTorch 中的函数模块
import torch.utils.checkpoint  # 导入 PyTorch 中用于实现checkpoint的模块
from torch import nn  # 导入 PyTorch 中的神经网络模块
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss  # 导入 PyTorch 中的损失函数

from ...activations import ACT2FN  # 导入激活函数 ACT2FN
from ...cache_utils import Cache, DynamicCache  # 导入缓存工具类
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa  # 导入处理注意力掩码的函数
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast  # 导入模型输出类
from ...modeling_utils import PreTrainedModel  # 导入预训练模型类
from ...utils import (  # 导入工具函数
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal_2_10,
    logging,
    replace_return_docstrings,
)
from .configuration_starcoder2 import Starcoder2Config  # Import the Starcoder2 configuration class


if is_flash_attn_2_available():
    # 如果可用 flash attention 2,则导入相关函数和模块
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

    # 检查 flash attention 是否支持窗口大小参数
    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)

# 获取 logger 对象
logger = logging.get_logger(__name__)

# 文档中的配置信息
_CONFIG_FOR_DOC = "Starcoder2Config"


# Copied from transformers.models.llama.modeling_llama._get_unpad_data
# Computes the metadata needed to strip padding, based on the attention mask
def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)  # true length of each sequence in the batch
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()  # flat indices of non-padding tokens
    max_seqlen_in_batch = seqlens_in_batch.max().item()  # length of the longest sequence in the batch
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))  # cumulative sequence lengths, prefixed with 0
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )
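
A quick illustration (my own example, not from the file) of what these quantities look like for a padded batch of two sequences with true lengths 3 and 2, padded to length 4:

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])
seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)                      # tensor([3, 2])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()  # tensor([0, 1, 2, 4, 5])
cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))  # tensor([0, 3, 5])
max_seqlen = seqlens.max().item()                                            # 3

# `indices` selects the real (non-padding) tokens out of the flattened batch, and `cu_seqlens` gives the
# cumulative boundaries that flash_attn_varlen_func expects for the packed, padding-free layout.
print(indices, cu_seqlens, max_seqlen)
```
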


# 从 transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding 复制的类
# 定义 Starcoder2 的旋转嵌入类
class Starcoder2RotaryEmbedding(nn.Module):
    # 初始化函数,用于初始化一个位置编码器对象
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        # 调用父类的初始化方法
        super().__init__()

        # 设置对象的维度、最大位置嵌入数量和基数
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

        # 计算位置编码中的频率倒数,使用设备上的浮点运算
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        
        # 将频率倒数作为缓冲区注册到对象中,不持久化
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # 为了使 `torch.jit.trace` 正常工作,在这里构建余弦和正弦缓存
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    # 设置余弦和正弦缓存的函数
    def _set_cos_sin_cache(self, seq_len, device, dtype):
        # 设置缓存的最大序列长度
        self.max_seq_len_cached = seq_len

        # 生成一个从0到最大序列长度的张量,使用与频率倒数相同的设备和数据类型
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)

        # 计算频率矩阵
        freqs = torch.outer(t, self.inv_freq)
        
        # 按照论文的描述,拼接余弦和正弦的矩阵,以在位置编码中使用
        emb = torch.cat((freqs, freqs), dim=-1)

        # 将余弦和正弦矩阵注册为对象的缓冲区,使用指定的数据类型
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    # 前向传播函数,接受输入张量 x 和可选的序列长度 seq_len
    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]

        # 如果传入的序列长度大于当前缓存的最大序列长度,则重新设置余弦和正弦缓存
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        # 返回当前缓存中的余弦和正弦值,截取到指定的序列长度,使用输入张量的数据类型
        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )
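
A self-contained sketch (toy values of my own choosing, independent of the class above) that reproduces the cos/sin cache construction for a head dimension of 8 and 4 positions:

```python
import torch

dim, base, seq_len = 8, 10000.0, 4
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))  # (dim/2,) inverse frequencies
t = torch.arange(seq_len).float()                                   # position indices
freqs = torch.outer(t, inv_freq)                                    # (seq_len, dim/2) rotation angles
emb = torch.cat((freqs, freqs), dim=-1)                             # (seq_len, dim), duplicated for the two halves
cos_cached, sin_cached = emb.cos(), emb.sin()
print(cos_cached.shape, sin_cached.shape)  # torch.Size([4, 8]) torch.Size([4, 8]) -- one row per position
```
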
# Copied from transformers.models.llama.modeling_llama.rotate_half
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    # First half of the last dimension
    x1 = x[..., : x.shape[-1] // 2]
    # Second half of the last dimension
    x2 = x[..., x.shape[-1] // 2 :]
    # Concatenate (-x2, x1): the second half is negated and moved in front of the first half
    return torch.cat((-x2, x1), dim=-1)


# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    # Index cos/sin with the position ids and unsqueeze along the requested dimension for broadcasting
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
    # Rotate the query and key tensors
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
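
A shape-level sketch of how the two functions above are used together, calling `apply_rotary_pos_emb` as defined above; the cos/sin tensors here are placeholder caches (in the model they come from `Starcoder2RotaryEmbedding`), so this only checks broadcasting and shapes, not numerical values:

```python
import torch

batch, heads, seq_len, head_dim = 1, 2, 4, 8
q = torch.randn(batch, heads, seq_len, head_dim)
k = torch.randn(batch, heads, seq_len, head_dim)
position_ids = torch.arange(seq_len).unsqueeze(0)  # (1, seq_len)

cos = torch.randn(16, head_dim)  # placeholder cache of shape (max_seq_len, head_dim)
sin = torch.randn(16, head_dim)

# cos[position_ids] has shape (1, seq_len, head_dim); unsqueeze_dim=1 turns it into (1, 1, seq_len, head_dim)
# so it broadcasts against the (batch, heads, seq_len, head_dim) query/key tensors.
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
print(q_rot.shape, k_rot.shape)  # both remain (1, 2, 4, 8)
```
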


class Starcoder2MLP(nn.Module):
    def __init__(self, config: Starcoder2Config):
        super().__init__()
        embed_dim = config.hidden_size
        # 定义线性变换层c_fc,输入维度为embed_dim,输出维度为config.intermediate_size
        self.c_fc = nn.Linear(embed_dim, config.intermediate_size, bias=config.use_bias)
        # 定义线性变换层c_proj,输入维度为config.intermediate_size,输出维度为embed_dim
        self.c_proj = nn.Linear(config.intermediate_size, embed_dim, bias=config.use_bias)
        # 激活函数,根据配置选择不同的激活函数
        self.act = ACT2FN[config.hidden_act]
        # 残差连接中的dropout概率
        self.residual_dropout = config.residual_dropout

    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
        # 线性变换c_fc
        hidden_states = self.c_fc(hidden_states)
        # 应用激活函数
        hidden_states = self.act(hidden_states)
        # 线性变换c_proj
        hidden_states = self.c_proj(hidden_states)
        # 应用dropout操作
        hidden_states = nn.functional.dropout(hidden_states, p=self.residual_dropout, training=self.training)
        return hidden_states


# Copied from transformers.models.llama.modeling_llama.repeat_kv
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from
    (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim).
    """
    # Unpack the input shape
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    # Nothing to do if each key/value head already maps to a single query head
    if n_rep == 1:
        return hidden_states
    # Insert a new axis and expand it so each key/value head is replicated n_rep times
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    # Collapse back to (batch, num_attention_heads, seqlen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
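
A small check (my own example) of the expansion performed by `repeat_kv` for grouped-query attention, with 2 key/value heads repeated 4 times to match 8 query heads:

```python
import torch

kv = torch.randn(1, 2, 5, 16)      # (batch, num_key_value_heads, seq_len, head_dim)
expanded = repeat_kv(kv, n_rep=4)  # uses the function defined above
print(expanded.shape)              # torch.Size([1, 8, 5, 16])

# Head j of the expanded tensor is a copy of key/value head j // 4 of the original tensor.
print(torch.equal(expanded[:, 0], kv[:, 0]), torch.equal(expanded[:, 3], kv[:, 0]))  # True True
```
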
# Starcoder2Attention implements multi-headed attention as an nn.Module subclass.
class Starcoder2Attention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".
    """

    # 初始化方法,接受 Starcoder2Config 类型的配置参数和一个可选的层索引参数 layer_idx
    def __init__(self, config: Starcoder2Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config  # 存储传入的配置对象
        self.layer_idx = layer_idx  # 存储传入的层索引,可选参数

        # 如果未提供层索引,发出警告
        if layer_idx is None:
            logger.warning_once(
                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )

        # 初始化模型需要的各种参数和属性
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.use_bias = config.use_bias
        self.is_causal = True  # 固定为 True
        self.attention_dropout = config.attention_dropout
        self.residual_dropout = config.residual_dropout

        # 检查 hidden_size 是否可以被 num_heads 整除,否则抛出 ValueError
        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        # 定义线性变换层,用于生成查询、键、值以及输出
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=self.use_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=self.use_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=self.use_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=self.use_bias)

        # 初始化旋转嵌入层,用于增强注意力机制的表达能力
        self.rotary_emb = Starcoder2RotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    # 前向传播方法,接受输入的 hidden_states 和一些可选的参数,返回经过注意力机制处理后的输出
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
# Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Starcoder2
class Starcoder2FlashAttention2(Starcoder2Attention):
    """
    Starcoder2 flash attention module. This module inherits from `Starcoder2Attention` as the weights of the module
    stay untouched. The only required change would be on the forward pass where it needs to correctly call the public
    API of flash attention and deal with padding tokens in case the input contains any of them.
    """

    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is a bottom-right
        # alignment, which became the default for flash_attn>=2.1. This attribute is used to handle the difference.
        # Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a
        # wrong (top-left aligned) mask.
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
    
    # Ignore copy
    # Forward pass
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,

    # Helper that performs the forward pass with the flash-attention kernels
    def _flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        query_length,
        dropout=0.0,
        softmax_scale=None,
        use_sliding_windows=False,
    # 定义一个方法 _upad_input,接受多个输入参数:query_layer, key_layer, value_layer, attention_mask, query_length
    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        # 从 key_layer 的形状中获取 batch_size, kv_seq_len, num_heads, head_dim 四个变量
        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape

        # 如果 kv_seq_len 不等于 attention_mask 的最后一个维度大小
        # 则重新创建适当的 padding mask,通过切片在正确的位置进行调整
        if kv_seq_len != attention_mask.shape[-1]:
            attention_mask_num_tokens = attention_mask.shape[-1]
            attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]

        # 调用 _get_unpad_data 方法获取索引、当前序列长度和批次中的最大序列长度
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)

        # 将 key_layer 重塑为形状为 (batch_size * kv_seq_len, num_heads, head_dim) 的张量,并根据 indices_k 进行索引操作
        key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
        # 将 value_layer 重塑为形状为 (batch_size * kv_seq_len, num_heads, head_dim) 的张量,并根据 indices_k 进行索引操作
        value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)

        # 如果 query_length 等于 kv_seq_len
        if query_length == kv_seq_len:
            # 将 query_layer 重塑为形状为 (batch_size * kv_seq_len, num_heads, head_dim) 的张量,并根据 indices_k 进行索引操作
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        # 如果 query_length 等于 1
        elif query_length == 1:
            # 设置 max_seqlen_in_batch_q 为 1
            max_seqlen_in_batch_q = 1
            # 创建 cu_seqlens_q 张量,包含从 0 到 batch_size 的整数,数据类型为 torch.int32,存储在 query_layer 的设备上
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # 这里有一个 memcpy,非常糟糕。
            # 设置 indices_q 为 cu_seqlens_q 的前 n-1 项
            indices_q = cu_seqlens_q[:-1]
            # 压缩 query_layer 的第一个维度
            query_layer = query_layer.squeeze(1)
        else:
            # 使用 attention_mask 的 -query_length: 切片假定左填充,调整 attention_mask
            attention_mask = attention_mask[:, -query_length:]
            # 调用 unpad_input 方法处理 query_layer 和调整后的 attention_mask,返回解压后的输入
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        # 返回处理后的 query_layer, key_layer, value_layer, indices_q, cu_seqlens, max_seqlen_in_batch
        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )
# Starcoder2SdpaAttention 类定义,继承自 Starcoder2Attention 类
class Starcoder2SdpaAttention(Starcoder2Attention):
    """
    Starcoder2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `Starcoder2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    """

    # forward 方法重写,定义了 attention 模块的前向传播过程
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ):

# STARCODER2_ATTENTION_CLASSES 是一个字典,用于存储不同 attention 实现的类名及其对应的类对象
STARCODER2_ATTENTION_CLASSES = {
    "eager": Starcoder2Attention,
    "flash_attention_2": Starcoder2FlashAttention2,
    "sdpa": Starcoder2SdpaAttention,  # 将 Starcoder2SdpaAttention 类添加到字典中的 sdpa 键下
}
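
A hedged sketch of how this dispatch table is consumed: `config._attn_implementation` is normally set internally when the model is created (for example via `attn_implementation="sdpa"` in `from_pretrained`), and the decoder layer below simply looks the class up in the dictionary defined above. Forcing the attribute by hand here is only for illustration:

```python
from transformers import Starcoder2Config

config = Starcoder2Config(hidden_size=256, num_attention_heads=8, num_key_value_heads=2, num_hidden_layers=2)
config._attn_implementation = "sdpa"  # normally chosen by the loading machinery, forced here for the example

attn_cls = STARCODER2_ATTENTION_CLASSES[config._attn_implementation]
attn = attn_cls(config, layer_idx=0)
print(type(attn).__name__)  # Starcoder2SdpaAttention
```
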

# Starcoder2DecoderLayer 类定义,继承自 nn.Module
class Starcoder2DecoderLayer(nn.Module):
    # 构造函数,初始化模型参数
    def __init__(self, config: Starcoder2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        # 初始化 self attention 模块,根据 config._attn_implementation 决定使用哪个具体的 attention 类
        self.self_attn = STARCODER2_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)

        # 初始化 MLP 模块
        self.mlp = Starcoder2MLP(config)

        # 初始化输入层归一化层
        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
        
        # 初始化 attention 后归一化层
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)

    # forward 方法重写,定义了解码器层的前向传播过程
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        # Warn that `padding_mask` is deprecated in favour of `attention_mask`
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """

        # 保存输入张量作为残差连接的基准
        residual = hidden_states

        # 输入层归一化
        hidden_states = self.input_layernorm(hidden_states)

        # 自注意力机制
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )

        # 残差连接和层归一化
        hidden_states = residual + hidden_states

        # 全连接层
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)

        # 残差连接和输出
        hidden_states = residual + hidden_states

        # 构建输出元组
        outputs = (hidden_states,)

        # 如果需要返回注意力权重,则添加到输出中
        if output_attentions:
            outputs += (self_attn_weights,)

        # 如果需要使用缓存,则添加当前键值状态到输出中
        if use_cache:
            outputs += (present_key_value,)

        # 返回最终输出
        return outputs
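
The control flow above is the standard pre-norm residual block: `x = x + Attn(LN1(x))` followed by `x = x + MLP(LN2(x))`. A condensed, self-contained sketch with the attention stubbed out by an identity layer (an assumption made purely to keep the example short):

```python
import torch
from torch import nn

class PreNormBlockSketch(nn.Module):
    """Mirrors Starcoder2DecoderLayer's residual structure, with attention replaced by an identity stub."""

    def __init__(self, hidden_size: int):
        super().__init__()
        self.input_layernorm = nn.LayerNorm(hidden_size)
        self.post_attention_layernorm = nn.LayerNorm(hidden_size)
        self.self_attn = nn.Identity()  # stand-in for Starcoder2Attention
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, 4 * hidden_size), nn.GELU(), nn.Linear(4 * hidden_size, hidden_size)
        )

    def forward(self, x):
        x = x + self.self_attn(self.input_layernorm(x))      # residual connection around attention
        x = x + self.mlp(self.post_attention_layernorm(x))   # residual connection around the MLP
        return x

print(PreNormBlockSketch(32)(torch.randn(2, 5, 32)).shape)  # torch.Size([2, 5, 32])
```
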
# STARCODER2_START_DOCSTRING 是一个字符串,包含了关于 Starcoder2 模型的文档字符串,描述了模型的继承关系和参数说明
STARCODER2_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`Starcoder2Config`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# add_start_docstrings 是一个装饰器函数,用来为函数或类添加文档字符串注释
@add_start_docstrings(
    "The bare Starcoder2 Model outputting raw hidden-states without any specific head on top.",  # 描述该类的主要功能
    STARCODER2_START_DOCSTRING,  # 引用上面定义的 STARCODER2_START_DOCSTRING
)
# Starcoder2PreTrainedModel 类,继承自 PreTrainedModel,表示 Starcoder2 模型的基本预训练模型
class Starcoder2PreTrainedModel(PreTrainedModel):
    config_class = Starcoder2Config  # 设置模型的配置类为 Starcoder2Config
    base_model_prefix = "model"  # 设置模型的基础模型前缀为 "model"
    supports_gradient_checkpointing = True  # 支持梯度检查点
    _no_split_modules = ["Starcoder2DecoderLayer"]  # 列出不需要分割的模块名
    _skip_keys_device_placement = "past_key_values"  # 设备放置时跳过的键名
    _supports_flash_attn_2 = True  # 支持 Flash Attention 2
    _supports_sdpa = True  # 支持 Scaled Dot-Product Attention (SDPA)
    _supports_cache_class = True  # 支持缓存类

    # 初始化权重函数,根据模块类型进行权重初始化
    def _init_weights(self, module):
        std = self.config.initializer_range  # 获取配置中的初始化范围
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)  # 初始化线性层的权重
            if module.bias is not None:
                module.bias.data.zero_()  # 初始化线性层的偏置
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)  # 初始化嵌入层的权重
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()  # 将填充索引处的权重置零

# STARCODER2_INPUTS_DOCSTRING is left empty in this excerpt (the full inputs docstring is omitted here)
STARCODER2_INPUTS_DOCSTRING = r"""
"""

# add_start_docstrings 是一个装饰器函数,用来为函数或类添加文档字符串注释
@add_start_docstrings(
    "The bare Starcoder2 Model outputting raw hidden-states without any specific head on top.",  # 描述该类的主要功能
    STARCODER2_START_DOCSTRING,  # 引用上面定义的 STARCODER2_START_DOCSTRING
)
# Starcoder2Model 类,继承自 Starcoder2PreTrainedModel,表示 Starcoder2 模型的具体实现
class Starcoder2Model(Starcoder2PreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Starcoder2DecoderLayer`]

    Args:
        config: Starcoder2Config
    """
    # 初始化函数,接受一个配置对象作为参数
    def __init__(self, config: Starcoder2Config):
        # 调用父类的初始化方法
        super().__init__(config)
        # 设置填充索引为配置对象中的填充标记 ID
        self.padding_idx = config.pad_token_id
        # 设置词汇表大小为配置对象中的词汇表大小
        self.vocab_size = config.vocab_size

        # 创建一个词嵌入层对象,使用配置对象中的参数:词汇表大小、隐藏层大小、填充索引
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        # 设置嵌入层的 dropout 率为配置对象中的嵌入层 dropout 率
        self.embedding_dropout = config.embedding_dropout
        # 创建一个由多个解码层组成的层对象列表,每个解码层由配置对象和层索引创建
        self.layers = nn.ModuleList(
            [Starcoder2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        # 设置注意力机制的实现方式为配置对象中的注意力实现方式
        self._attn_implementation = config._attn_implementation
        # 创建一个 LayerNorm 层,用于归一化隐藏层输出,使用配置对象中的隐藏层大小和归一化 epsilon
        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
        # 禁用梯度检查点
        self.gradient_checkpointing = False
        # 调用初始化后处理方法,用于初始化权重和应用最终处理
        self.post_init()

    # 获取输入嵌入层对象的方法
    def get_input_embeddings(self):
        return self.embed_tokens

    # 设置输入嵌入层对象的方法
    def set_input_embeddings(self, value):
        self.embed_tokens = value

    # 重写的 forward 方法,用于模型的前向传播
    @add_start_docstrings_to_model_forward(STARCODER2_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
# Copied from transformers.models.mistral.modeling_mistral.MistralForCausalLM with MISTRAL->STARCODER2, Mistral->Starcoder2
class Starcoder2ForCausalLM(Starcoder2PreTrainedModel):
    # 定义了共享权重的键名列表,此处只包括 lm_head.weight
    _tied_weights_keys = ["lm_head.weight"]

    # 初始化函数,接受一个配置对象 config 作为参数
    def __init__(self, config):
        # 调用父类的初始化函数
        super().__init__(config)
        # 创建一个 Starcoder2Model 对象作为模型的基础
        self.model = Starcoder2Model(config)
        # 设置词汇表大小为配置中指定的大小
        self.vocab_size = config.vocab_size
        # 创建一个线性层 lm_head,用于生成词汇表中词的预测
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # 调用后处理函数,用于初始化权重并进行最终处理
        self.post_init()

    # 返回模型的输入嵌入
    def get_input_embeddings(self):
        return self.model.embed_tokens

    # 设置模型的输入嵌入
    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    # 返回模型的输出嵌入,即 lm_head 线性层
    def get_output_embeddings(self):
        return self.lm_head

    # 设置模型的输出嵌入,即 lm_head 线性层
    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    # 设置解码器部分的模型
    def set_decoder(self, decoder):
        self.model = decoder

    # 返回解码器部分的模型
    def get_decoder(self):
        return self.model

    # 前向传播函数,接受多个输入参数并返回模型的输出结果,带有文档字符串注释和返回值注释
    @add_start_docstrings_to_model_forward(STARCODER2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # 函数主体未提供,由后续代码块定义

    # 准备生成输入的辅助函数,接受多个输入参数,用于生成模型的输入
    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        # 函数主体未提供,由后续代码块定义
        # 如果 past_key_values 不为 None,则处理它覆盖的令牌
        if past_key_values is not None:
            # 如果 past_key_values 是 Cache 类型的实例
            if isinstance(past_key_values, Cache):
                # 获取缓存的序列长度和已处理的令牌长度
                cache_length = past_key_values.get_seq_length()
                past_length = past_key_values.seen_tokens
                # 获取缓存的最大长度
                max_cache_length = past_key_values.get_max_length()
            else:
                # 否则,从 past_key_values 中获取缓存的长度和已处理的令牌长度
                cache_length = past_length = past_key_values[0][0].shape[2]
                # 最大缓存长度设为 None
                max_cache_length = None

            # 保留未处理的令牌:
            # 1 - 如果 attention_mask 的长度超过 input_ids 的长度,则说明一些输入仅作为缓存传递(例如当 input_embeds 作为输入时)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - 如果 past_length 小于 input_ids 的长度,则 input_ids 包含所有输入令牌。根据 past_length 可以丢弃 input_ids 的部分令牌。
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - 否则(past_length >= input_ids.shape[1]),假设 input_ids 只包含未处理的令牌。

            # 如果即将超过最大缓存长度,需要裁剪输入的 attention_mask。
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        # 获取 kwargs 中的 position_ids 参数,如果不存在并且 attention_mask 存在,则动态创建 position_ids 以用于批次生成
        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            # 如果有 past_key_values,则仅保留与 input_ids 相关的 position_ids
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # 如果传入 inputs_embeds,则仅在第一次生成步骤中使用它们
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        # 更新 model_inputs 字典中的参数
        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        # 返回最终的 model_inputs 字典作为函数的输出
        return model_inputs
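
A small numeric illustration (my own, not from the file) of the `position_ids` trick used above: a cumulative sum over the attention mask, minus one, yields token positions, and padded slots are then clamped to 1 so left-padded batches still get correct positions for their real tokens:

```python
import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1],   # left-padded sequence of true length 3
                               [1, 1, 1, 1, 1]])  # full-length sequence
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
# During incremental decoding only the trailing columns aligned with the new input_ids are kept.
```
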
    # Reorders the cached past key/value states to match the new beam order during beam search
    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        # 初始化一个空的元组 reordered_past 用于存储重新排序后的过去键值
        reordered_past = ()
        # 遍历 past_key_values 中的每一层的过去键值
        for layer_past in past_key_values:
            # 对每一层的 past_state 应用索引选择操作,根据 beam_idx 重新排列
            # 并将重新排序后的结果添加到 reordered_past 中
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        # 返回重新排序后的 past_key_values
        return reordered_past
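
A toy sketch of the reordering performed above: for every cached tensor, the batch dimension is permuted to follow the surviving beams.

```python
import torch

layer_past = (torch.arange(6, dtype=torch.float32).reshape(3, 1, 2, 1),)  # (batch=3, heads=1, seq=2, head_dim=1)
beam_idx = torch.tensor([2, 0, 1])                                        # new order of the beams
reordered = tuple(past_state.index_select(0, beam_idx) for past_state in layer_past)
print(reordered[0].squeeze())
# tensor([[4., 5.],
#         [0., 1.],
#         [2., 3.]])  -- the cache rows now follow the order given by beam_idx
```
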
# The following class puts a sequence-classification head (a linear layer) on top of the Starcoder2 model.
# [`Starcoder2ForSequenceClassification`] classifies using the last token, as other causal models (e.g. GPT-2) do:
# if a `pad_token_id` is defined in the configuration, the last non-padding token of each row is used; otherwise
# (including when `inputs_embeds` is passed instead of `input_ids`), the last value of each row in the batch is used.

@add_start_docstrings(
    """
    The Starcoder2 Model transformer with a sequence classification head on top (linear layer).

    [`Starcoder2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """,
    STARCODER2_START_DOCSTRING,
)
# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Starcoder2, LLAMA->STARCODER2
class Starcoder2ForSequenceClassification(Starcoder2PreTrainedModel):
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__(config)
        # 初始化时指定分类的类别数目
        self.num_labels = config.num_labels
        # 创建 Starcoder2Model 模型实例
        self.model = Starcoder2Model(config)
        # 初始化一个线性层,用于分类,输入大小为 config.hidden_size,输出大小为 num_labels,无偏置项
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # 初始化权重并应用最终处理
        self.post_init()

    # 获取输入的嵌入表示
    def get_input_embeddings(self):
        return self.model.embed_tokens

    # 设置输入的嵌入表示
    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @add_start_docstrings_to_model_forward(STARCODER2_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,

.\models\starcoder2\__init__.py

# 导入类型检查相关模块
from typing import TYPE_CHECKING

# 导入必要的依赖和模块
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
)

# 定义模块的导入结构
_import_structure = {
    "configuration_starcoder2": ["STARCODER2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Starcoder2Config"],
}

# 检查是否存在 Torch 库,如果不存在则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果 Torch 可用,添加模型相关的导入结构
    _import_structure["modeling_starcoder2"] = [
        "Starcoder2ForCausalLM",
        "Starcoder2Model",
        "Starcoder2PreTrainedModel",
        "Starcoder2ForSequenceClassification",
    ]

# 如果类型检查开启
if TYPE_CHECKING:
    # 导入配置相关的模块和类
    from .configuration_starcoder2 import STARCODER2_PRETRAINED_CONFIG_ARCHIVE_MAP, Starcoder2Config

    # 再次检查 Torch 是否可用,若不可用则忽略
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 导入模型相关的模块和类
        from .modeling_starcoder2 import (
            Starcoder2ForCausalLM,
            Starcoder2ForSequenceClassification,
            Starcoder2Model,
            Starcoder2PreTrainedModel,
        )

# 如果不是类型检查模式
else:
    import sys

    # 将当前模块替换为 LazyModule,用于惰性加载
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

.\models\superpoint\configuration_superpoint.py

# 导入必要的模块和类
from typing import List

from ...configuration_utils import PretrainedConfig  # 导入预训练配置类
from ...utils import logging  # 导入日志工具

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 定义预训练模型与其配置文件的映射关系
SUPERPOINT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "magic-leap-community/superpoint": "https://huggingface.co/magic-leap-community/superpoint/blob/main/config.json"
}

# 定义 SuperPointConfig 类,用于存储 SuperPointForKeypointDetection 模型的配置信息
class SuperPointConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SuperPointForKeypointDetection`]. It is used to instantiate a
    SuperPoint model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the SuperPoint
    [magic-leap-community/superpoint](https://huggingface.co/magic-leap-community/superpoint) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        encoder_hidden_sizes (`List`, *optional*, defaults to `[64, 64, 128, 128]`):
            The number of channels in each convolutional layer in the encoder.
        decoder_hidden_size (`int`, *optional*, defaults to 256): The hidden size of the decoder.
        keypoint_decoder_dim (`int`, *optional*, defaults to 65): The output dimension of the keypoint decoder.
        descriptor_decoder_dim (`int`, *optional*, defaults to 256): The output dimension of the descriptor decoder.
        keypoint_threshold (`float`, *optional*, defaults to 0.005):
            The threshold to use for extracting keypoints.
        max_keypoints (`int`, *optional*, defaults to -1):
            The maximum number of keypoints to extract. If `-1`, will extract all keypoints.
        nms_radius (`int`, *optional*, defaults to 4):
            The radius for non-maximum suppression.
        border_removal_distance (`int`, *optional*, defaults to 4):
            The distance from the border to remove keypoints.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    Example:
    ```
    >>> from transformers import SuperPointConfig, SuperPointForKeypointDetection

    >>> # Initializing a SuperPoint superpoint style configuration
    >>> configuration = SuperPointConfig()

    >>> # Initializing a model from the superpoint style configuration
    >>> model = SuperPointForKeypointDetection(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "superpoint"  # model type identifier

    # Constructor: configures the SuperPoint model hyper-parameters
    def __init__(
        self,
        encoder_hidden_sizes: List[int] = [64, 64, 128, 128],  # 编码器隐藏层大小列表
        decoder_hidden_size: int = 256,  # 解码器隐藏层大小
        keypoint_decoder_dim: int = 65,  # 关键点解码器维度
        descriptor_decoder_dim: int = 256,  # 描述符解码器维度
        keypoint_threshold: float = 0.005,  # 关键点阈值
        max_keypoints: int = -1,  # 最大关键点数
        nms_radius: int = 4,  # 非极大值抑制半径
        border_removal_distance: int = 4,  # 边界去除距离
        initializer_range=0.02,  # 初始化器范围
        **kwargs,  # 其他关键字参数
    ):
        # 将参数赋值给对象的属性
        self.encoder_hidden_sizes = encoder_hidden_sizes
        self.decoder_hidden_size = decoder_hidden_size
        self.keypoint_decoder_dim = keypoint_decoder_dim
        self.descriptor_decoder_dim = descriptor_decoder_dim
        self.keypoint_threshold = keypoint_threshold
        self.max_keypoints = max_keypoints
        self.nms_radius = nms_radius
        self.border_removal_distance = border_removal_distance
        self.initializer_range = initializer_range

        # 调用父类的构造方法,传递其他关键字参数
        super().__init__(**kwargs)
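
A minimal usage sketch (assuming the SuperPoint classes are exposed by the installed transformers version): tightening the keypoint selection via the configuration and building a randomly initialized model from it.

```python
from transformers import SuperPointConfig, SuperPointForKeypointDetection

config = SuperPointConfig(max_keypoints=512, keypoint_threshold=0.01)  # keep at most 512 keypoints per image
model = SuperPointForKeypointDetection(config)                         # randomly initialized, config-only
print(config.encoder_hidden_sizes, config.decoder_hidden_size)         # [64, 64, 128, 128] 256
```
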

.\models\superpoint\convert_superpoint_to_pytorch.py

# 导入所需的模块和库
import argparse  # 用于解析命令行参数
import os  # 提供与操作系统交互的功能

import requests  # 发送 HTTP 请求的库
import torch  # 提供深度学习框架
from PIL import Image  # Python Imaging Library,用于图像处理

from transformers import SuperPointConfig, SuperPointForKeypointDetection, SuperPointImageProcessor

# 创建并返回一个 SuperPointConfig 对象,配置模型的各种参数
def get_superpoint_config():
    config = SuperPointConfig(
        encoder_hidden_sizes=[64, 64, 128, 128],  # 编码器各层隐藏单元数
        decoder_hidden_size=256,  # 解码器隐藏层单元数
        keypoint_decoder_dim=65,  # 关键点解码器维度
        descriptor_decoder_dim=256,  # 描述符解码器维度
        keypoint_threshold=0.005,  # 关键点检测阈值
        max_keypoints=-1,  # 最大关键点数目
        nms_radius=4,  # 非最大抑制半径
        border_removal_distance=4,  # 边缘去除距离
        initializer_range=0.02,  # 参数初始化范围
    )
    return config

# 创建并返回一个包含权重重命名信息的列表,用于加载预训练模型
def create_rename_keys(config, state_dict):
    rename_keys = []

    # 编码器权重
    rename_keys.append(("conv1a.weight", "encoder.conv_blocks.0.conv_a.weight"))
    rename_keys.append(("conv1b.weight", "encoder.conv_blocks.0.conv_b.weight"))
    rename_keys.append(("conv2a.weight", "encoder.conv_blocks.1.conv_a.weight"))
    rename_keys.append(("conv2b.weight", "encoder.conv_blocks.1.conv_b.weight"))
    rename_keys.append(("conv3a.weight", "encoder.conv_blocks.2.conv_a.weight"))
    rename_keys.append(("conv3b.weight", "encoder.conv_blocks.2.conv_b.weight"))
    rename_keys.append(("conv4a.weight", "encoder.conv_blocks.3.conv_a.weight"))
    rename_keys.append(("conv4b.weight", "encoder.conv_blocks.3.conv_b.weight"))
    rename_keys.append(("conv1a.bias", "encoder.conv_blocks.0.conv_a.bias"))
    rename_keys.append(("conv1b.bias", "encoder.conv_blocks.0.conv_b.bias"))
    rename_keys.append(("conv2a.bias", "encoder.conv_blocks.1.conv_a.bias"))
    rename_keys.append(("conv2b.bias", "encoder.conv_blocks.1.conv_b.bias"))
    rename_keys.append(("conv3a.bias", "encoder.conv_blocks.2.conv_a.bias"))
    rename_keys.append(("conv3b.bias", "encoder.conv_blocks.2.conv_b.bias"))
    rename_keys.append(("conv4a.bias", "encoder.conv_blocks.3.conv_a.bias"))
    rename_keys.append(("conv4b.bias", "encoder.conv_blocks.3.conv_b.bias"))

    # 关键点解码器权重
    rename_keys.append(("convPa.weight", "keypoint_decoder.conv_score_a.weight"))
    rename_keys.append(("convPb.weight", "keypoint_decoder.conv_score_b.weight"))
    rename_keys.append(("convPa.bias", "keypoint_decoder.conv_score_a.bias"))
    rename_keys.append(("convPb.bias", "keypoint_decoder.conv_score_b.bias"))

    # 描述符解码器权重
    # 将 ("convDa.weight", "descriptor_decoder.conv_descriptor_a.weight") 元组添加到 rename_keys 列表中
    rename_keys.append(("convDa.weight", "descriptor_decoder.conv_descriptor_a.weight"))
    # 将 ("convDb.weight", "descriptor_decoder.conv_descriptor_b.weight") 元组添加到 rename_keys 列表中
    rename_keys.append(("convDb.weight", "descriptor_decoder.conv_descriptor_b.weight"))
    # 将 ("convDa.bias", "descriptor_decoder.conv_descriptor_a.bias") 元组添加到 rename_keys 列表中
    rename_keys.append(("convDa.bias", "descriptor_decoder.conv_descriptor_a.bias"))
    # 将 ("convDb.bias", "descriptor_decoder.conv_descriptor_b.bias") 元组添加到 rename_keys 列表中
    rename_keys.append(("convDb.bias", "descriptor_decoder.conv_descriptor_b.bias"))
    
    # 返回 rename_keys 列表,该列表包含了需要重命名的键值对元组
    return rename_keys
# 重命名字典中的键
def rename_key(dct, old, new):
    # 弹出旧键对应的值
    val = dct.pop(old)
    # 将值与新键关联起来
    dct[new] = val

# 准备图片数据
def prepare_imgs():
    # 第一张图片的 URL
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # 获取第一张图片的二进制数据并打开为图像对象
    im1 = Image.open(requests.get(url, stream=True).raw)
    # 第二张图片的 URL
    url = "http://images.cocodataset.org/test-stuff2017/000000004016.jpg"
    # 获取第二张图片的二进制数据并打开为图像对象
    im2 = Image.open(requests.get(url, stream=True).raw)
    # 返回图片对象列表
    return [im1, im2]

# 用于禁用 Torch 的梯度计算
@torch.no_grad()
def convert_superpoint_checkpoint(checkpoint_url, pytorch_dump_folder_path, save_model, push_to_hub, test_mode=False):
    """
    Copy/paste/tweak model's weights to our SuperPoint structure.
    """

    # 打印信息:从检查点下载原始模型
    print("Downloading original model from checkpoint...")
    # 获取 SuperPoint 模型的配置信息
    config = get_superpoint_config()

    # 从 URL 加载原始模型的状态字典
    original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)

    # 打印信息:转换模型参数
    print("Converting model parameters...")
    
    # 创建重命名键列表
    rename_keys = create_rename_keys(config, original_state_dict)
    # 复制原始状态字典
    new_state_dict = original_state_dict.copy()
    # 遍历重命名键列表,为新状态字典重命名键
    for src, dest in rename_keys:
        rename_key(new_state_dict, src, dest)

    # 加载 HuggingFace 模型
    model = SuperPointForKeypointDetection(config)
    # 加载新的状态字典到模型
    model.load_state_dict(new_state_dict)
    # 设置模型为评估模式
    model.eval()
    # 打印信息:成功加载模型的权重
    print("Successfully loaded weights in the model")

    # 实例化 SuperPoint 图像处理器
    preprocessor = SuperPointImageProcessor()
    # 准备输入数据:使用 prepare_imgs 函数准备的图像数据
    inputs = preprocessor(images=prepare_imgs(), return_tensors="pt")
    # 模型推理:获取模型的输出结果
    outputs = model(**inputs)

    # 如果 test_mode 为 True,则检查模型输出是否与原始结果匹配
    if test_mode:
        # 计算非零值的数量,以确保模型输出与原始结果匹配
        torch.count_nonzero(outputs.mask[0])
        # 期望的关键点形状和分数形状
        expected_keypoints_shape = (2, 830, 2)
        expected_scores_shape = (2, 830)
        expected_descriptors_shape = (2, 830, 256)

        # 期望的关键点、分数和描述子的值
        expected_keypoints_values = torch.tensor([[480.0, 9.0], [494.0, 9.0], [489.0, 16.0]])
        expected_scores_values = torch.tensor([0.0064, 0.0140, 0.0595, 0.0728, 0.5170, 0.0175, 0.1523, 0.2055, 0.0336])
        expected_descriptors_value = torch.tensor(-0.1096)
        
        # 断言:检查模型输出的关键点、分数和描述子是否与预期匹配
        assert outputs.keypoints.shape == expected_keypoints_shape
        assert outputs.scores.shape == expected_scores_shape
        assert outputs.descriptors.shape == expected_descriptors_shape

        assert torch.allclose(outputs.keypoints[0, :3], expected_keypoints_values, atol=1e-3)
        assert torch.allclose(outputs.scores[0, :9], expected_scores_values, atol=1e-3)
        assert torch.allclose(outputs.descriptors[0, 0, 0], expected_descriptors_value, atol=1e-3)
        # 打印信息:模型输出与原始结果匹配
        print("Model outputs match the original results!")
    # 如果需要保存模型
    if save_model:
        # 打印信息:保存模型到本地
        print("Saving model to local...")
        
        # 如果指定的路径不存在文件夹,则创建文件夹
        if not os.path.isdir(pytorch_dump_folder_path):
            os.mkdir(pytorch_dump_folder_path)

        # 将模型保存到指定路径
        model.save_pretrained(pytorch_dump_folder_path)
        # 将预处理器保存到指定路径
        preprocessor.save_pretrained(pytorch_dump_folder_path)

        # Name under which the model is saved / pushed
        model_name = "superpoint"

        # Optionally push the model and the image processor to the hub
        if push_to_hub:
            print(f"Pushing {model_name} to the hub...")
            # Push the model to the hub under the chosen name
            model.push_to_hub(model_name)
            # Push the preprocessor to the hub under the same name
            preprocessor.push_to_hub(model_name)
if __name__ == "__main__":
    # 如果脚本作为主程序执行,开始执行以下代码块
    
    parser = argparse.ArgumentParser()
    # 创建一个参数解析器对象

    # 必选参数
    parser.add_argument(
        "--checkpoint_url",
        default="https://github.com/magicleap/SuperPointPretrainedNetwork/raw/master/superpoint_v1.pth",
        type=str,
        help="URL of the original SuperPoint checkpoint you'd like to convert.",
    )
    # 添加一个命令行参数,用于指定 SuperPoint 模型的原始检查点的下载地址

    parser.add_argument(
        "--pytorch_dump_folder_path",
        default="model",
        type=str,
        help="Path to the output PyTorch model directory.",
    )
    # 添加一个命令行参数,用于指定输出 PyTorch 模型的目录路径

    parser.add_argument("--save_model", action="store_true", help="Save model to local")
    # 添加一个命令行参数,指定是否将模型保存到本地

    parser.add_argument("--push_to_hub", action="store_true", help="Push model and image preprocessor to the hub")
    # 添加一个命令行参数,指定是否将模型和图像预处理器推送到某个中心化的平台(比如模型仓库)

    args = parser.parse_args()
    # 解析命令行参数,并将结果存储在 args 变量中

    convert_superpoint_checkpoint(
        args.checkpoint_url, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub
    )
    # 调用函数 convert_superpoint_checkpoint,传递命令行参数中的相关选项作为参数

.\models\superpoint\image_processing_superpoint.py

# 版权声明和许可证信息
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""SuperPoint 的图像处理类。"""

from typing import Dict, Optional, Union  # 导入类型提示模块

import numpy as np  # 导入NumPy库

from ... import is_vision_available, requires_backends  # 导入可视化模块相关函数
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict  # 导入图像处理相关函数和类
from ...image_transforms import resize, to_channel_dimension_format  # 导入图像变换函数
from ...image_utils import (  # 导入图像工具函数
    ChannelDimension,
    ImageInput,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
)
from ...utils import TensorType, logging  # 导入工具函数和日志模块

if is_vision_available():  # 检查视觉模块是否可用
    import PIL  # 如果可用,则导入PIL库

logger = logging.get_logger(__name__)  # 获取日志记录器

def is_grayscale(
    image: ImageInput,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
    # 检查图像是否为灰度图像的函数
    if input_data_format == ChannelDimension.FIRST:  # 如果输入数据格式是通道维度在最前面
        return np.all(image[0, ...] == image[1, ...]) and np.all(image[1, ...] == image[2, ...])  # 检查RGB通道是否相同
    elif input_data_format == ChannelDimension.LAST:  # 如果输入数据格式是通道维度在最后面
        return np.all(image[..., 0] == image[..., 1]) and np.all(image[..., 1] == image[..., 2])  # 检查RGB通道是否相同

def convert_to_grayscale(
    image: ImageInput,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> ImageInput:
    """
    使用NTSC公式将图像转换为灰度格式。仅支持numpy和PIL Image。TODO:支持torch和tensorflow的灰度转换

    该函数本应返回一个单通道图像,但由于在https://github.com/huggingface/transformers/pull/25786#issuecomment-1730176446中讨论的问题,
    它返回一个每个通道都具有相同值的三通道图像。

    Args:
        image (Image):
            要转换的图像。
        input_data_format (`ChannelDimension`或`str`,*可选*):
            输入图像的通道维度格式。
    """
    requires_backends(convert_to_grayscale, ["vision"])  # 要求后端支持视觉处理

    if isinstance(image, np.ndarray):  # 如果图像是NumPy数组
        if input_data_format == ChannelDimension.FIRST:  # 如果输入数据格式是通道维度在最前面
            # 使用NTSC公式将RGB图像转换为灰度图像
            gray_image = image[0, ...] * 0.2989 + image[1, ...] * 0.5870 + image[2, ...] * 0.1140
            gray_image = np.stack([gray_image] * 3, axis=0)  # 将灰度图像复制为三通道的形式
        elif input_data_format == ChannelDimension.LAST:  # 如果输入数据格式是通道维度在最后面
            # 使用NTSC公式将RGB图像转换为灰度图像
            gray_image = image[..., 0] * 0.2989 + image[..., 1] * 0.5870 + image[..., 2] * 0.1140
            gray_image = np.stack([gray_image] * 3, axis=-1)  # 将灰度图像复制为三通道的形式
        return gray_image  # 返回灰度图像
    # 如果参数 `image` 不是 `PIL.Image.Image` 类型的实例,则直接返回该参数
    if not isinstance(image, PIL.Image.Image):
        return image
    
    # 将图像转换为灰度图像
    image = image.convert("L")
    
    # 返回转换后的图像
    return image
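
下面补充一个简短的用法草图(非原始源码,仅作演示;假设在本模块上下文中执行,且已安装 vision 依赖/PIL):用一张随机的 channels-last RGB 图像验证 `convert_to_grayscale` 与 `is_grayscale` 的配合。

```python
import numpy as np

# 构造 channels-last 的随机 RGB 图像;转换为灰度后三个通道的值应完全相同
rgb = np.random.randint(0, 256, size=(32, 32, 3)).astype(np.float32)
gray = convert_to_grayscale(rgb, input_data_format=ChannelDimension.LAST)

print(gray.shape)                                                    # (32, 32, 3)
print(is_grayscale(gray, input_data_format=ChannelDimension.LAST))   # True
```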
# 定义一个名为 SuperPointImageProcessor 的类,继承自 BaseImageProcessor 类
class SuperPointImageProcessor(BaseImageProcessor):
    r"""
    Constructs a SuperPoint image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            控制是否将图像的高度和宽度调整为指定的 `size`。可以在 `preprocess` 方法中被 `do_resize` 覆盖。
        size (`Dict[str, int]` *optional*, defaults to `{"height": 480, "width": 640}`):
            调整后的输出图像的分辨率。仅在 `do_resize` 设置为 `True` 时有效。可以在 `preprocess` 方法中被 `size` 覆盖。
        do_rescale (`bool`, *optional*, defaults to `True`):
            是否按指定的比例因子 `rescale_factor` 对图像进行重新缩放。可以在 `preprocess` 方法中被 `do_rescale` 覆盖。
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            如果重新缩放图像,则使用的比例因子。可以在 `preprocess` 方法中被 `rescale_factor` 覆盖。
    """

    # 定义类属性 model_input_names,表示模型输入的名称列表,这里只包含 "pixel_values"
    model_input_names = ["pixel_values"]

    # 定义初始化方法,接受一系列参数来配置 SuperPointImageProcessor 的实例
    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        **kwargs,
    ) -> None:
        # 调用父类 BaseImageProcessor 的初始化方法,并传递额外的关键字参数
        super().__init__(**kwargs)
        
        # 如果未提供 size 参数,则使用默认的 {"height": 480, "width": 640}
        size = size if size is not None else {"height": 480, "width": 640}
        
        # 调用辅助函数 get_size_dict,根据默认设置将 size 转换为包含高度和宽度的字典,确保不强制为正方形
        size = get_size_dict(size, default_to_square=False)

        # 将参数赋值给实例变量
        self.do_resize = do_resize
        self.size = size
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor

    # 定义 resize 方法,用于调整图像大小
    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ):
        """
        Resize an image.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dictionary of the form `{"height": int, "width": int}`, specifying the size of the output image.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the output image. If not provided, it will be inferred from the input
                image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        # 根据指定的大小字典,获取调整后的大小(可能会保持长宽比)
        size = get_size_dict(size, default_to_square=False)

        # 调用 resize 函数进行图像调整,传入调整后的大小、数据格式和其他关键字参数
        return resize(
            image,
            size=(size["height"], size["width"]),
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )
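
下面是一个简短的调用草图(非原始源码,仅作演示;假设在本模块上下文中执行):直接调用上面的 `resize` 方法,输出分辨率由 `size` 字典决定。

```python
import numpy as np

processor = SuperPointImageProcessor()
image = np.random.rand(300, 400, 3).astype(np.float32)   # channels-last 的随机图像

resized = processor.resize(image, size={"height": 480, "width": 640})
print(resized.shape)   # 预期 (480, 640, 3),通道维度格式保持不变
```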

    def preprocess(
        self,
        images,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: ChannelDimension = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,

.\models\superpoint\modeling_superpoint.py

# 版权声明和许可信息,指明代码归属和使用许可
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""PyTorch SuperPoint model."""

# 引入必要的库和模块
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
from torch import nn

# 引入 transformers 库中的模块和类
from transformers import PreTrainedModel
from transformers.modeling_outputs import (
    BaseModelOutputWithNoAttention,
)
from transformers.models.superpoint.configuration_superpoint import SuperPointConfig

# 引入内部工具函数和类
from ...pytorch_utils import is_torch_greater_or_equal_than_1_13
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)

# 获取日志记录器
logger = logging.get_logger(__name__)

# 用于文档的配置和检查点名称
_CONFIG_FOR_DOC = "SuperPointConfig"
_CHECKPOINT_FOR_DOC = "magic-leap-community/superpoint"

# 预训练模型的存档列表
SUPERPOINT_PRETRAINED_MODEL_ARCHIVE_LIST = ["magic-leap-community/superpoint"]

# 从图像中移除靠近边界的关键点的函数
def remove_keypoints_from_borders(
    keypoints: torch.Tensor, scores: torch.Tensor, border: int, height: int, width: int
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Removes keypoints (and their associated scores) that are too close to the border"""
    mask_h = (keypoints[:, 0] >= border) & (keypoints[:, 0] < (height - border))
    mask_w = (keypoints[:, 1] >= border) & (keypoints[:, 1] < (width - border))
    mask = mask_h & mask_w
    return keypoints[mask], scores[mask]

# 保留具有最高分数的 k 个关键点的函数
def top_k_keypoints(keypoints: torch.Tensor, scores: torch.Tensor, k: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Keeps the k keypoints with highest score"""
    if k >= len(keypoints):
        return keypoints, scores
    scores, indices = torch.topk(scores, k, dim=0)
    return keypoints[indices], scores

# 应用非最大抑制算法的函数,用于处理关键点的分数
def simple_nms(scores: torch.Tensor, nms_radius: int) -> torch.Tensor:
    """Applies non-maximum suppression on scores"""
    if nms_radius < 0:
        raise ValueError("Expected positive values for nms_radius")

    def max_pool(x):
        return nn.functional.max_pool2d(x, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius)

    zeros = torch.zeros_like(scores)
    max_mask = scores == max_pool(scores)
    for _ in range(2):
        supp_mask = max_pool(max_mask.float()) > 0
        supp_scores = torch.where(supp_mask, zeros, scores)
        new_max_mask = supp_scores == max_pool(supp_scores)
        max_mask = max_mask | (new_max_mask & (~supp_mask))
    return torch.where(max_mask, scores, zeros)
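
下面用一个小例子(非原始源码,仅作演示;假设在本模块上下文中执行)展示 `simple_nms` 的效果:半径范围内只有局部最大值被保留,相邻的次峰会被置零。

```python
# 构造 1x1x5x5 的得分图:主峰 0.9 与次峰 0.5 相邻,半径为 1 的 NMS 只保留主峰
scores = torch.zeros(1, 1, 5, 5)
scores[0, 0, 2, 2] = 0.9
scores[0, 0, 2, 3] = 0.5

suppressed = simple_nms(scores, nms_radius=1)
print(suppressed[0, 0])   # 仅 (2, 2) 处保留 0.9,其余位置为 0
```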

# 定义一个输出类,继承自 ModelOutput,用于描述图像中的点的信息
@dataclass
class ImagePointDescriptionOutput(ModelOutput):
    """
    图像关键点描述模型输出的基类。由于关键点检测的性质,关键点数量在图像之间可以不固定,这使得批处理变得复杂。
    在图像批处理中,将关键点的最大数量设置为关键点、分数和描述符张量的维度。掩码张量用于指示关键点、分数和描述符张量中的哪些值是关键点信息,哪些是填充。
    
    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            解码器模型最后一层输出的隐藏状态序列。
        keypoints (`torch.FloatTensor` of shape `(batch_size, num_keypoints, 2)`):
            给定图像中预测关键点的相对(x,y)坐标。
        scores (`torch.FloatTensor` of shape `(batch_size, num_keypoints)`):
            预测关键点的分数。
        descriptors (`torch.FloatTensor` of shape `(batch_size, num_keypoints, descriptor_size)`):
            预测关键点的描述符。
        mask (`torch.BoolTensor` of shape `(batch_size, num_keypoints)`):
            指示关键点、分数和描述符张量中哪些值是关键点信息的掩码。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, 返回当 `output_hidden_states=True` 传递或 `config.output_hidden_states=True` 时):
            `torch.FloatTensor` 元组(如果模型有嵌入层,则为输出的一个 + 每个阶段的一个)。模型在每个阶段输出的隐藏状态(也称为特征图)。
    """
    
    # 最后一层隐藏状态,默认为 None
    last_hidden_state: torch.FloatTensor = None
    # 关键点坐标,默认为 None(可选)
    keypoints: Optional[torch.IntTensor] = None
    # 关键点分数,默认为 None(可选)
    scores: Optional[torch.FloatTensor] = None
    # 关键点描述符,默认为 None(可选)
    descriptors: Optional[torch.FloatTensor] = None
    # 关键点掩码,默认为 None(可选)
    mask: Optional[torch.BoolTensor] = None
    # 隐藏状态,默认为 None(可选)
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
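
下面是一个简短的草图(非原始源码,仅作示意),说明批处理输出中 `mask` 的典型用法:用掩码取回每张图像真实的关键点,丢弃填充部分。

```python
# 假设 outputs 是该输出类的一个实例(例如 SuperPointForKeypointDetection 的返回值)
def keypoints_for_image(outputs, i):
    valid = outputs.mask[i].bool()          # (num_keypoints,),True 表示真实关键点
    return (
        outputs.keypoints[i][valid],        # (n_i, 2)
        outputs.scores[i][valid],           # (n_i,)
        outputs.descriptors[i][valid],      # (n_i, descriptor_size)
    )
```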
class SuperPointConvBlock(nn.Module):
    def __init__(
        self, config: SuperPointConfig, in_channels: int, out_channels: int, add_pooling: bool = False
    ) -> None:
        super().__init__()
        # 定义第一个卷积层,输入通道数为in_channels,输出通道数为out_channels,使用3x3的卷积核
        self.conv_a = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        # 定义第二个卷积层,输入输出通道数均为out_channels,使用3x3的卷积核
        self.conv_b = nn.Conv2d(
            out_channels,
            out_channels,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        # ReLU激活函数,inplace=True表示原地操作,节省内存
        self.relu = nn.ReLU(inplace=True)
        # 如果add_pooling为True,则定义最大池化层,池化核大小为2x2,步长为2
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2) if add_pooling else None

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 对输入进行第一层卷积和ReLU激活
        hidden_states = self.relu(self.conv_a(hidden_states))
        # 对结果进行第二层卷积和ReLU激活
        hidden_states = self.relu(self.conv_b(hidden_states))
        # 如果定义了池化层,则对结果进行最大池化操作
        if self.pool is not None:
            hidden_states = self.pool(hidden_states)
        return hidden_states


class SuperPointEncoder(nn.Module):
    """
    SuperPoint encoder module. It is made of 4 convolutional layers with ReLU activation and max pooling, reducing the
     dimensionality of the image.
    """

    def __init__(self, config: SuperPointConfig) -> None:
        super().__init__()
        # SuperPoint使用单通道图像
        self.input_dim = 1

        conv_blocks = []
        # 添加第一个卷积块,使用SuperPointConvBlock定义的卷积结构,添加了最大池化
        conv_blocks.append(
            SuperPointConvBlock(config, self.input_dim, config.encoder_hidden_sizes[0], add_pooling=True)
        )
        # 添加中间的卷积块,遍历encoder_hidden_sizes并构建多个SuperPointConvBlock实例
        for i in range(1, len(config.encoder_hidden_sizes) - 1):
            conv_blocks.append(
                SuperPointConvBlock(
                    config, config.encoder_hidden_sizes[i - 1], config.encoder_hidden_sizes[i], add_pooling=True
                )
            )
        # 添加最后一个卷积块,不添加最大池化
        conv_blocks.append(
            SuperPointConvBlock(
                config, config.encoder_hidden_sizes[-2], config.encoder_hidden_sizes[-1], add_pooling=False
            )
        )
        # 将所有卷积块封装为ModuleList
        self.conv_blocks = nn.ModuleList(conv_blocks)

    def forward(
        self,
        input,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutputWithNoAttention]:
        all_hidden_states = () if output_hidden_states else None

        # 对每个卷积块进行前向传播,保存所有隐藏状态(如果需要)
        for conv_block in self.conv_blocks:
            input = conv_block(input)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (input,)
        output = input
        # 根据return_dict返回不同的输出格式
        if not return_dict:
            return tuple(v for v in [output, all_hidden_states] if v is not None)

        return BaseModelOutputWithNoAttention(
            last_hidden_state=output,
            hidden_states=all_hidden_states,
        )
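
作为参考,下面的小草图(非原始源码,仅作演示;假设在本模块上下文中执行、使用默认的 SuperPointConfig)检查编码器的空间下采样:4 个卷积块中前 3 个带 2x2 最大池化,因此高宽各缩小为原来的 1/8。

```python
config = SuperPointConfig()
encoder = SuperPointEncoder(config)

pixel = torch.randn(1, 1, 480, 640)        # SuperPoint 使用单通道输入
out = encoder(pixel, return_dict=True)
print(out.last_hidden_state.shape)         # 预期 (1, config.encoder_hidden_sizes[-1], 60, 80)
```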


class SuperPointInterestPointDecoder(nn.Module):
    """
    The SuperPointInterestPointDecoder uses the output of the SuperPointEncoder to compute the keypoint with scores.
    """
    def __init__(self, config: SuperPointConfig) -> None:
        super().__init__()
        self.keypoint_threshold = config.keypoint_threshold
        self.max_keypoints = config.max_keypoints
        self.nms_radius = config.nms_radius
        self.border_removal_distance = config.border_removal_distance

        self.relu = nn.ReLU(inplace=True)  # 初始化 ReLU 激活函数
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)  # 初始化最大池化层
        self.conv_score_a = nn.Conv2d(
            config.encoder_hidden_sizes[-1],  # 输入通道数为编码器最后隐藏层的大小
            config.decoder_hidden_size,  # 输出通道数为解码器隐藏层的大小
            kernel_size=3, stride=1, padding=1,  # 使用 3x3 的卷积核,填充为1
        )
        self.conv_score_b = nn.Conv2d(
            config.decoder_hidden_size,  # 输入通道数为解码器隐藏层的大小
            config.keypoint_decoder_dim,  # 输出通道数为关键点解码器的维度大小
            kernel_size=1, stride=1, padding=0  # 使用 1x1 的卷积核,无填充
        )

    def forward(self, encoded: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        scores = self._get_pixel_scores(encoded)
        keypoints, scores = self._extract_keypoints(scores)

        return keypoints, scores

    def _get_pixel_scores(self, encoded: torch.Tensor) -> torch.Tensor:
        """根据编码器输出,计算图像每个像素点的分数"""
        scores = self.relu(self.conv_score_a(encoded))  # 使用 ReLU 激活函数对卷积结果进行非线性处理
        scores = self.conv_score_b(scores)  # 继续卷积操作
        scores = nn.functional.softmax(scores, 1)[:, :-1]  # 对最后一维进行 softmax 操作,生成概率分布
        batch_size, _, height, width = scores.shape
        scores = scores.permute(0, 2, 3, 1).reshape(batch_size, height, width, 8, 8)  # 调整张量维度
        scores = scores.permute(0, 1, 3, 2, 4).reshape(batch_size, height * 8, width * 8)  # 再次调整张量维度
        scores = simple_nms(scores, self.nms_radius)  # 对分数进行简单的非极大值抑制处理
        return scores

    def _extract_keypoints(self, scores: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """根据分数提取关键点像素,用于描述符计算"""
        _, height, width = scores.shape

        # 根据分数阈值筛选关键点
        keypoints = torch.nonzero(scores[0] > self.keypoint_threshold)
        scores = scores[0][tuple(keypoints.t())]

        # 去除靠近图像边界的关键点
        keypoints, scores = remove_keypoints_from_borders(
            keypoints, scores, self.border_removal_distance, height * 8, width * 8
        )

        # 保留分数最高的 k 个关键点
        if self.max_keypoints >= 0:
            keypoints, scores = top_k_keypoints(keypoints, scores, self.max_keypoints)

        # 将 (y, x) 转换为 (x, y)
        keypoints = torch.flip(keypoints, [1]).float()

        return keypoints, scores
class SuperPointDescriptorDecoder(nn.Module):
    """
    The SuperPointDescriptorDecoder uses the outputs of both the SuperPointEncoder and the
    SuperPointInterestPointDecoder to compute the descriptors at the keypoints locations.

    The descriptors are first computed by a convolutional layer, then normalized to have a norm of 1. The descriptors
    are then interpolated at the keypoints locations.
    """

    def __init__(self, config: SuperPointConfig) -> None:
        super().__init__()

        # ReLU 激活函数,inplace=True 表示原地操作
        self.relu = nn.ReLU(inplace=True)
        # 最大池化层,kernel_size=2 表示池化核大小为 2x2,stride=2 表示步幅为 2
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        
        # 第一个描述符卷积层,输入通道数为 config.encoder_hidden_sizes[-1],输出通道数为 config.decoder_hidden_size,卷积核大小为 3x3
        self.conv_descriptor_a = nn.Conv2d(
            config.encoder_hidden_sizes[-1],
            config.decoder_hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        # 第二个描述符卷积层,输入通道数为 config.decoder_hidden_size,输出通道数为 config.descriptor_decoder_dim,卷积核大小为 1x1
        self.conv_descriptor_b = nn.Conv2d(
            config.decoder_hidden_size,
            config.descriptor_decoder_dim,
            kernel_size=1,
            stride=1,
            padding=0,
        )

    def forward(self, encoded: torch.Tensor, keypoints: torch.Tensor) -> torch.Tensor:
        """Based on the encoder output and the keypoints, compute the descriptors for each keypoint"""
        # 计算描述符,先经过第一个卷积层和 ReLU 激活函数,再经过第二个卷积层
        descriptors = self.conv_descriptor_b(self.relu(self.conv_descriptor_a(encoded)))
        # 对描述符进行 L2 归一化,dim=1 表示在通道维度进行归一化
        descriptors = nn.functional.normalize(descriptors, p=2, dim=1)

        # 插值计算描述符在关键点位置处的值
        descriptors = self._sample_descriptors(keypoints[None], descriptors[0][None], 8)[0]

        # 将描述符的维度从 [descriptor_dim, num_keypoints] 转置为 [num_keypoints, descriptor_dim]
        descriptors = torch.transpose(descriptors, 0, 1)

        return descriptors

    @staticmethod
    def _sample_descriptors(keypoints, descriptors, scale: int = 8) -> torch.Tensor:
        """Interpolate descriptors at keypoint locations"""
        batch_size, num_channels, height, width = descriptors.shape
        # 调整关键点位置,将其缩放并归一化到 (-1, 1) 的范围内
        keypoints = keypoints - scale / 2 + 0.5
        divisor = torch.tensor([[(width * scale - scale / 2 - 0.5), (height * scale - scale / 2 - 0.5)]])
        divisor = divisor.to(keypoints)
        keypoints /= divisor
        keypoints = keypoints * 2 - 1  # normalize to (-1, 1)
        kwargs = {"align_corners": True} if is_torch_greater_or_equal_than_1_13 else {}
        # 使用双线性插值在描述符上进行网格采样,调整关键点位置
        keypoints = keypoints.view(batch_size, 1, -1, 2)
        descriptors = nn.functional.grid_sample(descriptors, keypoints, mode="bilinear", **kwargs)
        # 调整描述符的形状 [batch_size, descriptor_decoder_dim, num_channels, num_keypoints] -> [batch_size, descriptor_decoder_dim, num_keypoints]
        descriptors = descriptors.reshape(batch_size, num_channels, -1)
        # 对描述符进行 L2 归一化,dim=1 表示在通道维度进行归一化
        descriptors = nn.functional.normalize(descriptors, p=2, dim=1)
        return descriptors
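
`_sample_descriptors` 的核心是把关键点坐标归一化到 (-1, 1),再用 `grid_sample` 做双线性插值。下面是一个独立的小演示(非原始源码,归一化公式做了简化,仅示意这一步),在特征图的指定像素位置取值:

```python
import torch
import torch.nn.functional as F

feat = torch.arange(2 * 4 * 4, dtype=torch.float32).reshape(1, 2, 4, 4)
points_xy = torch.tensor([[1.0, 2.0], [3.0, 0.0]])   # 像素坐标 (x, y)

# align_corners=True 时,像素 0 映射到 -1,像素 W-1 / H-1 映射到 1
grid = points_xy.clone()
grid[:, 0] = grid[:, 0] / (feat.shape[-1] - 1) * 2 - 1
grid[:, 1] = grid[:, 1] / (feat.shape[-2] - 1) * 2 - 1
grid = grid.view(1, 1, -1, 2)

sampled = F.grid_sample(feat, grid, mode="bilinear", align_corners=True)
print(sampled.shape)          # torch.Size([1, 2, 1, 2]):每个点取到一个 2 维特征
print(sampled[0, :, 0, 0])    # 等于 feat[0, :, 2, 1],即 (x=1, y=2) 处的值
```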


class SuperPointPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
    config_class = SuperPointConfig
    base_model_prefix = "superpoint"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = False

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        # 如果 module 是线性层或卷积层
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # 使用正态分布初始化权重,均值为 0,标准差为配置中的初始化范围
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果存在偏置,将其初始化为零
            if module.bias is not None:
                module.bias.data.zero_()
        # 如果 module 是 LayerNorm 层
        elif isinstance(module, nn.LayerNorm):
            # 将偏置初始化为零
            module.bias.data.zero_()
            # 将权重初始化为全一
            module.weight.data.fill_(1.0)

    def extract_one_channel_pixel_values(self, pixel_values: torch.FloatTensor) -> torch.FloatTensor:
        """
        Assuming pixel_values has shape (batch_size, 3, height, width), and that all channels values are the same,
        extract the first channel value to get a tensor of shape (batch_size, 1, height, width) for SuperPoint. This is
        a workaround for the issue discussed in :
        https://github.com/huggingface/transformers/pull/25786#issuecomment-1730176446

        Args:
            pixel_values: torch.FloatTensor of shape (batch_size, 3, height, width)

        Returns:
            pixel_values: torch.FloatTensor of shape (batch_size, 1, height, width)

        """
        # 提取第一个通道的像素值,得到 SuperPoint 所需的单通道输入(这是上面链接中讨论问题的临时解决方案)
        return pixel_values[:, 0, :, :][:, None, :, :]
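
这一步只是一次简单的切片,下面的小例子(非原始源码,仅作演示)验证形状变化:

```python
# 三个通道值相同的输入被压缩为单通道,供 SuperPoint 编码器使用
pixel_values = torch.randn(2, 1, 480, 640).repeat(1, 3, 1, 1)    # (2, 3, 480, 640)
one_channel = pixel_values[:, 0, :, :][:, None, :, :]
print(one_channel.shape)                                          # torch.Size([2, 1, 480, 640])
```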
# SUPERPOINT_START_DOCSTRING:模型类的起始文档字符串,说明该模型是 PyTorch 的 torch.nn.Module 子类,可以作为常规 PyTorch 模块使用,有关一般用法和行为的事项请参考 PyTorch 文档。
SUPERPOINT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`SuperPointConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
    """

# 描述模型输入文档的字符串,包括像素值、是否返回隐藏状态、是否返回字典等参数的详细信息
SUPERPOINT_INPUTS_DOCSTRING = r"""
Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Pixel values. Pixel values can be obtained using [`SuperPointImageProcessor`]. See
        [`SuperPointImageProcessor.__call__`] for details.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more
        detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
    """

# 使用装饰器 `add_start_docstrings` 将超级点模型的输出文档字符串和起始文档字符串添加到类上
@add_start_docstrings(
    "SuperPoint model outputting keypoints and descriptors.",
    SUPERPOINT_START_DOCSTRING,
)
# 定义了一个超级点(SuperPoint)关键点检测模型类,继承自 `SuperPointPreTrainedModel`
class SuperPointForKeypointDetection(SuperPointPreTrainedModel):
    """
    SuperPoint model. It consists of a SuperPointEncoder, a SuperPointInterestPointDecoder and a
    SuperPointDescriptorDecoder. SuperPoint was proposed in `SuperPoint: Self-Supervised Interest Point Detection and
    Description <https://arxiv.org/abs/1712.07629>`__ by Daniel DeTone, Tomasz Malisiewicz, and Andrew Rabinovich. It
    is a fully convolutional neural network that extracts keypoints and descriptors from an image. It is trained in a
    self-supervised manner, using a combination of a photometric loss and a loss based on the homographic adaptation of
    keypoints. It is made of a convolutional encoder and two decoders: one for keypoints and one for descriptors.
    """

    # 初始化方法,接受一个 `SuperPointConfig` 类型的参数配置,并调用父类的初始化方法
    def __init__(self, config: SuperPointConfig) -> None:
        super().__init__(config)

        # 将配置参数保存到实例变量中
        self.config = config

        # 创建超级点编码器、关键点解码器和描述符解码器实例
        self.encoder = SuperPointEncoder(config)
        self.keypoint_decoder = SuperPointInterestPointDecoder(config)
        self.descriptor_decoder = SuperPointDescriptorDecoder(config)

        # 调用初始化后处理方法
        self.post_init()

    # 使用装饰器 `add_start_docstrings_to_model_forward` 将输入文档字符串添加到模型的 `forward` 方法
    @add_start_docstrings_to_model_forward(SUPERPOINT_INPUTS_DOCSTRING)
    # 前向传播方法,接收像素值、标签、是否返回隐藏状态和是否返回字典作为参数
    def forward(
        self,
        pixel_values: torch.FloatTensor = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # ...
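
forward 的其余实现在此片段中被省略。下面给出一个假设性的端到端用法草图(非原始源码;权重名取自上面的 `_CHECKPOINT_FOR_DOC`,输出字段见 `ImagePointDescriptionOutput`),供参考:

```python
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, SuperPointForKeypointDetection

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")

inputs = processor(image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

valid = outputs.mask[0].bool()
print(outputs.keypoints[0][valid].shape)     # (num_keypoints, 2),相对 (x, y) 坐标
print(outputs.descriptors[0][valid].shape)   # (num_keypoints, descriptor_size)
```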

.\models\superpoint\__init__.py

# 版权声明和许可证信息
# 版权所有 2024 年 HuggingFace 团队保留所有权利。
# 
# 根据 Apache 许可证 2.0 版本(“许可证”)获得许可;除非符合许可证的规定,否则您不得使用此文件。
# 您可以在以下网址获取许可证的副本:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# 除非适用法律要求或书面同意,否则依据许可证分发的软件是基于“按原样”提供的,无任何明示或暗示的担保或条件。
# 请参阅许可证获取具体的语言和权限。
from typing import TYPE_CHECKING

# 从 HuggingFace 的 utils 模块导入相关内容
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available

# 定义模块导入结构字典
_import_structure = {
    "configuration_superpoint": [
        "SUPERPOINT_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "SuperPointConfig",
    ]
}

# 检查视觉模块是否可用,如果不可用则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_vision_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用,将 SuperPointImageProcessor 导入到 image_processing_superpoint 模块中
    _import_structure["image_processing_superpoint"] = ["SuperPointImageProcessor"]

# 检查 torch 是否可用,如果不可用则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果可用,将一些 SuperPoint 模型相关的类和常量导入到 modeling_superpoint 模块中
    _import_structure["modeling_superpoint"] = [
        "SUPERPOINT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "SuperPointForKeypointDetection",
        "SuperPointPreTrainedModel",
    ]

# 如果是类型检查模式,导入配置和模型相关内容
if TYPE_CHECKING:
    from .configuration_superpoint import (
        SUPERPOINT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        SuperPointConfig,
    )

    # 如果视觉模块可用,导入图像处理相关的内容
    try:
        if not is_vision_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .image_processing_superpoint import SuperPointImageProcessor

    # 如果 torch 可用,导入模型相关的内容
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_superpoint import (
            SUPERPOINT_PRETRAINED_MODEL_ARCHIVE_LIST,
            SuperPointForKeypointDetection,
            SuperPointPreTrainedModel,
        )

# 如果不是类型检查模式,设置模块为 LazyModule
else:
    import sys

    # 将当前模块设为 LazyModule,支持按需导入
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)

.\models\swiftformer\configuration_swiftformer.py

# coding=utf-8
# 版权所有 2023 MBZUAI 和 The HuggingFace Inc. 团队。保留所有权利。
#
# 根据 Apache 许可证 2.0 版本(“许可证”)许可;
# 除非符合许可证,否则您不得使用此文件。
# 您可以在以下网址获取许可证副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,软件按“原样”分发,
# 没有任何明示或暗示的保证或条件。
# 有关特定语言的权限,请参阅许可证。
""" SwiftFormer 模型配置 """

from collections import OrderedDict  # 导入有序字典类
from typing import Mapping  # 导入映射类型

from packaging import version  # 导入版本包

from ...configuration_utils import PretrainedConfig  # 导入预训练配置类
from ...onnx import OnnxConfig  # 导入ONNX配置类
from ...utils import logging  # 导入日志工具

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器

# SwiftFormer 预训练配置文件映射,指定了模型的预训练配置文件
SWIFTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "MBZUAI/swiftformer-xs": "https://huggingface.co/MBZUAI/swiftformer-xs/resolve/main/config.json",
}


class SwiftFormerConfig(PretrainedConfig):
    r"""
    这是配置类,用于存储 [`SwiftFormerModel`] 的配置。根据指定的参数实例化一个 SwiftFormer 模型,定义模型的体系结构。
    使用默认值实例化配置将产生与 SwiftFormer [MBZUAI/swiftformer-xs](https://huggingface.co/MBZUAI/swiftformer-xs) 架构类似的配置。

    配置对象继承自 [`PretrainedConfig`],可用于控制模型的输出。有关更多信息,请阅读 [`PretrainedConfig`] 的文档。
    """
    # 定义模型类型为 "swiftformer"
    model_type = "swiftformer"
    
    # 初始化 SwiftFormerConfig 类,设定模型的各项配置参数
    def __init__(
        self,
        num_channels=3,  # 输入通道数,默认为 3
        depths=[3, 3, 6, 4],  # 每个阶段的深度列表,默认为 [3, 3, 6, 4]
        embed_dims=[48, 56, 112, 220],  # 每个阶段的嵌入维度列表,默认为 [48, 56, 112, 220]
        mlp_ratio=4,  # MLP 隐藏层维度与输入维度之比,默认为 4
        downsamples=[True, True, True, True],  # 每个阶段是否进行下采样的布尔列表,默认为 [True, True, True, True]
        hidden_act="gelu",  # 非线性激活函数类型,默认为 "gelu"
        down_patch_size=3,  # 下采样层的补丁大小,默认为 3
        down_stride=2,  # 下采样层卷积核的步幅,默认为 2
        down_pad=1,  # 下采样层的填充大小,默认为 1
        drop_path_rate=0.0,  # DropPath 增加 dropout 概率的比率,默认为 0.0
        use_layer_scale=True,  # 是否对来自令牌混合器的输出进行缩放,默认为 True
        layer_scale_init_value=1e-5,  # 令牌混合器输出缩放的初始值,默认为 1e-5
        batch_norm_eps=1e-5,  # 批量归一化层使用的 epsilon 值,默认为 1e-5
        **kwargs,  # 其他未命名参数,用于接收额外的配置参数
    ):
        ):
        # 调用父类的初始化方法,传入所有的关键字参数
        super().__init__(**kwargs)
        # 设置当前对象的通道数
        self.num_channels = num_channels
        # 设置深度信息
        self.depths = depths
        # 设置嵌入维度信息
        self.embed_dims = embed_dims
        # 设置MLP的比率
        self.mlp_ratio = mlp_ratio
        # 设置下采样信息
        self.downsamples = downsamples
        # 设置隐藏层激活函数
        self.hidden_act = hidden_act
        # 设置下采样的补丁大小
        self.down_patch_size = down_patch_size
        # 设置下采样的步长
        self.down_stride = down_stride
        # 设置下采样的填充
        self.down_pad = down_pad
        # 设置丢弃路径率
        self.drop_path_rate = drop_path_rate
        # 是否使用层尺度
        self.use_layer_scale = use_layer_scale
        # 初始化层尺度的值
        self.layer_scale_init_value = layer_scale_init_value
        # 设置批归一化的eps值
        self.batch_norm_eps = batch_norm_eps
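
下面是一个简短的示例(非原始源码,仅作演示),用自定义的深度与嵌入维度实例化该配置(这里借用后文转换脚本中 swiftformer_s 的取值):

```python
from transformers import SwiftFormerConfig

config = SwiftFormerConfig(depths=[3, 3, 9, 6], embed_dims=[48, 64, 168, 224])
print(config.model_type)   # "swiftformer"
print(config.depths)       # [3, 3, 9, 6]
```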
# 定义 SwiftFormerOnnxConfig 类,继承自 OnnxConfig 类
class SwiftFormerOnnxConfig(OnnxConfig):
    # 设定 torch_onnx_minimum_version 属性,要求最低版本为 1.11
    torch_onnx_minimum_version = version.parse("1.11")

    # 定义 inputs 属性,返回一个有序字典,用于描述模型输入的结构
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                # 描述模型输入的具体信息,包括像素值和对应的维度顺序
                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
            ]
        )

    # 定义 atol_for_validation 属性,返回一个浮点数,表示验证过程中的绝对误差容忍度
    @property
    def atol_for_validation(self) -> float:
        return 1e-4

.\models\swiftformer\convert_swiftformer_original_to_hf.py

# coding=utf-8
# 定义脚本的编码格式为 UTF-8

# Copyright 2023 The HuggingFace Inc. team.
# 版权声明,版权归 HuggingFace Inc. 团队所有

# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache 许可证版本 2.0 进行许可

# you may not use this file except in compliance with the License.
# 除非遵守许可证的规定,否则不得使用此文件。

# You may obtain a copy of the License at
# 您可以在以下网址获取许可证的副本

#     http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# 除非适用法律要求或书面同意,否则根据许可证分发的软件是基于“按原样”提供的。

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 没有明示或暗示的任何保证或条件。

# See the License for the specific language governing permissions and
# limitations under the License.
# 请参阅许可证,了解特定语言的权限和限制。

"""Convert SwiftFormer checkpoints from the original implementation."""

import argparse
import json
from pathlib import Path

import requests
import torch
from huggingface_hub import hf_hub_download
from PIL import Image

from transformers import (
    SwiftFormerConfig,
    SwiftFormerForImageClassification,
    ViTImageProcessor,
)
from transformers.utils import logging

logging.set_verbosity_info()
# 设置日志记录的详细程度为信息级别

logger = logging.get_logger(__name__)
# 获取当前模块的日志记录器

device = torch.device("cpu")
# 设置设备为 CPU

# We will verify our results on an image of cute cats
# 我们将在一张可爱猫咪的图片上验证我们的结果
def prepare_img():
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # 定义图像的 URL
    im = Image.open(requests.get(url, stream=True).raw)
    # 从 URL 获取图像,并打开为 PIL 图像对象
    return im

def get_expected_output(swiftformer_name):
    # 根据 SwiftFormer 模型名称返回预期的输出
    if swiftformer_name == "swiftformer_xs":
        return torch.tensor([-2.1703e00, 2.1107e00, -2.0811e00, 8.8685e-01, 2.4360e-01])

    elif swiftformer_name == "swiftformer_s":
        return torch.tensor([3.9636e-01, 2.3478e-01, -1.6963e00, -1.7381e00, -8.6337e-01])

    elif swiftformer_name == "swiftformer_l1":
        return torch.tensor([-4.2768e-01, -4.7429e-01, -1.0897e00, -1.0248e00, 3.5523e-02])

    elif swiftformer_name == "swiftformer_l3":
        return torch.tensor([-2.5330e-01, 2.4211e-01, -6.0185e-01, -8.2789e-01, -6.0446e-02])

def rename_key(dct, old, new):
    # 将字典 dct 中的键 old 重命名为 new
    val = dct.pop(old)
    dct[new] = val

def create_rename_keys(state_dict):
    # 根据模型的 state_dict 创建重命名映射表
    rename_keys = []
    for k in state_dict.keys():
        k_new = k
        if ".pwconv" in k:
            k_new = k_new.replace(".pwconv", ".point_wise_conv")
        if ".dwconv" in k:
            k_new = k_new.replace(".dwconv", ".depth_wise_conv")
        if ".Proj." in k:
            k_new = k_new.replace(".Proj.", ".proj.")
        if "patch_embed" in k_new:
            k_new = k_new.replace("patch_embed", "swiftformer.patch_embed.patch_embedding")
        if "network" in k_new:
            ls = k_new.split(".")
            if ls[2].isdigit():
                k_new = "swiftformer.encoder.network." + ls[1] + ".blocks." + ls[2] + "." + ".".join(ls[3:])
            else:
                k_new = k_new.replace("network", "swiftformer.encoder.network")
        rename_keys.append((k, k_new))
    return rename_keys
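
下面用一个虚构的键(非原始源码,仅作演示;假设在本模块上下文中执行)走一遍上面的重命名逻辑,便于理解各条替换规则的叠加效果:

```python
# 构造一个只含单个键的"假"状态字典,值用 None 占位
fake_state_dict = {"network.0.0.dwconv.weight": None}

for old, new in create_rename_keys(fake_state_dict):
    rename_key(fake_state_dict, old, new)

print(list(fake_state_dict.keys()))
# ['swiftformer.encoder.network.0.blocks.0.depth_wise_conv.weight']
```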

@torch.no_grad()
# 使用 torch.no_grad() 修饰,表明下方的函数不需要进行梯度计算

def convert_swiftformer_checkpoint(swiftformer_name, pytorch_dump_folder_path, original_ckpt):
    """
    根据指定的 SwiftFormer 模型名称,转换原始的检查点文件到 PyTorch 格式。

    Args:
        swiftformer_name (str): SwiftFormer 模型名称
        pytorch_dump_folder_path (str): 转换后的 PyTorch 检查点保存路径
        original_ckpt (str): 原始的 SwiftFormer 检查点文件路径
    """
    Copy/paste/tweak model's weights to our SwiftFormer structure.
    """

    # 定义默认的 SwiftFormer 配置对象
    config = SwiftFormerConfig()

    # 设置模型的类别数为 1000
    config.num_labels = 1000
    # 定义 Hugging Face Hub 中的资源库 ID 和文件名
    repo_id = "huggingface/label-files"
    filename = "imagenet-1k-id2label.json"
    # 从 Hugging Face Hub 下载并加载类别映射文件,转换为整数映射到标签名的字典
    id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label
    # 根据 id2label 字典生成 label 到 id 的反向映射字典
    config.label2id = {v: k for k, v in id2label.items()}

    # 根据不同的 SwiftFormer 模型名配置模型的深度和嵌入维度
    if swiftformer_name == "swiftformer_xs":
        config.depths = [3, 3, 6, 4]
        config.embed_dims = [48, 56, 112, 220]

    elif swiftformer_name == "swiftformer_s":
        config.depths = [3, 3, 9, 6]
        config.embed_dims = [48, 64, 168, 224]

    elif swiftformer_name == "swiftformer_l1":
        config.depths = [4, 3, 10, 5]
        config.embed_dims = [48, 96, 192, 384]

    elif swiftformer_name == "swiftformer_l3":
        config.depths = [4, 4, 12, 6]
        config.embed_dims = [64, 128, 320, 512]

    # 如果提供了原始模型的检查点路径,则加载其状态字典并进行重命名处理
    if original_ckpt:
        if original_ckpt.startswith("https"):
            # 从 URL 加载模型状态字典
            checkpoint = torch.hub.load_state_dict_from_url(original_ckpt, map_location="cpu", check_hash=True)
        else:
            # 从本地文件加载模型状态字典
            checkpoint = torch.load(original_ckpt, map_location="cpu")
    state_dict = checkpoint

    # 根据预定义规则,创建重命名映射关系并对状态字典进行重命名
    rename_keys = create_rename_keys(state_dict)
    for rename_key_src, rename_key_dest in rename_keys:
        rename_key(state_dict, rename_key_src, rename_key_dest)

    # 加载 SwiftFormer 模型并载入处理后的状态字典
    hf_model = SwiftFormerForImageClassification(config).eval()
    hf_model.load_state_dict(state_dict)

    # 准备测试输入图像和预处理器
    image = prepare_img()
    processor = ViTImageProcessor.from_pretrained("preprocessor_config")
    inputs = processor(images=image, return_tensors="pt")

    # 获取预期输出结果,用于与 HuggingFace 模型输出进行比较
    timm_logits = get_expected_output(swiftformer_name)
    hf_logits = hf_model(inputs["pixel_values"]).logits

    # 断言检查 HuggingFace 模型输出的形状和预期的一致性
    assert hf_logits.shape == torch.Size([1, 1000])
    assert torch.allclose(hf_logits[0, 0:5], timm_logits, atol=1e-3)

    # 确保 PyTorch 导出文件夹存在,保存 SwiftFormer 模型
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    print(f"Saving model {swiftformer_name} to {pytorch_dump_folder_path}")
    hf_model.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
    # 如果当前脚本作为主程序运行

    parser = argparse.ArgumentParser()
    # 创建一个参数解析器对象

    # Required parameters
    parser.add_argument(
        "--swiftformer_name",
        default="swiftformer_xs",
        choices=["swiftformer_xs", "swiftformer_s", "swiftformer_l1", "swiftformer_l3"],
        type=str,
        help="Name of the SwiftFormer model you'd like to convert.",
    )
    # 添加一个必需的参数:SwiftFormer 模型的名称,可以选择默认为 "swiftformer_xs"
    # 允许的取值为预定义的几种模型名称
    # 参数类型为字符串,帮助信息描述了希望转换的 SwiftFormer 模型的名称

    parser.add_argument(
        "--pytorch_dump_folder_path",
        default="./converted_outputs/",
        type=str,
        help="Path to the output PyTorch model directory.",
    )
    # 添加一个参数:输出 PyTorch 模型的目录路径,默认为当前目录下的 "converted_outputs/"
    # 参数类型为字符串,描述了输出 PyTorch 模型的保存路径

    parser.add_argument("--original_ckpt", default=None, type=str, help="Path to the original model checkpoint.")
    # 添加一个参数:原始模型检查点的路径,默认为 None
    # 参数类型为字符串,描述了原始模型检查点文件的路径

    args = parser.parse_args()
    # 解析命令行参数并存储到 args 对象中

    convert_swiftformer_checkpoint(args.swiftformer_name, args.pytorch_dump_folder_path, args.original_ckpt)
    # 调用函数 convert_swiftformer_checkpoint,传递解析后的参数:
    #   - SwiftFormer 模型名称
    #   - 输出的 PyTorch 模型目录路径
    #   - 原始模型检查点路径
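
作为补充,下面是一个假设性的直接调用草图(非原始源码;original_ckpt 为占位路径,函数内部会下载标签文件并与预期 logits 做断言,因此需要真实检查点才能完整跑通):

```python
convert_swiftformer_checkpoint(
    swiftformer_name="swiftformer_xs",
    pytorch_dump_folder_path="./converted_outputs/",
    original_ckpt="/path/to/original_swiftformer_xs.pth",   # 占位路径,需替换为真实检查点
)
```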

.\models\swiftformer\modeling_swiftformer.py

# 设置编码格式为 UTF-8

# 版权声明和许可协议信息
# Copyright 2023 MBZUAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
PyTorch SwiftFormer model.
"""

import collections.abc
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

# 导入自定义模块和类
from ...activations import ACT2CLS
from ...modeling_outputs import (
    BaseModelOutputWithNoAttention,
    ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
)
from .configuration_swiftformer import SwiftFormerConfig

# 获取日志记录器
logger = logging.get_logger(__name__)

# 用于文档的配置参数说明
_CONFIG_FOR_DOC = "SwiftFormerConfig"

# 用于文档的检查点说明
_CHECKPOINT_FOR_DOC = "MBZUAI/swiftformer-xs"

# 预期输出形状的说明
_EXPECTED_OUTPUT_SHAPE = [1, 220, 7, 7]

# 图像分类任务的检查点说明
_IMAGE_CLASS_CHECKPOINT = "MBZUAI/swiftformer-xs"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"

# 预训练模型的存档列表
SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "MBZUAI/swiftformer-xs",
    # 查看所有 SwiftFormer 模型,请访问 https://huggingface.co/models?filter=swiftformer
]

class SwiftFormerPatchEmbedding(nn.Module):
    """
    Patch Embedding Layer constructed of two 2D convolutional layers.

    输入: 形状为 `[batch_size, in_channels, height, width]` 的张量

    输出: 形状为 `[batch_size, out_channels, height/4, width/4]` 的张量
    """

    def __init__(self, config: SwiftFormerConfig):
        super().__init__()

        in_chs = config.num_channels
        out_chs = config.embed_dims[0]
        
        # 定义补丁嵌入层,包括两个二维卷积层
        self.patch_embedding = nn.Sequential(
            nn.Conv2d(in_chs, out_chs // 2, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(out_chs // 2, eps=config.batch_norm_eps),
            nn.ReLU(),
            nn.Conv2d(out_chs // 2, out_chs, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(out_chs, eps=config.batch_norm_eps),
            nn.ReLU(),
        )

    def forward(self, x):
        # 执行补丁嵌入层的前向传播
        return self.patch_embedding(x)

# 从 transformers.models.beit.modeling_beit.drop_path 复制过来的函数
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    按样本(在残差块的主路径中应用)随机删除路径(随机深度)。

    Args:
        input (torch.Tensor): 输入张量
        drop_prob (float, optional): 删除概率,默认为 0.0
        training (bool, optional): 是否在训练模式中,默认为 False

    Returns:
        torch.Tensor: 处理后的张量
    """
    # 如果 drop_prob 为 0.0 或者不处于训练模式,则直接返回输入 input,不进行 Dropout
    if drop_prob == 0.0 or not training:
        return input
    
    # 计算保留节点的概率
    keep_prob = 1 - drop_prob
    
    # 根据输入张量的形状,创建一个随机张量,用于决定每个节点是否保留
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # 适用于不同维度的张量,而不仅仅是二维卷积网络
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # 将随机张量二值化(取整)
    
    # 将输入张量除以 keep_prob,然后乘以随机张量,以实现 Dropout 操作
    output = input.div(keep_prob) * random_tensor
    
    # 返回经过 Dropout 后的输出张量
    return output
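
为直观理解 stochastic depth,下面的小例子(非原始源码,仅作演示;假设在本模块上下文中执行)展示 `drop_path` 在训练模式下按样本随机置零、并通过除以 keep_prob 保持期望不变:

```python
torch.manual_seed(0)
x = torch.ones(8, 4)                                # 8 个样本
y = drop_path(x, drop_prob=0.5, training=True)

print(y[:, 0])                                      # 每个样本要么整体为 0,要么被放大为 2.0
print(drop_path(x, drop_prob=0.5, training=False))  # 非训练模式下原样返回
```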
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Swiftformer
class SwiftFormerDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 调用 drop_path 函数,用于在训练时按照给定的概率丢弃部分隐藏状态
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        # 返回描述当前实例的字符串,包括 drop_prob 的概率值
        return "p={}".format(self.drop_prob)


class SwiftFormerEmbeddings(nn.Module):
    """
    Embeddings layer consisting of a single 2D convolutional and batch normalization layer.

    Input: tensor of shape `[batch_size, channels, height, width]`

    Output: tensor of shape `[batch_size, channels, height/stride, width/stride]`
    """

    def __init__(self, config: SwiftFormerConfig, index: int):
        super().__init__()

        # 从配置中获取所需的参数
        patch_size = config.down_patch_size
        stride = config.down_stride
        padding = config.down_pad
        embed_dims = config.embed_dims

        # 获取输入和输出通道数
        in_chans = embed_dims[index]
        embed_dim = embed_dims[index + 1]

        # 确保 patch_size、stride 和 padding 是可迭代对象
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        stride = stride if isinstance(stride, collections.abc.Iterable) else (stride, stride)
        padding = padding if isinstance(padding, collections.abc.Iterable) else (padding, padding)

        # 定义卷积和批量归一化层
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride, padding=padding)
        self.norm = nn.BatchNorm2d(embed_dim, eps=config.batch_norm_eps)

    def forward(self, x):
        # 前向传播过程,依次通过卷积和批量归一化层处理输入张量 x
        x = self.proj(x)
        x = self.norm(x)
        return x


class SwiftFormerConvEncoder(nn.Module):
    """
    `SwiftFormerConvEncoder` with 3*3 and 1*1 convolutions.

    Input: tensor of shape `[batch_size, channels, height, width]`

    Output: tensor of shape `[batch_size, channels, height, width]`
    """

    def __init__(self, config: SwiftFormerConfig, dim: int):
        super().__init__()
        hidden_dim = int(config.mlp_ratio * dim)

        # 定义深度可分离卷积、批量归一化、点卷积层和激活函数
        self.depth_wise_conv = nn.Conv2d(dim, dim, kernel_size=3, padding=1, groups=dim)
        self.norm = nn.BatchNorm2d(dim, eps=config.batch_norm_eps)
        self.point_wise_conv1 = nn.Conv2d(dim, hidden_dim, kernel_size=1)
        self.act = nn.GELU()
        self.point_wise_conv2 = nn.Conv2d(hidden_dim, dim, kernel_size=1)
        self.drop_path = nn.Identity()  # 默认情况下不应用 drop path
        self.layer_scale = nn.Parameter(torch.ones(dim).unsqueeze(-1).unsqueeze(-1), requires_grad=True)

    def forward(self, x):
        input = x
        # 执行深度可分离卷积、批量归一化、点卷积和激活函数
        x = self.depth_wise_conv(x)
        x = self.norm(x)
        x = self.point_wise_conv1(x)
        x = self.act(x)
        x = self.point_wise_conv2(x)
        # 应用 drop path 和缩放因子到输入
        x = input + self.drop_path(self.layer_scale * x)
        return x


class SwiftFormerMlp(nn.Module):
    """
    """
    MLP layer with 1*1 convolutions.

    Input: tensor of shape `[batch_size, channels, height, width]`

    Output: tensor of shape `[batch_size, channels, height, width]`
    """

    # 初始化函数,接受配置对象和输入特征数
    def __init__(self, config: SwiftFormerConfig, in_features: int):
        super().__init__()  # 调用父类的构造函数
        hidden_features = int(in_features * config.mlp_ratio)  # 计算隐藏层特征数
        self.norm1 = nn.BatchNorm2d(in_features, eps=config.batch_norm_eps)  # 批量归一化层
        self.fc1 = nn.Conv2d(in_features, hidden_features, 1)  # 第一个卷积层,1x1卷积
        act_layer = ACT2CLS[config.hidden_act]  # 获取激活函数类
        self.act = act_layer()  # 实例化激活函数对象
        self.fc2 = nn.Conv2d(hidden_features, in_features, 1)  # 第二个卷积层,1x1卷积
        self.drop = nn.Dropout(p=0.0)  # Dropout层,概率为0.0,即不进行dropout操作

    # 前向传播函数,接受输入张量x
    def forward(self, x):
        x = self.norm1(x)  # 应用批量归一化
        x = self.fc1(x)  # 第一个卷积层的计算
        x = self.act(x)  # 应用激活函数
        x = self.drop(x)  # 应用dropout
        x = self.fc2(x)  # 第二个卷积层的计算
        x = self.drop(x)  # 再次应用dropout
        return x  # 返回计算结果张量
class SwiftFormerEncoderBlock(nn.Module):
    """
    SwiftFormer Encoder Block for SwiftFormer. It consists of (1) Local representation module, (2)
    SwiftFormerEfficientAdditiveAttention, and (3) MLP block.

    Input: tensor of shape `[batch_size, channels, height, width]`

    Output: tensor of shape `[batch_size, channels,height, width]`
    """

    def __init__(self, config: SwiftFormerConfig, dim: int, drop_path: float = 0.0) -> None:
        super().__init__()

        # 从配置对象中获取层缩放初始化值和是否使用层缩放的标志
        layer_scale_init_value = config.layer_scale_init_value
        use_layer_scale = config.use_layer_scale

        # 创建本地表示层、注意力层、线性层和DropPath层
        self.local_representation = SwiftFormerLocalRepresentation(config, dim=dim)
        self.attn = SwiftFormerEfficientAdditiveAttention(config, dim=dim)
        self.linear = SwiftFormerMlp(config, in_features=dim)
        self.drop_path = SwiftFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.use_layer_scale = use_layer_scale

        # 如果使用层缩放,则创建两个层缩放参数
        if use_layer_scale:
            self.layer_scale_1 = nn.Parameter(
                layer_scale_init_value * torch.ones(dim).unsqueeze(-1).unsqueeze(-1), requires_grad=True
            )
            self.layer_scale_2 = nn.Parameter(
                layer_scale_init_value * torch.ones(dim).unsqueeze(-1).unsqueeze(-1), requires_grad=True
            )

    def forward(self, x):
        # 将输入x传递给本地表示层处理
        x = self.local_representation(x)
        batch_size, channels, height, width = x.shape
        
        # 如果使用层缩放,则将层缩放因子应用于注意力层和线性层的输出
        if self.use_layer_scale:
            # 计算注意力层的输出,并应用层缩放因子和DropPath层
            x = x + self.drop_path(
                self.layer_scale_1
                * self.attn(x.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels))
                .reshape(batch_size, height, width, channels)
                .permute(0, 3, 1, 2)
            )
            # 计算线性层的输出,并应用层缩放因子和DropPath层
            x = x + self.drop_path(self.layer_scale_2 * self.linear(x))

        else:
            # 如果不使用层缩放,则直接应用注意力层和线性层的输出与DropPath层
            x = x + self.drop_path(
                self.attn(x.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels))
                .reshape(batch_size, height, width, channels)
                .permute(0, 3, 1, 2)
            )
            x = x + self.drop_path(self.linear(x))
        
        # 返回处理后的输出张量x
        return x
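
forward 中对注意力输入做了 `[B, C, H, W] -> [B, H*W, C]` 的展平,输出后再还原。下面的小草图(非原始源码,仅作演示)单独验证这个往返变换是无损的:

```python
batch_size, channels, height, width = 2, 8, 7, 7
x = torch.randn(batch_size, channels, height, width)

tokens = x.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels)
restored = tokens.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)

print(tokens.shape)               # torch.Size([2, 49, 8])
print(torch.equal(x, restored))   # True,往返变换不改变数值
```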
class SwiftFormerStage(nn.Module):
    """
    A Swiftformer stage consisting of a series of `SwiftFormerConvEncoder` blocks and a final
    `SwiftFormerEncoderBlock`.

    Input: tensor in shape `[batch_size, channels, height, width]`

    Output: tensor in shape `[batch_size, channels, height, width]`
    """

    def __init__(self, config: SwiftFormerConfig, index: int) -> None:
        super().__init__()

        layer_depths = config.depths
        dim = config.embed_dims[index]
        depth = layer_depths[index]

        blocks = []
        for block_idx in range(depth):
            # 计算当前 block 的 drop path rate
            block_dpr = config.drop_path_rate * (block_idx + sum(layer_depths[:index])) / (sum(layer_depths) - 1)

            if depth - block_idx <= 1:
                # 如果是最后一个 block,则添加 SwiftFormerEncoderBlock
                blocks.append(SwiftFormerEncoderBlock(config, dim=dim, drop_path=block_dpr))
            else:
                # 否则添加 SwiftFormerConvEncoder
                blocks.append(SwiftFormerConvEncoder(config, dim=dim))

        self.blocks = nn.ModuleList(blocks)

    def forward(self, input):
        # 依次通过所有的 block 进行前向传播
        for block in self.blocks:
            input = block(input)
        return input


class SwiftFormerEncoder(nn.Module):
    def __init__(self, config: SwiftFormerConfig) -> None:
        super().__init__()
        self.config = config

        embed_dims = config.embed_dims
        downsamples = config.downsamples
        layer_depths = config.depths

        # Transformer model
        network = []
        for i in range(len(layer_depths)):
            # 创建 SwiftFormerStage,并将其添加到网络中
            stage = SwiftFormerStage(config=config, index=i)
            network.append(stage)
            if i >= len(layer_depths) - 1:
                break
            if downsamples[i] or embed_dims[i] != embed_dims[i + 1]:
                # 如果需要下采样或者维度变化,则添加 SwiftFormerEmbeddings
                network.append(SwiftFormerEmbeddings(config, index=i))
        self.network = nn.ModuleList(network)

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithNoAttention]:
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        all_hidden_states = (hidden_states,) if output_hidden_states else None

        for block in self.network:
            # 依次通过所有的 block 进行前向传播
            hidden_states = block(hidden_states)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)

        # 返回 BaseModelOutputWithNoAttention 对象
        return BaseModelOutputWithNoAttention(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
        )


class SwiftFormerPreTrainedModel(PreTrainedModel):
    """
    This class is not completed in the provided snippet.
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 使用 SwiftFormerConfig 类作为配置类
    config_class = SwiftFormerConfig
    # 模型的基础名称前缀为 "swiftformer"
    base_model_prefix = "swiftformer"
    # 主输入的名称为 "pixel_values"
    main_input_name = "pixel_values"
    # 支持梯度检查点
    supports_gradient_checkpointing = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        # 如果模块是线性层或者二维卷积层,使用截断正态分布初始化权重
        if isinstance(module, (nn.Conv2d, nn.Linear)):
            nn.init.trunc_normal_(module.weight, std=0.02)
            # 如果存在偏置项,则初始化为常数 0
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
        # 如果模块是层归一化层,则初始化偏置项为常数 0,权重为常数 1.0
        elif isinstance(module, (nn.LayerNorm)):
            nn.init.constant_(module.bias, 0)
            nn.init.constant_(module.weight, 1.0)
# SWIFTFORMER_START_DOCSTRING 变量,包含了 SwiftFormerModel 类的文档字符串,描述了这个模型是一个 PyTorch 的 nn.Module 子类,
# 可以像普通的 PyTorch 模块一样使用,详细的使用和行为相关信息可以查阅 PyTorch 文档。

# SWIFTFORMER_INPUTS_DOCSTRING 变量,包含了 SwiftFormerModel 类的输入文档字符串,描述了模型的输入参数和返回值。
# pixel_values 参数是一个 torch.FloatTensor,表示像素值,形状为 (batch_size, num_channels, height, width)。
# output_hidden_states 参数是一个可选的布尔值,指定是否返回所有层的隐藏状态。
# return_dict 参数是一个可选的布尔值,指定是否返回一个 ModelOutput 对象而不是简单的元组。

# 使用 add_start_docstrings 装饰器为 SwiftFormerModel 类添加了一个开头的文档字符串,描述了它是一个输出原始隐藏状态的
# SwiftFormer 模型变压器,没有特定的顶部头。

class SwiftFormerModel(SwiftFormerPreTrainedModel):
    def __init__(self, config: SwiftFormerConfig):
        # 调用 SwiftFormerPreTrainedModel 的初始化方法,并传入配置对象 config
        super().__init__(config)
        # 将传入的配置对象保存到 self.config 中
        self.config = config

        # 创建 SwiftFormerPatchEmbedding 对象并保存到 self.patch_embed
        self.patch_embed = SwiftFormerPatchEmbedding(config)
        
        # 创建 SwiftFormerEncoder 对象并保存到 self.encoder
        self.encoder = SwiftFormerEncoder(config)

        # 初始化权重并应用最终处理
        self.post_init()

    # 使用 add_start_docstrings_to_model_forward 装饰器为 forward 方法添加了开头的文档字符串,
    # 描述了方法的输入参数和输出预期,模型用于 vision 模态。

    # forward 方法,接收 pixel_values, output_hidden_states 和 return_dict 作为输入参数
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithNoAttention]:
        r""" """
        # 设置函数签名,指定返回类型为元组或BaseModelOutputWithNoAttention类的实例

        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 如果output_hidden_states不为None,则使用其自身值;否则使用self.config.output_hidden_states的值

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 如果return_dict不为None,则使用其自身值;否则使用self.config.use_return_dict的值

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")
        # 如果pixel_values为None,则抛出值错误异常,要求指定pixel_values

        embedding_output = self.patch_embed(pixel_values)
        # 使用self.patch_embed方法对pixel_values进行嵌入编码

        encoder_outputs = self.encoder(
            embedding_output,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # 调用self.encoder方法对嵌入输出进行编码,传入参数为embedding_output、output_hidden_states和return_dict

        if not return_dict:
            return tuple(v for v in encoder_outputs if v is not None)
        # 如果return_dict为False,则返回encoder_outputs中所有非None值的元组

        return BaseModelOutputWithNoAttention(
            last_hidden_state=encoder_outputs.last_hidden_state,
            hidden_states=encoder_outputs.hidden_states,
        )
        # 使用BaseModelOutputWithNoAttention类创建一个实例,传入encoder_outputs的最后隐藏状态和隐藏状态列表作为参数
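
下面是一个假设性的用法草图(非原始源码;权重名取自上面的 `_CHECKPOINT_FOR_DOC`),检查骨干模型输出的特征图形状,与 `_EXPECTED_OUTPUT_SHAPE` 对应:

```python
import torch
from transformers import SwiftFormerModel

model = SwiftFormerModel.from_pretrained("MBZUAI/swiftformer-xs")

pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    outputs = model(pixel_values)

print(outputs.last_hidden_state.shape)   # 预期 torch.Size([1, 220, 7, 7])
```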
# 使用自定义的文档字符串装饰器为类添加描述信息,指定其是基于SwiftFormer模型的图像分类器
@add_start_docstrings(
    """
    SwiftFormer Model transformer with an image classification head on top (e.g. for ImageNet).
    """,
    SWIFTFORMER_START_DOCSTRING,
)
# 声明SwiftFormerForImageClassification类,继承自SwiftFormerPreTrainedModel
class SwiftFormerForImageClassification(SwiftFormerPreTrainedModel):
    
    # 初始化方法,接受一个SwiftFormerConfig类型的参数config,并调用其父类的初始化方法
    def __init__(self, config: SwiftFormerConfig) -> None:
        super().__init__(config)

        # 从config中获取嵌入维度
        embed_dims = config.embed_dims

        # 设置类别数量为config中指定的类别数
        self.num_labels = config.num_labels
        # 创建SwiftFormerModel模型实例,并赋值给self.swiftformer
        self.swiftformer = SwiftFormerModel(config)

        # 分类器头部
        # 根据最后一个嵌入维度设置批量归一化层
        self.norm = nn.BatchNorm2d(embed_dims[-1], eps=config.batch_norm_eps)
        # 如果有类别数量大于0,则创建线性层作为分类头部,否则创建一个恒等映射(nn.Identity())
        self.head = nn.Linear(embed_dims[-1], self.num_labels) if self.num_labels > 0 else nn.Identity()
        # 同上,创建一个用于距离度量的线性层或者恒等映射
        self.dist_head = nn.Linear(embed_dims[-1], self.num_labels) if self.num_labels > 0 else nn.Identity()

        # 初始化权重并应用最终处理
        self.post_init()

    # 使用自定义的文档字符串装饰器为forward方法添加描述信息,指定其输入和输出类型
    @add_start_docstrings_to_model_forward(SWIFTFORMER_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutputWithNoAttention,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    # 定义forward方法,接受像素值pixel_values、标签labels以及其他可选参数,并返回一个字典或者张量,具体依赖于return_dict参数
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # 如果 return_dict 不为 None,则使用传入的 return_dict;否则使用 self.config.use_return_dict
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 运行基础模型,将输入的像素值传递给 Swiftformer 模型
        outputs = self.swiftformer(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 如果 return_dict 为 True,则直接取 outputs.last_hidden_state;否则取 outputs 的第一个元素作为序列输出
        sequence_output = outputs.last_hidden_state if return_dict else outputs[0]

        # 将序列输出应用归一化操作
        sequence_output = self.norm(sequence_output)

        # 将归一化后的特征从第 2 维起展平,再对展平后的空间维度取平均(相当于全局平均池化)
        sequence_output = sequence_output.flatten(2).mean(-1)

        # 将平均值后的序列输出传递给分类头部模型和蒸馏头部模型
        cls_out = self.head(sequence_output)
        distillation_out = self.dist_head(sequence_output)

        # 计算 logits,即分类头部模型输出和蒸馏头部模型输出的平均值
        logits = (cls_out + distillation_out) / 2

        # 计算损失值
        loss = None
        if labels is not None:
            # 根据配置文件中的问题类型确定损失函数的类型
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            # 根据问题类型选择相应的损失函数
            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        # 如果 return_dict 为 False,则返回 logits 和其他输出;否则返回 ImageClassifierOutputWithNoAttention 对象
        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutputWithNoAttention(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )
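
# 推理示例(非原始源码,仅作演示):用上面的 SwiftFormerForImageClassification 对一张图片做分类。
# 其中检查点名称 "MBZUAI/swiftformer-xs" 为假设示例,实际可替换为任何可用的 SwiftFormer 检查点。
from PIL import Image
import requests
import torch
from transformers import AutoImageProcessor, SwiftFormerForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = AutoImageProcessor.from_pretrained("MBZUAI/swiftformer-xs")        # 假设的检查点名称
model = SwiftFormerForImageClassification.from_pretrained("MBZUAI/swiftformer-xs")
model.eval()

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits              # 形状为 (batch_size, num_labels)
predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])    # 若检查点附带 id2label 映射,可打印类别名称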

.\models\swiftformer\__init__.py

# 引入必要的模块和依赖项来支持 SwiftFormer 模型的配置和建模
from typing import TYPE_CHECKING

# 从工具包中引入异常类和模块懒加载工具
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
)

# 定义模块的导入结构,包括配置和建模相关内容
_import_structure = {
    "configuration_swiftformer": [
        "SWIFTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "SwiftFormerConfig",
        "SwiftFormerOnnxConfig",
    ]
}

# 检查是否存在 torch 库,如果不存在则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果 torch 可用,则添加建模相关内容到导入结构中
    _import_structure["modeling_swiftformer"] = [
        "SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "SwiftFormerForImageClassification",
        "SwiftFormerModel",
        "SwiftFormerPreTrainedModel",
    ]

# 如果 TYPE_CHECKING 为真,则从 configuration_swiftformer 模块中导入特定的配置类和映射
if TYPE_CHECKING:
    from .configuration_swiftformer import (
        SWIFTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
        SwiftFormerConfig,
        SwiftFormerOnnxConfig,
    )

    # 再次检查 torch 是否可用,若不可用则抛出异常
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果可用,则从 modeling_swiftformer 模块导入建模相关类
        from .modeling_swiftformer import (
            SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            SwiftFormerForImageClassification,
            SwiftFormerModel,
            SwiftFormerPreTrainedModel,
        )

# 如果 TYPE_CHECKING 为假,则导入 sys 模块并将当前模块设为 _LazyModule 的懒加载实例
else:
    import sys

    # 将当前模块定义为 _LazyModule 的实例,以支持懒加载模块的导入
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

.\models\swin\configuration_swin.py

# coding=utf-8
# 版权 2022 年 HuggingFace Inc. 团队保留所有权利。
#
# 根据 Apache 许可证版本 2.0 许可,您可以不使用此文件,除非遵守许可。
# 您可以在以下地址获取许可的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则本软件按"原样"分发,
# 没有任何明示或暗示的保证或条件。
# 请查阅许可证获取更多信息。
""" Swin Transformer 模型配置"""

from collections import OrderedDict  # 导入有序字典类
from typing import Mapping  # 导入映射类型

from packaging import version  # 导入版本控制库

from ...configuration_utils import PretrainedConfig  # 导入预训练配置类
from ...onnx import OnnxConfig  # 导入ONNX配置类
from ...utils import logging  # 导入日志工具
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices  # 导入背骨结构工具函数


logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器

SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/swin-tiny-patch4-window7-224": (
        "https://huggingface.co/microsoft/swin-tiny-patch4-window7-224/resolve/main/config.json"
    ),
    # 查看所有 Swin 模型,请访问 https://huggingface.co/models?filter=swin
}


class SwinConfig(BackboneConfigMixin, PretrainedConfig):
    r"""
    这是一个配置类,用于存储 [`SwinModel`] 的配置。它用于根据指定的参数实例化 Swin 模型,定义模型的架构。
    使用默认值实例化配置将产生类似于 Swin [microsoft/swin-tiny-patch4-window7-224](https://huggingface.co/microsoft/swin-tiny-patch4-window7-224)
    架构的配置。

    配置对象继承自 [`PretrainedConfig`],可用于控制模型输出。有关更多信息,请阅读 [`PretrainedConfig`] 的文档。

    示例:

    ```
    >>> from transformers import SwinConfig, SwinModel
    >>> # 初始化一个 Swin microsoft/swin-tiny-patch4-window7-224 风格的配置
    >>> configuration = SwinConfig()
    >>> # 从 microsoft/swin-tiny-patch4-window7-224 风格的配置初始化一个模型(带有随机权重)
    >>> model = SwinModel(configuration)
    >>> # 访问模型配置
    >>> configuration = model.config
    ```
    """

    model_type = "swin"

    attribute_map = {
        "num_attention_heads": "num_heads",  # 属性映射,注意力头数映射到 num_heads
        "num_hidden_layers": "num_layers",   # 属性映射,隐藏层数映射到 num_layers
    }
    # 定义一个初始化函数,用于初始化Swin Transformer模型的各种参数
    def __init__(
        self,
        image_size=224,  # 图像尺寸,默认为224
        patch_size=4,  # 每个patch的大小,默认为4
        num_channels=3,  # 输入图像的通道数,默认为3(RGB图像)
        embed_dim=96,  # 嵌入维度,默认为96
        depths=[2, 2, 6, 2],  # 各个阶段的深度列表,默认为[2, 2, 6, 2]
        num_heads=[3, 6, 12, 24],  # 各个阶段的注意力头数列表,默认为[3, 6, 12, 24]
        window_size=7,  # 窗口大小,默认为7
        mlp_ratio=4.0,  # MLP扩展比例,默认为4.0
        qkv_bias=True,  # 是否使用QKV偏置,默认为True
        hidden_dropout_prob=0.0,  # 隐藏层Dropout概率,默认为0.0
        attention_probs_dropout_prob=0.0,  # 注意力概率Dropout概率,默认为0.0
        drop_path_rate=0.1,  # DropPath概率,默认为0.1
        hidden_act="gelu",  # 隐藏层激活函数,默认为GELU
        use_absolute_embeddings=False,  # 是否使用绝对位置嵌入,默认为False
        initializer_range=0.02,  # 初始化范围,默认为0.02
        layer_norm_eps=1e-5,  # Layer Norm的epsilon值,默认为1e-5
        encoder_stride=32,  # 编码器的步长,默认为32
        out_features=None,  # 输出特征列表,用于对齐特征,默认为None
        out_indices=None,  # 输出索引列表,用于对齐索引,默认为None
        **kwargs,  # 其他关键字参数
    ):
        super().__init__(**kwargs)  # 调用父类的初始化函数
    
        self.image_size = image_size  # 初始化图像尺寸属性
        self.patch_size = patch_size  # 初始化patch大小属性
        self.num_channels = num_channels  # 初始化通道数属性
        self.embed_dim = embed_dim  # 初始化嵌入维度属性
        self.depths = depths  # 初始化深度列表属性
        self.num_layers = len(depths)  # 计算层数并初始化属性
        self.num_heads = num_heads  # 初始化注意力头数列表属性
        self.window_size = window_size  # 初始化窗口大小属性
        self.mlp_ratio = mlp_ratio  # 初始化MLP扩展比例属性
        self.qkv_bias = qkv_bias  # 初始化QKV偏置属性
        self.hidden_dropout_prob = hidden_dropout_prob  # 初始化隐藏层Dropout概率属性
        self.attention_probs_dropout_prob = attention_probs_dropout_prob  # 初始化注意力概率Dropout概率属性
        self.drop_path_rate = drop_path_rate  # 初始化DropPath概率属性
        self.hidden_act = hidden_act  # 初始化隐藏层激活函数属性
        self.use_absolute_embeddings = use_absolute_embeddings  # 初始化使用绝对位置嵌入属性
        self.layer_norm_eps = layer_norm_eps  # 初始化Layer Norm的epsilon值属性
        self.initializer_range = initializer_range  # 初始化初始化范围属性
        self.encoder_stride = encoder_stride  # 初始化编码器步长属性
    
        # 设置隐藏大小属性,以使Swin与VisionEncoderDecoderModel配合工作
        # 这指示模型最后一个阶段之后的通道维度
        self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
    
        # 设置阶段名称列表,包括stem和各个阶段(例如stage1、stage2等)
        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
    
        # 获取对齐的输出特征和输出索引,用于与给定的阶段名称对齐
        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
        )
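
# 简单验证示例(非原始源码,仅作演示):hidden_size 按 embed_dim * 2 ** (阶段数 - 1) 推导,
# 默认配置下为 96 * 2 ** 3 = 768;stage_names 为 stem 加上各个 stage。
_demo_config = SwinConfig()                                                  # 默认 embed_dim=96, depths=[2, 2, 6, 2]
assert _demo_config.hidden_size == 96 * 2 ** (len(_demo_config.depths) - 1)  # 即 768
print(_demo_config.stage_names)                                              # ['stem', 'stage1', 'stage2', 'stage3', 'stage4']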
# 定义 SwinOnnxConfig 类,继承自 OnnxConfig 类
class SwinOnnxConfig(OnnxConfig):
    # 设定 torch_onnx_minimum_version 属性为版本号 1.11
    torch_onnx_minimum_version = version.parse("1.11")

    # inputs 属性,返回一个有序字典,描述输入数据的结构
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
            ]
        )

    # atol_for_validation 属性,返回用于验证的绝对容差值
    @property
    def atol_for_validation(self) -> float:
        return 1e-4

.\models\swin\convert_swin_simmim_to_pytorch.py

# 编码声明,指定使用 UTF-8 编码格式
# Copyright 2022 The HuggingFace Inc. team.
# 版权声明,版权归 The HuggingFace Inc. 团队所有
#
# 根据 Apache 许可证版本 2.0 许可使用此文件;除非符合许可证的条款,否则不得使用此文件
# 您可以在以下网址获得许可证的副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则按“原样”分发软件
# 无任何形式的明示或暗示担保或条件
# 请参阅许可证以了解特定的语言权限和限制

"""从原始存储库中转换 Swin SimMIM 检查点。

URL: https://github.com/microsoft/Swin-Transformer/blob/main/MODELHUB.md#simmim-pretrained-swin-v1-models"""

# 导入必要的库和模块
import argparse  # 参数解析模块

import requests  # HTTP 请求库
import torch  # PyTorch 深度学习框架
from PIL import Image  # Python 图像处理库

from transformers import SwinConfig, SwinForMaskedImageModeling, ViTImageProcessor  # 导入 Transformers 库中的类


def get_swin_config(model_name):
    # 根据模型名称获取 Swin 模型配置
    config = SwinConfig(image_size=192)

    if "base" in model_name:
        # 如果模型名称包含“base”,设置特定参数
        window_size = 6
        embed_dim = 128
        depths = (2, 2, 18, 2)
        num_heads = (4, 8, 16, 32)
    elif "large" in model_name:
        # 如果模型名称包含“large”,设置特定参数
        window_size = 12
        embed_dim = 192
        depths = (2, 2, 18, 2)
        num_heads = (6, 12, 24, 48)
    else:
        # 抛出错误,仅支持“base”和“large”变体的模型
        raise ValueError("Model not supported, only supports base and large variants")

    # 设置配置对象的参数
    config.window_size = window_size
    config.embed_dim = embed_dim
    config.depths = depths
    config.num_heads = num_heads

    return config


def rename_key(name):
    # 重命名模型的键名称,以便适应 Swin 模型的结构
    if "encoder.mask_token" in name:
        name = name.replace("encoder.mask_token", "embeddings.mask_token")
    if "encoder.patch_embed.proj" in name:
        name = name.replace("encoder.patch_embed.proj", "embeddings.patch_embeddings.projection")
    if "encoder.patch_embed.norm" in name:
        name = name.replace("encoder.patch_embed.norm", "embeddings.norm")
    if "attn.proj" in name:
        name = name.replace("attn.proj", "attention.output.dense")
    if "attn" in name:
        name = name.replace("attn", "attention.self")
    if "norm1" in name:
        name = name.replace("norm1", "layernorm_before")
    if "norm2" in name:
        name = name.replace("norm2", "layernorm_after")
    if "mlp.fc1" in name:
        name = name.replace("mlp.fc1", "intermediate.dense")
    if "mlp.fc2" in name:
        name = name.replace("mlp.fc2", "output.dense")

    if name == "encoder.norm.weight":
        name = "layernorm.weight"
    if name == "encoder.norm.bias":
        name = "layernorm.bias"

    # 如果不包含“decoder”,则添加前缀“swin.”
    if "decoder" in name:
        pass
    else:
        name = "swin." + name

    return name
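
# 小例子(非原始源码,仅作演示):rename_key 的输入 / 输出对照,展示键名映射规则
_examples = [
    "encoder.patch_embed.proj.weight",         # -> swin.embeddings.patch_embeddings.projection.weight
    "encoder.layers.0.blocks.0.norm1.weight",  # -> swin.encoder.layers.0.blocks.0.layernorm_before.weight
    "decoder.0.weight",                        # -> decoder.0.weight(解码器键名保持不变,不加 swin. 前缀)
]
for _name in _examples:
    print(_name, "->", rename_key(_name))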


def convert_state_dict(orig_state_dict, model):
    # 定义函数,用于转换模型状态字典
    # 遍历原始状态字典中的键列表副本
    for key in orig_state_dict.copy().keys():
        # 弹出当前键对应的值
        val = orig_state_dict.pop(key)

        # 如果键名包含 "attn_mask",则跳过不处理
        if "attn_mask" in key:
            pass
        # 如果键名包含 "qkv"
        elif "qkv" in key:
            # 根据 "." 分割键名,提取层号和块号
            key_split = key.split(".")
            layer_num = int(key_split[2])
            block_num = int(key_split[4])
            # 获取当前注意力层的维度信息
            dim = model.swin.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size

            # 如果键名包含 "weight"
            if "weight" in key:
                # 更新键名和对应的值到原始状态字典中,分别更新查询、键、值的权重部分
                orig_state_dict[
                    f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight"
                ] = val[:dim, :]
                orig_state_dict[
                    f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"
                ] = val[dim : dim * 2, :]
                orig_state_dict[
                    f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight"
                ] = val[-dim:, :]
            else:
                # 更新键名和对应的值到原始状态字典中,分别更新查询、键、值的偏置部分
                orig_state_dict[
                    f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"
                ] = val[:dim]
                orig_state_dict[
                    f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"
                ] = val[dim : dim * 2]
                orig_state_dict[
                    f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"
                ] = val[-dim:]
        else:
            # 使用自定义函数将键名转换后更新到原始状态字典中
            orig_state_dict[rename_key(key)] = val

    # 返回更新后的原始状态字典
    return orig_state_dict
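
# 最小示意(非原始源码,仅作演示):融合的 qkv 权重按行切成 query/key/value 三份,
# 与上面 convert_state_dict 中的切片逻辑一致;这里的 dim 即注意力层的 all_head_size,取 4 仅作演示。
import torch

_dim = 4
_qkv_weight = torch.arange(3 * _dim * _dim, dtype=torch.float32).reshape(3 * _dim, _dim)
_query_w = _qkv_weight[:_dim, :]                  # 前 dim 行 -> query
_key_w = _qkv_weight[_dim : _dim * 2, :]          # 中间 dim 行 -> key
_value_w = _qkv_weight[-_dim:, :]                 # 最后 dim 行 -> value
assert torch.equal(torch.cat([_query_w, _key_w, _value_w], dim=0), _qkv_weight)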

# 定义函数,用于将指定的 Swin 模型检查点转换为 PyTorch 模型
def convert_swin_checkpoint(model_name, checkpoint_path, pytorch_dump_folder_path, push_to_hub):
    # 加载指定路径的检查点,并从中提取模型的状态字典
    state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]

    # 获取指定模型名称的配置
    config = get_swin_config(model_name)
    # 根据配置创建 Swin 模型对象
    model = SwinForMaskedImageModeling(config)
    # 设置模型为评估模式
    model.eval()

    # 转换模型的状态字典格式
    new_state_dict = convert_state_dict(state_dict, model)
    # 加载转换后的状态字典到模型中
    model.load_state_dict(new_state_dict)

    # 需要处理的图片的 URL
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"

    # 创建图像处理器对象,指定输出图像的大小
    image_processor = ViTImageProcessor(size={"height": 192, "width": 192})
    # 使用 requests 库获取并打开指定 URL 的图像,并用 PIL 库打开
    image = Image.open(requests.get(url, stream=True).raw)
    # 使用图像处理器处理图像,将图像转换为 PyTorch 张量格式
    inputs = image_processor(images=image, return_tensors="pt")

    # 关闭 PyTorch 自动求导功能,因为只需要进行推断
    with torch.no_grad():
        # 使用模型进行推断,获取输出 logits
        outputs = model(**inputs).logits

    # 打印模型输出的键
    print(outputs.keys())
    # 输出消息,确认一切正常
    print("Looks ok!")

    # 如果指定了输出目录路径
    if pytorch_dump_folder_path is not None:
        # 打印消息,保存模型到指定目录
        print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
        # 将模型保存到指定的目录
        model.save_pretrained(pytorch_dump_folder_path)

        # 打印消息,保存图像处理器到指定目录
        print(f"Saving image processor to {pytorch_dump_folder_path}")
        # 将图像处理器保存到指定的目录
        image_processor.save_pretrained(pytorch_dump_folder_path)

    # 如果需要将模型推送到 hub
    if push_to_hub:
        # 打印消息,将模型和图像处理器推送到 hub
        print(f"Pushing model and image processor for {model_name} to hub")
        # 推送模型到 hub
        model.push_to_hub(f"microsoft/{model_name}")
        # 推送图像处理器到 hub
        image_processor.push_to_hub(f"microsoft/{model_name}")


# 如果该脚本作为主程序运行
if __name__ == "__main__":
    # 创建命令行参数解析器对象
    parser = argparse.ArgumentParser()
    
    # 添加必需参数:模型名称
    parser.add_argument(
        "--model_name",
        default="swin-base-simmim-window6-192",
        type=str,
        choices=["swin-base-simmim-window6-192", "swin-large-simmim-window12-192"],
        help="Name of the Swin SimMIM model you'd like to convert.",
    )
    
    # 添加必需参数:原始检查点文件路径
    parser.add_argument(
        "--checkpoint_path",
        default="/Users/nielsrogge/Documents/SwinSimMIM/simmim_pretrain__swin_base__img192_window6__100ep.pth",
        type=str,
        help="Path to the original PyTorch checkpoint (.pth file).",
    )
    
    # 添加可选参数:输出 PyTorch 模型目录路径
    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    
    # 添加可选标志:是否将模型推送到 hub
    parser.add_argument(
        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
    )

    # 解析命令行参数
    args = parser.parse_args()
    # 调用转换函数,传入命令行参数
    convert_swin_checkpoint(args.model_name, args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub)

.\models\swin\convert_swin_timm_to_pytorch.py

# 导入必要的模块:命令行参数解析、JSON 数据处理、HTTP 请求、模型库导入、深度学习框架导入、图像处理库导入
import argparse
import json
import requests
import timm
import torch
from huggingface_hub import hf_hub_download
from PIL import Image

from transformers import AutoImageProcessor, SwinConfig, SwinForImageClassification

# 根据模型名称获取相应的 Swin Transformer 配置
def get_swin_config(swin_name):
    config = SwinConfig()

    # 解析模型名称
    name_split = swin_name.split("_")
    
    # 根据模型名称中的信息设置不同的参数
    model_size = name_split[1]
    img_size = int(name_split[4])
    window_size = int(name_split[3][-1])

    if model_size == "tiny":
        embed_dim = 96
        depths = (2, 2, 6, 2)
        num_heads = (3, 6, 12, 24)
    elif model_size == "small":
        embed_dim = 96
        depths = (2, 2, 18, 2)
        num_heads = (3, 6, 12, 24)
    elif model_size == "base":
        embed_dim = 128
        depths = (2, 2, 18, 2)
        num_heads = (4, 8, 16, 32)
    else:
        embed_dim = 192
        depths = (2, 2, 18, 2)
        num_heads = (6, 12, 24, 48)

    # 根据模型名称中的信息设置不同的分类数和标签映射
    if "in22k" in swin_name:
        num_classes = 21841
    else:
        num_classes = 1000
        # 从 Hugging Face Hub 下载并加载 ImageNet 分类标签映射
        repo_id = "huggingface/label-files"
        filename = "imagenet-1k-id2label.json"
        id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}

    # 设置 Swin Transformer 配置对象的各项参数
    config.image_size = img_size
    config.num_labels = num_classes
    config.embed_dim = embed_dim
    config.depths = depths
    config.num_heads = num_heads
    config.window_size = window_size

    return config
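
# 小例子(非原始源码,仅作演示):模型名 "swin_tiny_patch4_window7_224" 的解析结果
_name_split = "swin_tiny_patch4_window7_224".split("_")
print(_name_split[1])             # 'tiny'  -> 模型规模
print(int(_name_split[3][-1]))    # 7       -> 窗口大小(取 "window7" 的最后一个字符)
print(int(_name_split[4]))        # 224     -> 输入图像尺寸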

# 根据模型中的参数名字进行重命名,以适应不同的模型加载需求
def rename_key(name):
    if "patch_embed.proj" in name:
        name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
    if "patch_embed.norm" in name:
        name = name.replace("patch_embed.norm", "embeddings.norm")
    if "layers" in name:
        name = "encoder." + name
    if "attn.proj" in name:
        name = name.replace("attn.proj", "attention.output.dense")
    if "attn" in name:
        name = name.replace("attn", "attention.self")
    if "norm1" in name:
        name = name.replace("norm1", "layernorm_before")
    if "norm2" in name:
        name = name.replace("norm2", "layernorm_after")
    if "mlp.fc1" in name:
        name = name.replace("mlp.fc1", "intermediate.dense")
    if "mlp.fc2" in name:
        name = name.replace("mlp.fc2", "output.dense")

    if name == "norm.weight":
        name = "layernorm.weight"
    if name == "norm.bias":
        name = "layernorm.bias"

    if "head" in name:
        name = name.replace("head", "classifier")
    else:
        name = "swin." + name

    return name

# 将原始模型状态字典进行转换以适应不同命名的加载
def convert_state_dict(orig_state_dict, model):
    # 遍历原始状态字典的键列表的副本
    for key in orig_state_dict.copy().keys():
        # 弹出当前键对应的值,并赋给变量val
        val = orig_state_dict.pop(key)

        # 如果键名中包含 "mask",则跳过处理当前循环的剩余部分
        if "mask" in key:
            continue
        # 如果键名中包含 "qkv"
        elif "qkv" in key:
            # 将键名按 "." 分割成列表
            key_split = key.split(".")
            # 从键名中获取层号和块号,并转换为整数
            layer_num = int(key_split[1])
            block_num = int(key_split[3])
            # 获取模型中对应位置的注意力机制的维度大小
            dim = model.swin.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size

            # 如果键名中包含 "weight"
            if "weight" in key:
                # 更新原始状态字典中的权重相关键值对
                orig_state_dict[
                    f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight"
                ] = val[:dim, :]  # 更新查询权重
                orig_state_dict[
                    f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"
                ] = val[dim:dim * 2, :]  # 更新键权重
                orig_state_dict[
                    f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight"
                ] = val[-dim:, :]  # 更新值权重
            else:
                # 更新原始状态字典中的偏置相关键值对
                orig_state_dict[
                    f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"
                ] = val[:dim]  # 更新查询偏置
                orig_state_dict[
                    f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"
                ] = val[dim:dim * 2]  # 更新键偏置
                orig_state_dict[
                    f"swin.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"
                ] = val[-dim:]  # 更新值偏置
        else:
            # 对于不包含 "mask" 和 "qkv" 的键名,通过特定函数重命名后更新原始状态字典
            orig_state_dict[rename_key(key)] = val

    # 返回更新后的原始状态字典
    return orig_state_dict

# 定义函数用于将 timm 模型转换为 Swin 模型
def convert_swin_checkpoint(swin_name, pytorch_dump_folder_path):
    # 使用 timm 库创建指定预训练模型,并设置为评估模式
    timm_model = timm.create_model(swin_name, pretrained=True)
    timm_model.eval()

    # 获取指定 Swin 模型配置
    config = get_swin_config(swin_name)
    # 使用配置创建 SwinForImageClassification 模型对象,并设置为评估模式
    model = SwinForImageClassification(config)
    model.eval()

    # 转换 timm 模型的状态字典到 Swin 模型兼容格式
    new_state_dict = convert_state_dict(timm_model.state_dict(), model)
    # 加载新的状态字典到 Swin 模型
    model.load_state_dict(new_state_dict)

    # 指定测试用的图像 URL
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"

    # 从预训练模型名称创建图像处理器对象
    image_processor = AutoImageProcessor.from_pretrained("microsoft/{}".format(swin_name.replace("_", "-")))
    # 从 URL 获取图像的流数据并打开为图像对象
    image = Image.open(requests.get(url, stream=True).raw)
    # 使用图像处理器处理图像,并转换为 PyTorch 张量
    inputs = image_processor(images=image, return_tensors="pt")

    # 使用 timm 模型处理图像输入并获取输出
    timm_outs = timm_model(inputs["pixel_values"])
    # 使用 Swin 模型处理图像输入并获取输出 logits
    hf_outs = model(**inputs).logits

    # 断言两个模型输出在指定的绝对误差范围内相等
    assert torch.allclose(timm_outs, hf_outs, atol=1e-3)

    # 打印保存模型的信息,包括模型名称和保存路径
    print(f"Saving model {swin_name} to {pytorch_dump_folder_path}")
    # 将 Swin 模型保存到指定路径
    model.save_pretrained(pytorch_dump_folder_path)

    # 打印保存图像处理器的信息,包括保存路径
    print(f"Saving image processor to {pytorch_dump_folder_path}")
    # 将图像处理器保存到同一路径
    image_processor.save_pretrained(pytorch_dump_folder_path)


if __name__ == "__main__":
    # 创建参数解析器对象
    parser = argparse.ArgumentParser()
    # 添加必需参数:swin_name,指定要转换的 Swin timm 模型名称
    parser.add_argument(
        "--swin_name",
        default="swin_tiny_patch4_window7_224",
        type=str,
        help="Name of the Swin timm model you'd like to convert.",
    )
    # 添加必需参数:pytorch_dump_folder_path,指定 PyTorch 模型输出目录路径
    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )

    # 解析命令行参数
    args = parser.parse_args()
    # 调用 convert_swin_checkpoint 函数,传入解析后的参数值
    convert_swin_checkpoint(args.swin_name, args.pytorch_dump_folder_path)

.\models\swin\modeling_swin.py

# 设置文件编码为 UTF-8
# 版权声明,版权归 Microsoft Research 和 HuggingFace Inc. 团队所有

# 根据 Apache 许可证 2.0 版本,除非符合许可证,否则不得使用此文件
# 可以在以下链接获取许可证副本:http://www.apache.org/licenses/LICENSE-2.0

# 除非适用法律要求或书面同意,否则本软件根据“原样”分发,不附带任何形式的保证或条件
# 详见许可证,以获取具体的语言表达和限制条件

""" PyTorch Swin Transformer model."""

# 导入必要的模块和库
import collections.abc  # 导入 collections.abc 模块
import math  # 导入 math 模块
import warnings  # 导入 warnings 模块
from dataclasses import dataclass  # 从 dataclasses 模块导入 dataclass 装饰器
from typing import Optional, Tuple, Union  # 导入类型提示所需的类和元组

import torch  # 导入 PyTorch 库
import torch.utils.checkpoint  # 导入 PyTorch 的 checkpoint 模块
from torch import nn  # 从 torch 导入 nn 模块
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss  # 从 torch.nn 导入损失函数

# 导入自定义模块和函数
from ...activations import ACT2FN  # 从上级目录导入 ACT2FN 函数
from ...modeling_outputs import BackboneOutput  # 从上级目录导入 BackboneOutput 类
from ...modeling_utils import PreTrainedModel  # 从上级目录导入 PreTrainedModel 类
from ...pytorch_utils import (  # 从上级目录导入多个函数和类
    find_pruneable_heads_and_indices,
    meshgrid,
    prune_linear_layer
)
from ...utils import (  # 从上级目录导入多个实用函数和类
    ModelOutput,
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings
)
from ...utils.backbone_utils import BackboneMixin  # 从上级目录导入 BackboneMixin 类
from .configuration_swin import SwinConfig  # 从当前目录导入 SwinConfig 类

# 获取 logger 对象
logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "SwinConfig"  # _CONFIG_FOR_DOC 变量的文档说明

# Base docstring
_CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224"  # _CHECKPOINT_FOR_DOC 变量的文档说明
_EXPECTED_OUTPUT_SHAPE = [1, 49, 768]  # _EXPECTED_OUTPUT_SHAPE 变量的文档说明

# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "microsoft/swin-tiny-patch4-window7-224"  # _IMAGE_CLASS_CHECKPOINT 变量的文档说明
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"  # _IMAGE_CLASS_EXPECTED_OUTPUT 变量的文档说明

# Swin 预训练模型的存档列表
SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/swin-tiny-patch4-window7-224",
    # 查看所有 Swin 模型:https://huggingface.co/models?filter=swin
]

# drop_path, SwinPatchEmbeddings, SwinPatchMerging 和 SwinDropPath 来自 timm 库
# 以下为 SwinEncoderOutput 数据类,封装 Swin 编码器的输出
@dataclass
class SwinEncoderOutput(ModelOutput):
    """
    Swin 编码器的输出,可能包含隐藏状态和注意力权重。

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            模型最后一层的隐藏状态序列,形状为 `(batch_size, sequence_length, hidden_size)`.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            一个元组,包含了模型每一层的隐藏状态,形状为 `(batch_size, sequence_length, hidden_size)`.
    
            模型每一层的隐藏状态以及初始嵌入输出的列表。
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            一个元组,包含了每个阶段的注意力权重,形状为 `(batch_size, num_heads, sequence_length, sequence_length)`.
    
            经过注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            一个元组,包含了模型每一层的隐藏状态,形状为 `(batch_size, hidden_size, height, width)`.
    
            模型每一层的隐藏状态以及初始嵌入输出,重塑以包含空间维度。
    """
    
    # 定义变量并初始化为 None,用来存储模型输出的各种信息
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@dataclass
class SwinModelOutput(ModelOutput):
    """
    Swin model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    last_hidden_state: torch.FloatTensor = None
    pooler_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class SwinMaskedImageModelingOutput(ModelOutput):
    """
    Swin masked image model outputs.

    This class is a data structure to hold outputs from a Swin Transformer model applied to masked image inputs.
    It extends the `ModelOutput` class.

    It includes the following attributes:

    """
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
            Masked image modeling (MLM) loss.
            图像掩码建模(MLM)损失,当提供 `bool_masked_pos` 时返回。
        reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Reconstructed pixel values.
            重建的像素值,形状为 `(batch_size, num_channels, height, width)`。
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
            模型每层输出的隐藏状态,包括初始嵌入输出。
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
            经过注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均。
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
            每层输出的隐藏状态,包括重塑以包含空间维度的初始嵌入输出。

    """

    # Optional 类型的变量定义,用于存储损失值,默认为 None
    loss: Optional[torch.FloatTensor] = None
    # 定义变量,存储重建后的像素值,类型为 torch.FloatTensor
    reconstruction: torch.FloatTensor = None
    # Optional 类型的变量定义,存储隐藏状态的元组,默认为 None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Optional 类型的变量定义,存储注意力权重的元组,默认为 None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    # Optional 类型的变量定义,存储重塑后的隐藏状态的元组,默认为 None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None

    @property
    def logits(self):
        # 发出警告信息,提示 logits 属性即将在 Transformers 的第 5 版中被移除
        warnings.warn(
            "logits attribute is deprecated and will be removed in version 5 of Transformers."
            " Please use the reconstruction attribute to retrieve the final output instead.",
            FutureWarning,
        )
        # 返回 reconstruction 属性作为最终输出
        return self.reconstruction
@dataclass
class SwinImageClassifierOutput(ModelOutput):
    """
    Swin outputs for image classification.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None  # 分类(或回归,如果config.num_labels==1)的损失值
    logits: torch.FloatTensor = None  # 分类(或回归,如果config.num_labels==1)的得分(SoftMax之前)
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None  # 模型每层输出的隐藏状态及初始嵌入输出的元组
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None  # 注意力权重的元组,用于计算自注意力头中的加权平均值
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None  # 每层输出的隐藏状态及初始嵌入输出的元组,包括空间维度

def window_partition(input_feature, window_size):
    """
    Partitions the given input into windows.
    """
    batch_size, height, width, num_channels = input_feature.shape  # 获取输入特征的形状信息
    input_feature = input_feature.view(  # 将输入特征重塑为窗口大小的块
        batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
    )
    windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(  # 对重塑后的特征进行维度置换和重新排序,得到窗口列表
        -1, window_size, window_size, num_channels)
    return windows  # 返回分区后的窗口列表

def window_reverse(windows, window_size, height, width):
    """
    Merges windows to produce higher resolution features.
    """
    num_channels = windows.shape[-1]  # 获取窗口的通道数
    windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)  # 将窗口重新排列以恢复高分辨率特征
    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(  # 对重排的窗口进行维度置换和重新排序
        -1, height, width, num_channels)
    # 返回 windows 变量的值作为函数的结果
    return windows
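
# 最小示意(非原始源码,仅作演示):window_partition 与 window_reverse 互为逆操作
# (假设高、宽都能被窗口大小整除,并使用上面定义的两个函数)。
_x = torch.randn(2, 8, 8, 3)                          # (batch, height, width, channels)
_windows = window_partition(_x, window_size=4)        # -> (2 * 2 * 2, 4, 4, 3) = (8, 4, 4, 3)
_restored = window_reverse(_windows, 4, height=8, width=8)
assert torch.equal(_restored, _x)                     # 分窗后再合并,可精确还原原始特征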
class SwinEmbeddings(nn.Module):
    """
    Construct the patch and position embeddings. Optionally, also the mask token.
    """

    def __init__(self, config, use_mask_token=False):
        super().__init__()

        self.patch_embeddings = SwinPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches  # 获取patch嵌入的patch数目
        self.patch_grid = self.patch_embeddings.grid_size  # 获取patch网格大小
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None  # 如果使用mask token,则创建一个可训练的零张量

        if config.use_absolute_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim))  # 如果使用绝对位置嵌入,则创建一个可训练的零张量
        else:
            self.position_embeddings = None  # 否则位置嵌入设为None

        self.norm = nn.LayerNorm(config.embed_dim)  # 创建LayerNorm层,用于标准化嵌入
        self.dropout = nn.Dropout(config.hidden_dropout_prob)  # 创建Dropout层,用于随机dropout

    def forward(
        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
    ) -> Tuple[torch.Tensor]:
        embeddings, output_dimensions = self.patch_embeddings(pixel_values)  # 获得patch嵌入和输出维度
        embeddings = self.norm(embeddings)  # 对嵌入进行标准化
        batch_size, seq_len, _ = embeddings.size()  # 获取批量大小、序列长度和嵌入维度

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)  # 将mask token扩展到与嵌入张量相同的维度
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)  # 将布尔掩码转换为与mask token相同类型的张量
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask  # 根据掩码替换被遮蔽的视觉token为mask token

        if self.position_embeddings is not None:
            embeddings = embeddings + self.position_embeddings  # 如果存在位置嵌入,则加上位置嵌入

        embeddings = self.dropout(embeddings)  # 对嵌入进行随机dropout

        return embeddings, output_dimensions


class SwinPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.embed_dim
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])  # 计算图像分割成patch后的数量
        self.image_size = image_size  # 图像大小
        self.patch_size = patch_size  # patch大小
        self.num_channels = num_channels  # 输入通道数
        self.num_patches = num_patches  # patch数目
        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])  # patch的网格大小

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
        # 创建2D卷积层,用于将输入的像素值转换为patch嵌入
    # 可能对输入进行填充,使其能被 self.patch_size 整除(如果需要)
    def maybe_pad(self, pixel_values, height, width):
        if width % self.patch_size[1] != 0:
            # 计算需要填充的值,使得宽度能够被 patch_size[1] 整除
            pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
            # 使用 nn.functional.pad 函数进行填充操作
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        if height % self.patch_size[0] != 0:
            # 计算需要填充的值,使得高度能够被 patch_size[0] 整除
            pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
            # 使用 nn.functional.pad 函数进行填充操作
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        return pixel_values

    # 前向传播函数,接受像素值作为输入,返回嵌入张量和输出尺寸元组
    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
        # 获取输入张量的形状信息
        _, num_channels, height, width = pixel_values.shape
        # 检查通道维度是否与配置中设置的通道数匹配,如果不匹配则引发 ValueError
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # 可能对输入进行填充操作,使其能被 patch_size 整除
        pixel_values = self.maybe_pad(pixel_values, height, width)
        # 将填充后的输入传递给投影层,生成嵌入张量
        embeddings = self.projection(pixel_values)
        # 获取嵌入张量的形状信息
        _, _, height, width = embeddings.shape
        # 计算输出的高度和宽度尺寸
        output_dimensions = (height, width)
        # 将嵌入张量展平并转置,以便输出
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, output_dimensions
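
# 形状示意(非原始源码,仅作演示):默认配置下 224x224 输入、patch 大小为 4,
# 得到 56 * 56 = 3136 个 patch,每个 patch 的嵌入维度为 96。
_patch_embed = SwinPatchEmbeddings(SwinConfig())
_pixel_values = torch.randn(1, 3, 224, 224)
_embeddings, (_h, _w) = _patch_embed(_pixel_values)
print(_embeddings.shape, _h, _w)                      # torch.Size([1, 3136, 96]) 56 56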
class SwinPatchMerging(nn.Module):
    """
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    """

    def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        self.input_resolution = input_resolution  # 存储输入特征的分辨率
        self.dim = dim  # 存储输入通道数
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)  # 线性变换,用于特征维度的降维
        self.norm = norm_layer(4 * dim)  # 使用指定的归一化层对特征进行归一化

    def maybe_pad(self, input_feature, height, width):
        should_pad = (height % 2 == 1) or (width % 2 == 1)
        if should_pad:
            pad_values = (0, 0, 0, width % 2, 0, height % 2)
            input_feature = nn.functional.pad(input_feature, pad_values)  # 如果需要,对输入特征进行填充

        return input_feature

    def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
        height, width = input_dimensions
        # `dim` is height * width
        batch_size, dim, num_channels = input_feature.shape

        input_feature = input_feature.view(batch_size, height, width, num_channels)  # 重新组织输入特征的形状
        input_feature = self.maybe_pad(input_feature, height, width)  # 调用填充函数进行可能的填充操作

        # 分割输入特征并组合成新的特征图
        input_feature_0 = input_feature[:, 0::2, 0::2, :]  # 取出每隔一个像素的子图
        input_feature_1 = input_feature[:, 1::2, 0::2, :]  # 取出每隔一个像素的子图
        input_feature_2 = input_feature[:, 0::2, 1::2, :]  # 取出每隔一个像素的子图
        input_feature_3 = input_feature[:, 1::2, 1::2, :]  # 取出每隔一个像素的子图

        # 将分割的子图按通道方向连接起来
        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)

        # 重新调整形状,合并后的特征图
        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)

        input_feature = self.norm(input_feature)  # 对连接后的特征进行归一化
        input_feature = self.reduction(input_feature)  # 使用线性变换减少特征维度

        return input_feature
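
# 形状示意(非原始源码,仅作演示):patch merging 把空间分辨率减半、通道数翻倍。
_merging = SwinPatchMerging(input_resolution=(56, 56), dim=96)
_feature = torch.randn(1, 56 * 56, 96)                # (batch, H*W, C)
_merged = _merging(_feature, (56, 56))
print(_merged.shape)                                  # torch.Size([1, 784, 192]),即 (28*28, 2*96)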


# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    # 如果 drop_prob 为 0 或者不处于训练状态,则直接返回输入,无需执行 Dropout
    if drop_prob == 0.0 or not training:
        return input
    # 计算保留概率
    keep_prob = 1 - drop_prob
    # 创建形状与输入相同的随机张量,用于决定每个神经元的保留情况
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # 适用于不同维度张量,不仅限于2D卷积网络
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # 将随机张量二值化为0或1
    # 应用 Dropout,乘以随机张量并除以保留概率
    output = input.div(keep_prob) * random_tensor
    # 返回 Dropout 后的输出
    return output
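
# 最小示意(非原始源码,仅作演示):drop_path 在训练时按样本整体丢弃残差路径,并用 1/keep_prob 缩放;
# 推理(training=False)时原样返回输入。使用上面定义的 drop_path 函数。
_x = torch.ones(4, 2, 3)                              # 4 个样本
_out = drop_path(_x, drop_prob=0.5, training=True)
print(_out[:, 0, 0])                                  # 每个样本要么整体为 0.0,要么整体被放大为 1/0.5 = 2.0
print(torch.equal(drop_path(_x, drop_prob=0.5, training=False), _x))   # True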
# 从 transformers.models.beit.modeling_beit.BeitDropPath 复制,并将 Beit->Swin
class SwinDropPath(nn.Module):
    """每个样本应用于残差块主路径的丢弃路径(随机深度)。"""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class SwinSelfAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        if dim % num_heads != 0:
            raise ValueError(
                f"隐藏大小 ({dim}) 不是注意力头数 ({num_heads}) 的倍数"
            )

        self.num_attention_heads = num_heads
        self.attention_head_size = int(dim / num_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.window_size = (
            window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
        )

        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
        )

        # 获取每个窗口内每个标记的成对相对位置索引
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)
        self.register_buffer("relative_position_index", relative_position_index)

        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        ) -> Tuple[torch.Tensor]:  # 函数定义,返回一个元组,包含一个 torch.Tensor
        batch_size, dim, num_channels = hidden_states.shape  # 获取隐藏状态的批量大小、维度和通道数
        mixed_query_layer = self.query(hidden_states)  # 使用查询函数处理隐藏状态得到混合查询层

        key_layer = self.transpose_for_scores(self.key(hidden_states))  # 使用键函数处理隐藏状态并转置得到键层
        value_layer = self.transpose_for_scores(self.value(hidden_states))  # 使用值函数处理隐藏状态并转置得到值层
        query_layer = self.transpose_for_scores(mixed_query_layer)  # 转置处理混合查询层得到查询层

        # 计算查询与键的点积,得到原始注意力分数
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)  # 根据头部大小对注意力分数进行缩放

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        relative_position_bias = relative_position_bias.view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
        )

        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)  # 添加相对位置偏置到注意力分数中

        if attention_mask is not None:
            # 如果存在注意力掩码,则应用它(在 SwinModel 的 forward() 函数中预先计算)
            mask_shape = attention_mask.shape[0]
            attention_scores = attention_scores.view(
                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
            )
            attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)

        # 将注意力分数归一化为注意力概率
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # 使用 dropout 随机丢弃整个 token 的注意力概率
        attention_probs = self.dropout(attention_probs)

        # 如果有头部掩码,则应用它
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)  # 计算上下文层
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)  # 准备输出结果

        return outputs  # 返回计算后的输出
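
# 最小示意(非原始源码,仅作演示):window_size=2 时相对位置索引的计算,与上面 __init__ 中的逻辑一致;
# 索引取值范围为 0..(2*2-1)**2-1 = 0..8,正好对应相对位置偏置表的 9 行。
_ws = (2, 2)
_coords_h = torch.arange(_ws[0])
_coords_w = torch.arange(_ws[1])
_coords = torch.stack(torch.meshgrid(_coords_h, _coords_w, indexing="ij"))    # (2, 2, 2)
_coords_flatten = torch.flatten(_coords, 1)                                   # (2, 4)
_relative_coords = _coords_flatten[:, :, None] - _coords_flatten[:, None, :]  # (2, 4, 4)
_relative_coords = _relative_coords.permute(1, 2, 0).contiguous()
_relative_coords[:, :, 0] += _ws[0] - 1
_relative_coords[:, :, 1] += _ws[1] - 1
_relative_coords[:, :, 0] *= 2 * _ws[1] - 1
_relative_position_index = _relative_coords.sum(-1)                           # (4, 4)
print(_relative_position_index)                       # 对角线元素均为 4,即每个 token 相对自身的索引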
# 定义一个名为 SwinSelfOutput 的类,继承自 nn.Module
class SwinSelfOutput(nn.Module):
    # 初始化方法,接收 config 和 dim 参数
    def __init__(self, config, dim):
        # 调用父类的初始化方法
        super().__init__()
        # 创建一个线性层,输入和输出维度均为 dim
        self.dense = nn.Linear(dim, dim)
        # 创建一个 Dropout 层,使用 config 中的注意力概率参数
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    # 前向传播方法,接收 hidden_states 和 input_tensor 两个 Tensor 作为输入,返回一个 Tensor
    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 将 hidden_states 输入到 self.dense 中进行线性变换
        hidden_states = self.dense(hidden_states)
        # 对线性变换后的结果使用 Dropout 层
        hidden_states = self.dropout(hidden_states)

        # 返回处理后的 hidden_states
        return hidden_states


# 定义一个名为 SwinAttention 的类,继承自 nn.Module
class SwinAttention(nn.Module):
    # 初始化方法,接收 config、dim、num_heads 和 window_size 参数
    def __init__(self, config, dim, num_heads, window_size):
        # 调用父类的初始化方法
        super().__init__()
        # 创建一个 SwinSelfAttention 对象,传入 config、dim、num_heads 和 window_size 参数
        self.self = SwinSelfAttention(config, dim, num_heads, window_size)
        # 创建一个 SwinSelfOutput 对象,传入 config 和 dim 参数
        self.output = SwinSelfOutput(config, dim)
        # 创建一个空集合,用于存储需要剪枝的注意力头索引
        self.pruned_heads = set()

    # 剪枝注意力头的方法,接收 heads 参数
    def prune_heads(self, heads):
        # 如果 heads 集合为空,则直接返回
        if len(heads) == 0:
            return
        # 调用 find_pruneable_heads_and_indices 函数,获取可剪枝的头部索引和相关信息
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # 对 self.self 中的 query、key、value 和 output.dense 层进行剪枝
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并存储被剪枝的头部索引
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    # 前向传播方法,接收 hidden_states、attention_mask、head_mask 和 output_attentions 四个参数
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 调用 self.self 的前向传播方法,计算自注意力输出
        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
        # 将 self_outputs[0] 和 hidden_states 输入到 self.output 中,计算注意力输出
        attention_output = self.output(self_outputs[0], hidden_states)
        # 将注意力输出和可能的额外注意力返回作为元组 outputs 的一部分
        outputs = (attention_output,) + self_outputs[1:]  # 如果有额外的注意力输出,添加到 outputs 中
        return outputs


# 定义一个名为 SwinIntermediate 的类,继承自 nn.Module
class SwinIntermediate(nn.Module):
    # 初始化方法,接收 config 和 dim 参数
    def __init__(self, config, dim):
        # 调用父类的初始化方法
        super().__init__()
        # 创建一个线性层,将 dim 维度映射到 config.mlp_ratio * dim 维度
        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
        # 如果 config.hidden_act 是字符串类型,则使用 ACT2FN 字典中对应的激活函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    # 前向传播方法,接收 hidden_states 作为输入,返回处理后的 hidden_states
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 将 hidden_states 输入到 self.dense 中进行线性变换
        hidden_states = self.dense(hidden_states)
        # 对线性变换后的结果使用 intermediate_act_fn 激活函数
        hidden_states = self.intermediate_act_fn(hidden_states)
        # 返回处理后的 hidden_states
        return hidden_states


# 定义一个名为 SwinOutput 的类,继承自 nn.Module
class SwinOutput(nn.Module):
    # 初始化方法,接收 config 和 dim 参数
    def __init__(self, config, dim):
        # 调用父类的初始化方法
        super().__init__()
        # 创建一个线性层,将 config.mlp_ratio * dim 维度映射到 dim 维度
        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
        # 创建一个 Dropout 层,使用 config 中的隐藏层 Dropout 概率
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
    # 定义前向传播方法,接受隐藏状态张量作为输入,并返回处理后的张量
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 将输入张量通过全连接层进行线性变换
        hidden_states = self.dense(hidden_states)
        # 对线性变换后的张量应用丢弃(dropout)操作,以防止过拟合
        hidden_states = self.dropout(hidden_states)
        # 返回经过全连接层和丢弃操作后的张量作为输出
        return hidden_states
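
SwinIntermediate 与 SwinOutput 连在一起就是 Transformer 中标准的 MLP:dim -> mlp_ratio*dim -> dim。下面用一小段独立代码验证维度变化(dim=96、mlp_ratio=4.0 等数值仅作假设):

import torch
from torch import nn

dim, mlp_ratio = 96, 4.0
mlp = nn.Sequential(
    nn.Linear(dim, int(mlp_ratio * dim)),  # 对应 SwinIntermediate.dense:96 -> 384
    nn.GELU(),                             # 对应 config.hidden_act 的默认激活函数
    nn.Linear(int(mlp_ratio * dim), dim),  # 对应 SwinOutput.dense:384 -> 96
    nn.Dropout(0.0),                       # 对应 config.hidden_dropout_prob
)
x = torch.randn(2, 49, dim)  # (batch, window_size*window_size, dim)
print(mlp(x).shape)          # torch.Size([2, 49, 96])
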
# SwinLayer 类定义,继承自 nn.Module
class SwinLayer(nn.Module):
    # 初始化函数,接受配置、维度、输入分辨率、头数、偏移大小等参数
    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
        super().__init__()
        # 设置前馈传播使用的分块大小
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # 设置偏移大小
        self.shift_size = shift_size
        # 窗口大小
        self.window_size = config.window_size
        # 输入分辨率
        self.input_resolution = input_resolution
        # 在注意力操作前使用 LayerNorm
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        # 使用 SwinAttention 类进行注意力计算
        self.attention = SwinAttention(config, dim, num_heads, window_size=self.window_size)
        # 如果存在丢弃路径率,则应用 SwinDropPath;否则使用恒等映射
        self.drop_path = SwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
        # 在注意力操作后使用 LayerNorm
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        # SwinIntermediate 类,处理注意力后的中间层输出
        self.intermediate = SwinIntermediate(config, dim)
        # SwinOutput 类,生成最终的输出
        self.output = SwinOutput(config, dim)

    # 设置偏移和窗口大小的方法,根据输入分辨率调整
    def set_shift_and_window_size(self, input_resolution):
        if min(input_resolution) <= self.window_size:
            # 如果窗口大小大于输入分辨率,则不分割窗口
            self.shift_size = 0
            self.window_size = min(input_resolution)

    # 生成注意力掩码的方法,根据高度、宽度和数据类型生成不同形状的掩码
    def get_attn_mask(self, height, width, dtype):
        if self.shift_size > 0:
            # 如果设置了偏移大小,则计算 SW-MSA 的注意力掩码
            img_mask = torch.zeros((1, height, width, 1), dtype=dtype)
            height_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            width_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            count = 0
            for height_slice in height_slices:
                for width_slice in width_slices:
                    img_mask[:, height_slice, width_slice, :] = count
                    count += 1

            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None
        return attn_mask

    # 在输入状态可能需要填充时进行填充的方法
    def maybe_pad(self, hidden_states, height, width):
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
        hidden_states = nn.functional.pad(hidden_states, pad_values)
        return hidden_states, pad_values

    # 前向传播方法,接受输入状态张量、输入尺寸、头部掩码、输出注意力和是否总是分割的布尔参数
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # 如果不是始终分区,则根据输入维度设置位移和窗口大小
        if not always_partition:
            self.set_shift_and_window_size(input_dimensions)
        else:
            # 如果始终分区,不执行任何操作
            pass
        # 解包输入维度
        height, width = input_dimensions
        # 解包隐藏状态的批量大小、序列长度(即 height*width)和通道数
        batch_size, _, channels = hidden_states.size()
        # 备份隐藏状态
        shortcut = hidden_states

        # 在计算注意力之前应用第一个层归一化(layernorm_before)
        hidden_states = self.layernorm_before(hidden_states)

        # 将隐藏状态重塑为 [batch_size, height, width, channels] 的形状
        hidden_states = hidden_states.view(batch_size, height, width, channels)

        # 对隐藏状态进行填充,使其大小为窗口大小的倍数
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)

        # 获取填充后的隐藏状态的维度信息
        _, height_pad, width_pad, _ = hidden_states.shape

        # 如果位移大小大于 0,则对隐藏状态进行循环移位操作
        if self.shift_size > 0:
            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_hidden_states = hidden_states

        # 分区窗口
        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
        # 将分区后的窗口重塑为 [batch_size * num_windows, window_size * window_size, channels] 的形状
        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)

        # 获取注意力掩码
        attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype)
        if attn_mask is not None:
            attn_mask = attn_mask.to(hidden_states_windows.device)

        # 应用注意力机制
        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
        )

        # 获取注意力输出
        attention_output = attention_outputs[0]

        # 将注意力输出重塑为 [batch_size * num_windows, window_size, window_size, channels] 的形状
        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)

        # 反转窗口分区操作
        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)

        # 如果位移大小大于 0,则反转循环位移操作
        if self.shift_size > 0:
            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            attention_windows = shifted_windows

        # 检查是否进行了填充
        was_padded = pad_values[3] > 0 or pad_values[5] > 0
        if was_padded:
            # 如果进行了填充,则截取注意力窗口以匹配原始输入尺寸
            attention_windows = attention_windows[:, :height, :width, :].contiguous()

        # 将注意力窗口重塑为 [batch_size, height * width, channels] 的形状
        attention_windows = attention_windows.view(batch_size, height * width, channels)

        # 将捷径添加到注意力窗口上,并应用丢弃路径
        hidden_states = shortcut + self.drop_path(attention_windows)

        # 在注意力残差连接之后应用第二个层归一化(layernorm_after)
        layer_output = self.layernorm_after(hidden_states)

        # 应用中间层
        layer_output = self.intermediate(layer_output)

        # 应用输出层
        layer_output = hidden_states + self.output(layer_output)

        # 返回层输出及注意力信息(如果需要)
        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
        return layer_outputs
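
SwinLayer 的整体流程可以概括为:可选的循环移位 -> 切分为不重叠窗口 -> 窗口内自注意力 -> 还原窗口 -> 反向移位。下面是一个只关注形状的独立演示(window_partition_demo 是按上文语义重写的示意实现,并非直接引用源码中的 window_partition):

import torch

def window_partition_demo(x: torch.Tensor, window_size: int) -> torch.Tensor:
    # (batch, height, width, channels) -> (batch*num_windows, window_size, window_size, channels)
    b, h, w, c = x.shape
    x = x.view(b, h // window_size, window_size, w // window_size, window_size, c)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, c)

batch, height, width, channels = 1, 8, 8, 16
window_size, shift_size = 4, 2
x = torch.randn(batch, height, width, channels)

# SW-MSA:先把特征图向左上循环移位 shift_size,再切分成不重叠窗口
shifted = torch.roll(x, shifts=(-shift_size, -shift_size), dims=(1, 2))
windows = window_partition_demo(shifted, window_size)
print(windows.shape)  # torch.Size([4, 4, 4, 16]):共 2x2=4 个 4x4 窗口

# 注意力计算完、窗口还原之后,再向右下 roll 回来即可恢复原始位置
restored = torch.roll(shifted, shifts=(shift_size, shift_size), dims=(1, 2))
print(torch.allclose(restored, x))  # True
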
# 定义 SwinStage 类,继承自 nn.Module,用于实现一个 Swin Transformer 的阶段
class SwinStage(nn.Module):
    # 初始化方法,接受多个参数来配置 SwinStage 实例
    def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
        super().__init__()
        # 将传入的配置参数保存到实例中
        self.config = config
        # 设置维度参数
        self.dim = dim
        # 创建 SwinLayer 组成的模块列表
        self.blocks = nn.ModuleList(
            [
                SwinLayer(
                    config=config,
                    dim=dim,
                    input_resolution=input_resolution,
                    num_heads=num_heads,
                    shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                )
                for i in range(depth)
            ]
        )

        # 如果 downsample 参数不为 None,则创建一个下采样层
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
        else:
            self.downsample = None

        # 初始化指向性为 False
        self.pointing = False

    # 前向传播方法,接受多个输入参数并返回多个输出参数
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 解包输入维度
        height, width = input_dimensions
        # 遍历 SwinLayer 模块列表
        for i, layer_module in enumerate(self.blocks):
            # 如果 head_mask 不为 None,则取出当前层的头部掩码
            layer_head_mask = head_mask[i] if head_mask is not None else None

            # 调用当前层模块的 forward 方法计算输出
            layer_outputs = layer_module(
                hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
            )

            # 更新 hidden_states 为当前层的输出
            hidden_states = layer_outputs[0]

        # 在进行下采样之前保存当前 hidden_states
        hidden_states_before_downsampling = hidden_states
        # 如果存在下采样层,则进行下采样操作
        if self.downsample is not None:
            # 计算下采样后的高度和宽度
            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
            # 更新输出维度信息
            output_dimensions = (height, width, height_downsampled, width_downsampled)
            # 调用下采样层的 forward 方法进行下采样
            hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
        else:
            # 没有下采样时,输出维度信息保持不变
            output_dimensions = (height, width, height, width)

        # 组装阶段的所有输出结果
        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)

        # 如果开启了输出注意力权重,则将每层的注意力权重输出加入到 stage_outputs 中
        if output_attentions:
            stage_outputs += layer_outputs[1:]
        
        # 返回阶段的所有输出结果作为元组
        return stage_outputs
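
结合上面的 SwinStage 可以看到两点规律:同一 stage 内第偶数个 block 使用 W-MSA(shift_size=0)、第奇数个 block 使用 SW-MSA(shift_size=window_size//2);只有存在 downsample 层时空间分辨率才减半。下面用一小段独立代码把这两条规律算出来(window_size=7、56x56 等数值仅作假设):

window_size, depth = 7, 6
shift_sizes = [0 if i % 2 == 0 else window_size // 2 for i in range(depth)]
print(shift_sizes)  # [0, 3, 0, 3, 0, 3]:W-MSA 与 SW-MSA 交替出现

height, width = 56, 56
has_downsample = True
if has_downsample:
    output_dimensions = (height, width, (height + 1) // 2, (width + 1) // 2)
else:
    output_dimensions = (height, width, height, width)
print(output_dimensions)  # (56, 56, 28, 28)
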
# 定义 SwinEncoder 类,继承自 nn.Module,用于按顺序堆叠所有的 SwinStage
class SwinEncoder(nn.Module):
    # 初始化函数,接受配置和网格大小作为参数
    def __init__(self, config, grid_size):
        # 调用父类的初始化方法
        super().__init__()
        # 计算网络层数
        self.num_layers = len(config.depths)
        # 存储配置信息
        self.config = config
        # 根据 drop_path_rate 参数生成一组 drop path 概率列表
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
        # 创建网络层的 ModuleList,每层是一个 SwinStage 实例
        self.layers = nn.ModuleList(
            [
                SwinStage(
                    config=config,
                    # 计算当前层的嵌入维度
                    dim=int(config.embed_dim * 2**i_layer),
                    # 计算输入分辨率
                    input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
                    # 设置当前层的深度
                    depth=config.depths[i_layer],
                    # 设置当前层的注意力头数
                    num_heads=config.num_heads[i_layer],
                    # 提取当前层的 drop path 概率列表
                    drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                    # 如果不是最后一层,则设置 downsample 为 SwinPatchMerging 类
                    downsample=SwinPatchMerging if (i_layer < self.num_layers - 1) else None,
                )
                # 遍历所有网络层
                for i_layer in range(self.num_layers)
            ]
        )

        # 默认关闭梯度检查点
        self.gradient_checkpointing = False

    # 前向传播函数,接受输入的隐藏状态、输入维度和可选的掩码、输出配置等参数
    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        always_partition: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, SwinEncoderOutput]:
        # 前向传播的具体实现(逐层调用各个 SwinStage,按需做梯度检查点并收集隐藏状态与注意力)在本文摘录中省略
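
以类似 Swin-Tiny 的配置(embed_dim=96、depths=[2, 2, 6, 2]、224x224 输入、patch 大小 4,均为假设的示例数值)代入上面 __init__ 中的表达式,可以手工推算出每个 stage 的通道数、输入分辨率以及 drop path 概率切片的长度:

import torch

embed_dim, depths = 96, [2, 2, 6, 2]
grid_size = (224 // 4, 224 // 4)  # patch_size=4 时的 patch 网格,即 (56, 56)
drop_path_rate = 0.1

dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
for i_layer in range(len(depths)):
    dim = int(embed_dim * 2**i_layer)
    resolution = (grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer))
    dp_slice = dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])]
    print(i_layer, dim, resolution, len(dp_slice))
# 0 96 (56, 56) 2
# 1 192 (28, 28) 2
# 2 384 (14, 14) 6
# 3 768 (7, 7) 2
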
class SwinPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 指定配置类为 SwinConfig
    config_class = SwinConfig
    # 基础模型前缀为 "swin"
    base_model_prefix = "swin"
    # 主输入名称为 "pixel_values"
    main_input_name = "pixel_values"
    # 支持梯度检查点
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        # 如果是线性层或卷积层,使用正态分布初始化权重
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # 与 TensorFlow 版本稍有不同,PyTorch 使用正态分布进行初始化
            # 参考 https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        # 如果是 LayerNorm 层,初始化偏置为零,权重为全1
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


SWIN_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`SwinConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

SWIN_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
    """
    add_pooling_layer (`bool`, *optional*, defaults to `True`):
            是否应用池化层。
    use_mask_token (`bool`, *optional*, defaults to `False`):
            是否在嵌入层中创建和应用掩码标记。
    """
)
# 定义一个名为 SwinModel 的类,继承自 SwinPreTrainedModel
class SwinModel(SwinPreTrainedModel):
    # 初始化方法,接受配置参数 config、是否添加池化层 add_pooling_layer 和是否使用掩码令牌 use_mask_token
    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
        # 调用父类的初始化方法
        super().__init__(config)
        # 将参数 config 存储为对象的配置
        self.config = config
        # 计算层数和特征数
        self.num_layers = len(config.depths)
        self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))

        # 创建 SwinEmbeddings 对象并存储在 self.embeddings 中
        self.embeddings = SwinEmbeddings(config, use_mask_token=use_mask_token)
        # 创建 SwinEncoder 对象并存储在 self.encoder 中,传入 patch_grid 参数
        self.encoder = SwinEncoder(config, self.embeddings.patch_grid)

        # 使用 nn.LayerNorm 初始化 layernorm 层,特征数为 self.num_features,epsilon 为 config.layer_norm_eps
        self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
        # 如果 add_pooling_layer 为 True,则创建 AdaptiveAvgPool1d 池化层,存储在 self.pooler 中;否则为 None
        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None

        # 调用 post_init 方法完成权重初始化和最终处理
        self.post_init()

    # 获取输入嵌入的方法,返回 patch_embeddings 属性
    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    # 头部剪枝方法,用于剪枝模型中的注意力头
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # 遍历 heads_to_prune 字典,对每一层指定需要剪枝的注意力头
        for layer, heads in heads_to_prune.items():
            # 调用 encoder 层的每一层的 attention.prune_heads 方法,剪枝指定的注意力头
            self.encoder.layer[layer].attention.prune_heads(heads)

    # forward 方法,模型的前向传播
    @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SwinModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SwinModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        # 根据需要设置输出注意力矩阵,默认使用配置中的设置
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 根据需要设置输出隐藏状态,默认使用配置中的设置
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 根据需要设置返回类型,默认使用配置中的设置
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            # 如果未提供像素值,抛出数值错误
            raise ValueError("You have to specify pixel_values")

        # 如果需要,准备头部掩码
        # head_mask 中的 1.0 表示保留该头部
        # attention_probs 的形状为 bsz x n_heads x N x N
        # 输入的 head_mask 形状为 [num_heads] 或者 [num_hidden_layers x num_heads]
        # 将 head_mask 转换为形状 [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, len(self.config.depths))

        # 将像素值和布尔掩码位置传递给嵌入层进行处理
        embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        # 使用编码器处理嵌入的输出
        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 获取编码器的序列输出并进行层归一化
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        # 初始化池化输出为 None
        pooled_output = None
        if self.pooler is not None:
            # 如果存在池化层,使用池化层对序列输出进行池化,并展平
            pooled_output = self.pooler(sequence_output.transpose(1, 2))
            pooled_output = torch.flatten(pooled_output, 1)

        if not return_dict:
            # 如果不要求返回字典形式的输出,构造并返回输出元组
            output = (sequence_output, pooled_output) + encoder_outputs[1:]

            return output

        # 如果需要返回字典形式的输出,构造并返回 SwinModelOutput 对象
        return SwinModelOutput(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
        )
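
下面给出调用 SwinModel 的一个最小示例(检查点 "microsoft/swin-tiny-patch4-window7-224" 与随机输入仅作示意,实际使用时应通过 AutoImageProcessor 处理真实图片):

import torch
from transformers import AutoImageProcessor, SwinModel

processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
model = SwinModel.from_pretrained("microsoft/swin-tiny-patch4-window7-224")

pixel_values = torch.randn(1, 3, 224, 224)  # 真实使用时应为 processor(images=image, return_tensors="pt").pixel_values
with torch.no_grad():
    outputs = model(pixel_values=pixel_values)

print(outputs.last_hidden_state.shape)  # (1, 49, 768):最后一个 stage 共 7x7=49 个位置,每个 768 维
print(outputs.pooler_output.shape)      # (1, 768):对序列维做自适应平均池化并展平后的结果
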
@add_start_docstrings(
    """
    Swin Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """,
    SWIN_START_DOCSTRING,
)
class SwinForMaskedImageModeling(SwinPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.swin = SwinModel(config, add_pooling_layer=False, use_mask_token=True)

        num_features = int(config.embed_dim * 2 ** (config.num_layers - 1))
        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=num_features, out_channels=config.encoder_stride**2 * config.num_channels, kernel_size=1
            ),
            nn.PixelShuffle(config.encoder_stride),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=SwinMaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SwinMaskedImageModelingOutput]:
        """
        Performs forward pass of the Swin model for masked image modeling.

        Args:
            pixel_values (torch.FloatTensor, optional): Tensor of pixel values of shape (batch_size, num_channels, height, width).
            bool_masked_pos (torch.BoolTensor, optional): Boolean mask indicating positions in pixel_values to be masked.
            head_mask (torch.FloatTensor, optional): Mask for attention heads.
            output_attentions (bool, optional): Whether to output attentions.
            output_hidden_states (bool, optional): Whether to output hidden states.
            return_dict (bool, optional): Whether to return a dictionary instead of a tuple.

        Returns:
            SwinMaskedImageModelingOutput: Output object containing model outputs.
        """
        # 前向传播的具体实现(重建被 bool_masked_pos 遮挡的 patch 并计算重建损失)在本文摘录中省略
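
bool_masked_pos 的形状是 (batch_size, num_patches),其中 num_patches = (image_size // patch_size) ** 2。下面是一个随机初始化模型、随机构造掩码的最小示例(掩码比例 0.6 等数值仅作假设),用来说明输入输出的形状:

import torch
from transformers import SwinConfig, SwinForMaskedImageModeling

config = SwinConfig()                       # 默认配置:image_size=224、patch_size=4、num_channels=3
model = SwinForMaskedImageModeling(config)  # 随机初始化,仅用于演示输入输出形状

num_patches = (config.image_size // config.patch_size) ** 2  # 56 * 56 = 3136
pixel_values = torch.randn(1, config.num_channels, config.image_size, config.image_size)
bool_masked_pos = torch.rand(1, num_patches) < 0.6            # 随机遮挡约 60% 的 patch

with torch.no_grad():
    outputs = model(pixel_values=pixel_values, bool_masked_pos=bool_masked_pos)
print(outputs.loss)                  # 被遮挡 patch 上的重建损失
print(outputs.reconstruction.shape)  # torch.Size([1, 3, 224, 224])
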

@add_start_docstrings(
    """
    Swin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    """,
    SWIN_START_DOCSTRING,
)
class SwinForImageClassification(SwinPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.swin = SwinModel(config)

        # Classifier head
        self.classifier = (
            nn.Linear(self.swin.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=SwinImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SwinImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # 确保 return_dict 变量不为 None
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用 Swin Transformer 模型进行前向传播
        outputs = self.swin(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 从模型输出中取得池化后的特征表示
        pooled_output = outputs[1]

        # 将池化后的特征表示传入分类器以得到 logits
        logits = self.classifier(pooled_output)

        # 初始化损失为 None
        loss = None
        if labels is not None:
            # 确定问题类型,如果尚未确定
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            # 根据问题类型计算损失
            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    # 对于单标签回归问题,使用均方误差损失函数
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    # 对于多标签回归问题,同样使用均方误差损失函数
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                # 对于单标签分类问题,使用交叉熵损失函数
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                # 对于多标签分类问题,使用带 Logits 的二元交叉熵损失函数
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        # 如果 return_dict 为 False,则返回不包含损失的输出元组
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        # 如果 return_dict 为 True,则返回一个 SwinImageClassifierOutput 对象
        return SwinImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )
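
labels 的 dtype 与 num_labels 共同决定 problem_type:long/int 且 num_labels > 1 时走单标签分类(CrossEntropyLoss),num_labels == 1 时走回归(MSELoss),否则走多标签分类(BCEWithLogitsLoss)。下面是单标签分类情形的最小示例(num_labels=5 等数值仅作假设):

import torch
from transformers import SwinConfig, SwinForImageClassification

config = SwinConfig(num_labels=5)
model = SwinForImageClassification(config)  # 随机初始化,仅用于演示

pixel_values = torch.randn(2, 3, 224, 224)
labels = torch.tensor([1, 3])  # dtype 为 long 且 num_labels > 1,因此走 single_label_classification

with torch.no_grad():
    outputs = model(pixel_values=pixel_values, labels=labels)
print(outputs.logits.shape)  # torch.Size([2, 5])
print(outputs.loss)          # 交叉熵损失(标量)
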
@add_start_docstrings(
    """
    Swin backbone, to be used with frameworks like DETR and MaskFormer.
    """,
    SWIN_START_DOCSTRING,
)
class SwinBackbone(SwinPreTrainedModel, BackboneMixin):
    """
    SwinTransformer的主干网络,可用于DETR和MaskFormer等框架。
    继承自SwinPreTrainedModel和BackboneMixin。
    """

    def __init__(self, config: SwinConfig):
        """
        初始化函数,接收一个SwinConfig类型的参数config。
        """
        super().__init__(config)
        # 初始化 BackboneMixin 提供的主干网络相关属性(stage_names、out_features 等)
        super()._init_backbone(config)

        # 计算每个阶段的特征维度
        self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]
        
        # 创建SwinEmbeddings对象
        self.embeddings = SwinEmbeddings(config)
        
        # 创建SwinEncoder对象,使用patch_grid参数
        self.encoder = SwinEncoder(config, self.embeddings.patch_grid)

        # 为输出特征的隐藏状态添加层归一化层
        hidden_states_norms = {}
        for stage, num_channels in zip(self._out_features, self.channels):
            hidden_states_norms[stage] = nn.LayerNorm(num_channels)
        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)

        # 初始化权重并应用最终处理
        self.post_init()

    def get_input_embeddings(self):
        """
        获取输入嵌入的patch_embeddings。
        """
        return self.embeddings.patch_embeddings

    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        返回BackboneOutput对象。

        返回:
            返回BackboneOutput对象,其中包含特征图、隐藏状态和注意力分数(如果有的话)。

        Examples:
        示例代码块,展示了如何使用该函数从图像提取特征图。
        """

        # 确定是否返回字典形式的结果,如果未指定,则使用配置中的默认设置
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 确定是否返回隐藏状态,如果未指定,则使用配置中的默认设置
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 确定是否返回注意力分数,如果未指定,则使用配置中的默认设置
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        # 使用输入的像素值进行嵌入处理,获取嵌入输出和输入维度信息
        embedding_output, input_dimensions = self.embeddings(pixel_values)

        # 使用编码器对嵌入输出进行编码,获取编码器的输出
        outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=None,
            output_attentions=output_attentions,
            output_hidden_states=True,
            output_hidden_states_before_downsampling=True,
            always_partition=True,
            return_dict=True,
        )

        # 获取重塑后的隐藏状态
        hidden_states = outputs.reshaped_hidden_states

        # 初始化特征图空元组
        feature_maps = ()
        # 遍历阶段名称和对应的隐藏状态,生成特征图
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                # 获取隐藏状态的形状信息
                batch_size, num_channels, height, width = hidden_state.shape
                # 重新排列维度,便于后续处理
                hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
                # 调整形状以应用规范化
                hidden_state = hidden_state.view(batch_size, height * width, num_channels)
                hidden_state = self.hidden_states_norms[stage](hidden_state)
                hidden_state = hidden_state.view(batch_size, height, width, num_channels)
                hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
                # 将处理后的隐藏状态添加到特征图中
                feature_maps += (hidden_state,)

        # 如果不返回字典形式的结果,则返回特征图和可能的其他隐藏状态
        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (outputs.hidden_states,)
            return output

        # 返回BackboneOutput对象,包括特征图、隐藏状态和注意力分数(如果有的话)
        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )
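
SwinBackbone 会按 out_features 指定的阶段返回多尺度特征图(返回的是各 stage 下采样之前的隐藏状态)。下面是一个最小示例(out_features 取全部四个 stage,配置数值仅作假设):

import torch
from transformers import SwinConfig, SwinBackbone

config = SwinConfig(out_features=["stage1", "stage2", "stage3", "stage4"])
backbone = SwinBackbone(config)  # 随机初始化,仅用于演示形状

pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    outputs = backbone(pixel_values)

for name, feature_map in zip(config.out_features, outputs.feature_maps):
    print(name, feature_map.shape)
# stage1 torch.Size([1, 96, 56, 56])
# stage2 torch.Size([1, 192, 28, 28])
# stage3 torch.Size([1, 384, 14, 14])
# stage4 torch.Size([1, 768, 7, 7])
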