Transformers Source Code Analysis (10)

.\models\albert\tokenization_albert_fast.py

# coding=utf-8
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization classes for ALBERT model."""


import os
from shutil import copyfile
from typing import List, Optional, Tuple

from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging


if is_sentencepiece_available():
    # 如果存在 sentencepiece 库,导入 AlbertTokenizer
    from .tokenization_albert import AlbertTokenizer
else:
    AlbertTokenizer = None

# 获取日志记录器
logger = logging.get_logger(__name__)
# 定义词汇文件的名称映射
VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}

# 预训练模型的词汇文件映射
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "albert/albert-base-v1": "https://huggingface.co/albert/albert-base-v1/resolve/main/spiece.model",
        "albert/albert-large-v1": "https://huggingface.co/albert/albert-large-v1/resolve/main/spiece.model",
        "albert/albert-xlarge-v1": "https://huggingface.co/albert/albert-xlarge-v1/resolve/main/spiece.model",
        "albert/albert-xxlarge-v1": "https://huggingface.co/albert/albert-xxlarge-v1/resolve/main/spiece.model",
        "albert/albert-base-v2": "https://huggingface.co/albert/albert-base-v2/resolve/main/spiece.model",
        "albert/albert-large-v2": "https://huggingface.co/albert/albert-large-v2/resolve/main/spiece.model",
        "albert/albert-xlarge-v2": "https://huggingface.co/albert/albert-xlarge-v2/resolve/main/spiece.model",
        "albert/albert-xxlarge-v2": "https://huggingface.co/albert/albert-xxlarge-v2/resolve/main/spiece.model",
    },
    # tokenizer_file 对于所有预训练模型都是 tokenizer.json
    "tokenizer_file": {
        "albert/albert-base-v1": "https://huggingface.co/albert/albert-base-v1/resolve/main/tokenizer.json",
        "albert/albert-large-v1": "https://huggingface.co/albert/albert-large-v1/resolve/main/tokenizer.json",
        "albert/albert-xlarge-v1": "https://huggingface.co/albert/albert-xlarge-v1/resolve/main/tokenizer.json",
        "albert/albert-xxlarge-v1": "https://huggingface.co/albert/albert-xxlarge-v1/resolve/main/tokenizer.json",
        "albert/albert-base-v2": "https://huggingface.co/albert/albert-base-v2/resolve/main/tokenizer.json",
        "albert/albert-large-v2": "https://huggingface.co/albert/albert-large-v2/resolve/main/tokenizer.json",
        "albert/albert-xlarge-v2": "https://huggingface.co/albert/albert-xlarge-v2/resolve/main/tokenizer.json",
        "albert/albert-xxlarge-v2": "https://huggingface.co/albert/albert-xxlarge-v2/resolve/main/tokenizer.json",
    },
}

# Mapping from pretrained model name to its maximum positional-embedding size (512 for every ALBERT variant)
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "albert/albert-base-v1": 512,
    "albert/albert-large-v1": 512,
    "albert/albert-xlarge-v1": 512,
    "albert/albert-xxlarge-v1": 512,
    "albert/albert-base-v2": 512,
    "albert/albert-large-v2": 512,
    "albert/albert-xlarge-v2": 512,
    "albert/albert-xxlarge-v2": 512,
}

# SentencePiece marks tokens that start a new word with this underline character
SPIECE_UNDERLINE = "▁"

class AlbertTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" ALBERT tokenizer (backed by HuggingFace's *tokenizers* library), based on Unigram. This
    tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        remove_space (`bool`, *optional*, defaults to `True`):
            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
        keep_accents (`bool`, *optional*, defaults to `False`):
            Whether or not to keep accents when tokenizing.
        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
            The beginning of sequence token that was used during pretraining. When building a sequence using special
            tokens, the token that is actually used is the `cls_token`.
        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
            The end of sequence token. When building a sequence using special tokens, the token that is actually used
            is the `sep_token`.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering. It is also used as the
            last token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole
            sequence instead of per-token classification). It is the first token of the sequence when built with
            special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    # 从全局变量中获取常量
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = AlbertTokenizer
    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=True,
        remove_space=True,
        keep_accents=False,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="<unk>",
        sep_token="[SEP]",
        pad_token="<pad>",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ):
        # 初始化函数,用于实例化对象并设置初始属性值
        # 设置 mask_token,使其表现得像一个普通单词,包括前面的空格,并且在原始文本中也包含,应该在非规范化的句子中匹配。
        mask_token = (
            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
            if isinstance(mask_token, str)
            else mask_token
        )

        # 调用父类的初始化函数,传递相应的参数
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        # 设置对象的属性值
        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        # 检查是否可以保存缓慢的分词器,基于是否存在 vocab_file 文件
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        从一个序列或一对序列构建模型输入,用于序列分类任务,通过连接和添加特殊标记。
        ALBERT 序列的格式如下:

        - 单个序列: `[CLS] X [SEP]`
        - 序列对: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                要添加特殊标记的 ID 列表
            token_ids_1 (`List[int]`, *可选*):
                第二个序列的 ID 列表,用于序列对

        Returns:
            `List[int]`: 带有适当特殊标记的输入 ID 列表
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep
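
    # Illustration of the two patterns above, using hypothetical special-token ids (cls_token_id == 2,
    # sep_token_id == 3):
    #   build_inputs_with_special_tokens([10, 11])        -> [2, 10, 11, 3]
    #   build_inputs_with_special_tokens([10, 11], [20])  -> [2, 10, 11, 3, 20, 3]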

    # Build token type IDs that distinguish the first and (optional) second sequence
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        # Define separator and class tokens
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # Check if token_ids_1 is None; return mask with only the first portion
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        
        # Return full mask including both sequences
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
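
    # Worked example of the mask arithmetic above, with hypothetical ids cls == [2], sep == [3],
    # token_ids_0 == [10, 11, 12] and token_ids_1 == [20, 21]:
    #   len([2, 10, 11, 12, 3]) * [0] + len([20, 21, 3]) * [1] == [0, 0, 0, 0, 0, 1, 1, 1]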

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # Check if the fast tokenizer can save vocabulary; raise error if not
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        # Check if save_directory exists; log error and return if not
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        
        # Define output vocabulary file path
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Copy vocabulary file if it's not already in the specified output path
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        # Return the path of the saved vocabulary file
        return (out_vocab_file,)
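
# Usage sketch for the class above (a standalone snippet, not part of this module; assumes the
# "albert/albert-base-v2" checkpoint can be downloaded): encoding a sentence pair exercises
# build_inputs_with_special_tokens and create_token_type_ids_from_sequences, and save_pretrained
# calls save_vocabulary when the SentencePiece file is available.
if __name__ == "__main__":
    from transformers import AlbertTokenizerFast

    tokenizer = AlbertTokenizerFast.from_pretrained("albert/albert-base-v2")
    encoded = tokenizer("hello world", "how are you")
    print(encoded["input_ids"])       # [CLS] A [SEP] B [SEP], as token ids
    print(encoded["token_type_ids"])  # 0s for the first segment, 1s for the second
    if tokenizer.can_save_slow_tokenizer:
        tokenizer.save_pretrained("./albert-tokenizer")  # writes tokenizer.json and spiece.model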

.\models\albert\__init__.py

# 导入所需的类型检查模块
from typing import TYPE_CHECKING

# 导入必要的依赖项和模块
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_sentencepiece_available,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# 定义模块的导入结构字典
_import_structure = {
    "configuration_albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig", "AlbertOnnxConfig"],
}

# 检查是否安装了 sentencepiece,若未安装则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_sentencepiece_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果安装了 sentencepiece,则将 AlbertTokenizer 添加到导入结构中
    _import_structure["tokenization_albert"] = ["AlbertTokenizer"]

# 检查是否安装了 tokenizers,若未安装则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果安装了 tokenizers,则将 AlbertTokenizerFast 添加到导入结构中
    _import_structure["tokenization_albert_fast"] = ["AlbertTokenizerFast"]

# 检查是否安装了 torch,若未安装则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果安装了 torch,则将 Albert 相关模块添加到导入结构中
    _import_structure["modeling_albert"] = [
        "ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "AlbertForMaskedLM",
        "AlbertForMultipleChoice",
        "AlbertForPreTraining",
        "AlbertForQuestionAnswering",
        "AlbertForSequenceClassification",
        "AlbertForTokenClassification",
        "AlbertModel",
        "AlbertPreTrainedModel",
        "load_tf_weights_in_albert",
    ]

# 检查是否安装了 TensorFlow,若未安装则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果安装了 TensorFlow,则将 TFAlbert 相关模块添加到导入结构中
    _import_structure["modeling_tf_albert"] = [
        "TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFAlbertForMaskedLM",
        "TFAlbertForMultipleChoice",
        "TFAlbertForPreTraining",
        "TFAlbertForQuestionAnswering",
        "TFAlbertForSequenceClassification",
        "TFAlbertForTokenClassification",
        "TFAlbertMainLayer",
        "TFAlbertModel",
        "TFAlbertPreTrainedModel",
    ]

# 检查是否安装了 Flax,若未安装则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If Flax is available, add the FlaxAlbert modules to the import structure
    _import_structure["modeling_flax_albert"] = [
        "FlaxAlbertForMaskedLM",                    # 添加FlaxAlbertForMaskedLM模块名
        "FlaxAlbertForMultipleChoice",              # 添加FlaxAlbertForMultipleChoice模块名
        "FlaxAlbertForPreTraining",                 # 添加FlaxAlbertForPreTraining模块名
        "FlaxAlbertForQuestionAnswering",           # 添加FlaxAlbertForQuestionAnswering模块名
        "FlaxAlbertForSequenceClassification",      # 添加FlaxAlbertForSequenceClassification模块名
        "FlaxAlbertForTokenClassification",         # 添加FlaxAlbertForTokenClassification模块名
        "FlaxAlbertModel",                          # 添加FlaxAlbertModel模块名
        "FlaxAlbertPreTrainedModel",                # 添加FlaxAlbertPreTrainedModel模块名
    ]
# 如果 TYPE_CHECKING 为真,则导入以下模块和类
if TYPE_CHECKING:
    # 导入 ALBERT 相关的配置映射和配置类
    from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig, AlbertOnnxConfig
    
    # 尝试检查是否安装了 sentencepiece,若未安装则抛出异常 OptionalDependencyNotAvailable
    try:
        if not is_sentencepiece_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 若安装了 sentencepiece,则导入 AlbertTokenizer
        from .tokenization_albert import AlbertTokenizer
    
    # 尝试检查是否安装了 tokenizers,若未安装则抛出异常 OptionalDependencyNotAvailable
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 若安装了 tokenizers,则导入 AlbertTokenizerFast
        from .tokenization_albert_fast import AlbertTokenizerFast
    
    # 尝试检查是否安装了 torch,若未安装则抛出异常 OptionalDependencyNotAvailable
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 若安装了 torch,则导入以下 Albert 相关模块和类
        from .modeling_albert import (
            ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            AlbertForMaskedLM,
            AlbertForMultipleChoice,
            AlbertForPreTraining,
            AlbertForQuestionAnswering,
            AlbertForSequenceClassification,
            AlbertForTokenClassification,
            AlbertModel,
            AlbertPreTrainedModel,
            load_tf_weights_in_albert,
        )
    
    # 尝试检查是否安装了 tensorflow,若未安装则抛出异常 OptionalDependencyNotAvailable
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 若安装了 tensorflow,则导入以下 TFAlbert 相关模块和类
        from .modeling_tf_albert import (
            TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFAlbertForMaskedLM,
            TFAlbertForMultipleChoice,
            TFAlbertForPreTraining,
            TFAlbertForQuestionAnswering,
            TFAlbertForSequenceClassification,
            TFAlbertForTokenClassification,
            TFAlbertMainLayer,
            TFAlbertModel,
            TFAlbertPreTrainedModel,
        )
    
    # 尝试检查是否安装了 flax,若未安装则抛出异常 OptionalDependencyNotAvailable
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 若安装了 flax,则导入以下 FlaxAlbert 相关模块和类
        from .modeling_flax_albert import (
            FlaxAlbertForMaskedLM,
            FlaxAlbertForMultipleChoice,
            FlaxAlbertForPreTraining,
            FlaxAlbertForQuestionAnswering,
            FlaxAlbertForSequenceClassification,
            FlaxAlbertForTokenClassification,
            FlaxAlbertModel,
            FlaxAlbertPreTrainedModel,
        )
# 如果 TYPE_CHECKING 为假,则导入 sys 模块,并将当前模块设为懒加载模块
else:
    import sys
    
    # 使用 _LazyModule 类将当前模块设置为懒加载模块
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
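
# Usage sketch of the lazy-import mechanism above (a standalone snippet; assumes torch is installed for
# AlbertModel): names listed in _import_structure are only materialized on first attribute access, so
# importing the package stays cheap until a concrete class is requested.
if __name__ == "__main__":
    from transformers.models import albert

    config = albert.AlbertConfig()      # triggers the lazy import of configuration_albert
    model = albert.AlbertModel(config)  # triggers the lazy import of modeling_albert
    print(type(model).__name__)         # "AlbertModel"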

.\models\align\configuration_align.py

# coding=utf-8
# 声明文件的编码格式为UTF-8

# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
# 版权声明,保留所有权利

# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache 许可证版本 2.0 许可,除非符合许可协议要求,否则禁止使用该文件
# You may obtain a copy of the License at
# 您可以在以下网址获取许可证副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# 除非适用法律要求或书面同意,本软件按"原样"提供,不提供任何形式的明示或暗示保证或条件。
# 请参阅许可证了解更多信息。

""" ALIGN model configuration"""
# 模型配置的文档字符串注释

import os
# 导入标准库os

from typing import TYPE_CHECKING, List, Union
# 导入类型提示相关模块

if TYPE_CHECKING:
    pass
# 如果在类型检查环境下,不执行任何操作

from ...configuration_utils import PretrainedConfig
# 导入预训练配置工具中的PretrainedConfig类

from ...utils import logging
# 导入工具包中的logging模块

logger = logging.get_logger(__name__)
# 获取当前模块的日志记录器对象

ALIGN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "kakaobrain/align-base": "https://huggingface.co/kakaobrain/align-base/resolve/main/config.json",
}
# 预训练模型名称到配置文件地址的映射字典

class AlignTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`AlignTextModel`]. It is used to instantiate a
    ALIGN text encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the text encoder of the ALIGN
    [kakaobrain/align-base](https://huggingface.co/kakaobrain/align-base) architecture. The default values here are
    copied from BERT.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```
    >>> from transformers import AlignTextConfig, AlignTextModel

    >>> # Initializing a AlignTextConfig with kakaobrain/align-base style configuration
    >>> configuration = AlignTextConfig()

    >>> # Initializing a AlignTextModel (with random weights) from the kakaobrain/align-base style configuration
    >>> model = AlignTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """
    # AlignTextConfig类的文档字符串注释,描述了如何配置和使用该类

    model_type = "align_text_model"
    # 模型类型为align_text_model

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        use_cache=True,
        **kwargs,
    ):
        # Constructor: store the configuration parameters on the instance
        super().__init__(**kwargs)
        # 调用父类的初始化方法,传入所有的关键字参数

        self.vocab_size = vocab_size
        # 设定词汇表大小

        self.hidden_size = hidden_size
        # 设定隐藏层大小

        self.num_hidden_layers = num_hidden_layers
        # 设定隐藏层数量

        self.num_attention_heads = num_attention_heads
        # 设定注意力头的数量

        self.hidden_act = hidden_act
        # 设定隐藏层激活函数

        self.intermediate_size = intermediate_size
        # 设定中间层大小

        self.hidden_dropout_prob = hidden_dropout_prob
        # 设定隐藏层的Dropout概率

        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        # 设定注意力概率的Dropout概率

        self.max_position_embeddings = max_position_embeddings
        # 设定最大位置嵌入数量

        self.type_vocab_size = type_vocab_size
        # 设定类型词汇表大小

        self.initializer_range = initializer_range
        # 设定初始化范围

        self.layer_norm_eps = layer_norm_eps
        # 设定LayerNormalization的epsilon值

        self.position_embedding_type = position_embedding_type
        # 设定位置嵌入类型

        self.use_cache = use_cache
        # 设定是否使用缓存

        self.pad_token_id = pad_token_id
        # 设定填充token的ID

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)
        # 在关键字参数中设置token

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
        # 获取预训练模型的配置字典和更新后的关键字参数

        # 如果从AlignConfig加载,则获取文本配置字典
        if config_dict.get("model_type") == "align":
            config_dict = config_dict["text_config"]
        # 如果配置字典中包含"model_type"且类中有"model_type"属性,并且它们不相等,发出警告
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
        # 使用配置字典和关键字参数构建并返回预训练配置类实例
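
# Usage sketch for the from_pretrained override above (a standalone snippet; assumes the
# "kakaobrain/align-base" config can be downloaded): loading from a full ALIGN checkpoint picks out the
# nested "text_config" section, so the result is an AlignTextConfig rather than an AlignConfig.
if __name__ == "__main__":
    text_config = AlignTextConfig.from_pretrained("kakaobrain/align-base")
    print(text_config.model_type)   # "align_text_model"
    print(text_config.hidden_size)  # hidden size of the BERT-style text encoder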
# 定义一个配置类 `AlignVisionConfig`,继承自 `PretrainedConfig`,用于存储 [`AlignVisionModel`] 的配置信息。
# 该类用于实例化一个 ALIGN 视觉编码器,根据指定的参数定义模型架构。
# 实例化一个带有默认值的配置将产生与 ALIGN 架构的视觉编码器类似的配置。
# 默认值来自 EfficientNet (efficientnet-b7)。
# 
# 配置对象继承自 [`PretrainedConfig`],可用于控制模型的输出。更多信息请参阅 [`PretrainedConfig`] 的文档。
class AlignVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`AlignVisionModel`]. It is used to instantiate a
    ALIGN vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the ALIGN
    [kakaobrain/align-base](https://huggingface.co/kakaobrain/align-base) architecture. The default values are copied
    from EfficientNet (efficientnet-b7)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 600):
            The input image size.
        width_coefficient (`float`, *optional*, defaults to 2.0):
            Scaling coefficient for network width at each stage.
        depth_coefficient (`float`, *optional*, defaults to 3.1):
            Scaling coefficient for network depth at each stage.
        depth_divisor (`int`, *optional*, defaults to 8):
            A unit of network width.
        kernel_sizes (`List[int]`, *optional*, defaults to `[3, 3, 5, 3, 5, 5, 3]`):
            List of kernel sizes to be used in each block.
        in_channels (`List[int]`, *optional*, defaults to `[32, 16, 24, 40, 80, 112, 192]`):
            List of input channel sizes to be used in each block for convolutional layers.
        out_channels (`List[int]`, *optional*, defaults to `[16, 24, 40, 80, 112, 192, 320]`):
            List of output channel sizes to be used in each block for convolutional layers.
        depthwise_padding (`List[int]`, *optional*, defaults to `[]`):
            List of block indices with square padding.
        strides (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
            List of stride sizes to be used in each block for convolutional layers.
        num_block_repeats (`List[int]`, *optional*, defaults to `[1, 2, 2, 3, 3, 4, 1]`):
            List of the number of times each block is to be repeated.
        expand_ratios (`List[int]`, *optional*, defaults to `[1, 6, 6, 6, 6, 6, 6]`):
            List of scaling coefficients for each block.
        squeeze_expansion_ratio (`float`, *optional*, defaults to 0.25):
            Squeeze expansion ratio.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in each block. Supported options are
            `"gelu"`, `"relu"`, `"selu"`, `"gelu_new"`, `"silu"`, and `"mish"`.
        hidden_dim (`int`, *optional*, defaults to 1280):
            The hidden dimension of the layer before the classification head.
        pooling_type (`str` or `function`, *optional*, defaults to `"mean"`):
            Type of final pooling to be applied before the dense classification head. Options are `"mean"` or `"max"`.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing weight matrices.
        batch_norm_eps (`float`, *optional*, defaults to 1e-3):
            Epsilon value used by batch normalization layers.
        batch_norm_momentum (`float`, *optional*, defaults to 0.99):
            Momentum value used by batch normalization layers.
        drop_connect_rate (`float`, *optional*, defaults to 0.2):
            The drop rate for skip connections.

    Example:

    ```
    >>> from transformers import AlignVisionConfig, AlignVisionModel

    >>> # 使用 kakaobrain/align-base 风格的配置初始化 AlignVisionConfig
    >>> configuration = AlignVisionConfig()

    >>> # 使用 kakaobrain/align-base 风格的配置初始化一个带有随机权重的 AlignVisionModel
    >>> model = AlignVisionModel(configuration)

    >>> # 访问模型的配置信息
    >>> configuration = model.config
    ```"""

    model_type = "align_vision_model"

    def __init__(
        self,
        num_channels: int = 3,
        image_size: int = 600,
        width_coefficient: float = 2.0,
        depth_coefficient: float = 3.1,
        depth_divisor: int = 8,
        kernel_sizes: List[int] = [3, 3, 5, 3, 5, 5, 3],
        in_channels: List[int] = [32, 16, 24, 40, 80, 112, 192],
        out_channels: List[int] = [16, 24, 40, 80, 112, 192, 320],
        depthwise_padding: List[int] = [],
        strides: List[int] = [1, 2, 2, 2, 1, 2, 1],
        num_block_repeats: List[int] = [1, 2, 2, 3, 3, 4, 1],
        expand_ratios: List[int] = [1, 6, 6, 6, 6, 6, 6],
        squeeze_expansion_ratio: float = 0.25,
        hidden_act: str = "swish",
        hidden_dim: int = 2560,
        pooling_type: str = "mean",
        initializer_range: float = 0.02,
        batch_norm_eps: float = 0.001,
        batch_norm_momentum: float = 0.99,
        drop_connect_rate: float = 0.2,
        **kwargs,
    ):
        # 调用父类的初始化方法
        super().__init__(**kwargs)

        # 初始化模型的各种参数
        self.num_channels = num_channels
        self.image_size = image_size
        self.width_coefficient = width_coefficient
        self.depth_coefficient = depth_coefficient
        self.depth_divisor = depth_divisor
        self.kernel_sizes = kernel_sizes
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.depthwise_padding = depthwise_padding
        self.strides = strides
        self.num_block_repeats = num_block_repeats
        self.expand_ratios = expand_ratios
        self.squeeze_expansion_ratio = squeeze_expansion_ratio
        self.hidden_act = hidden_act
        self.hidden_dim = hidden_dim
        self.pooling_type = pooling_type
        self.initializer_range = initializer_range
        self.batch_norm_eps = batch_norm_eps
        self.batch_norm_momentum = batch_norm_momentum
        self.drop_connect_rate = drop_connect_rate
        self.num_hidden_layers = sum(num_block_repeats) * 4
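
        # With the default num_block_repeats [1, 2, 2, 3, 3, 4, 1]: sum(...) == 16, so num_hidden_layers == 64.
        # The factor of 4 reflects the four sub-layers of each repeated block (expansion, depthwise
        # convolution, squeeze-and-excite, projection).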

    @classmethod
    # 类方法,用于从预训练模型名称或路径加载预训练配置,返回一个预训练配置对象
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        # 调用内部方法,将token设置到kwargs中
        cls._set_token_in_kwargs(kwargs)

        # 调用类方法获取配置字典和更新后的kwargs
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # 如果配置字典中的模型类型为"align",则从中获取视觉配置字典
        if config_dict.get("model_type") == "align":
            config_dict = config_dict["vision_config"]

        # 如果配置字典中有"model_type"键且当前类有"model_type"属性,并且二者不相等,发出警告
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        # 调用类方法,根据配置字典创建预训练配置对象并返回
        return cls.from_dict(config_dict, **kwargs)
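
# Analogous to AlignTextConfig.from_pretrained above: AlignVisionConfig.from_pretrained("kakaobrain/align-base")
# picks out the nested "vision_config" section of the full ALIGN configuration.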
# `AlignConfig` 是用来存储 [`AlignModel`] 的配置信息的类。
class AlignConfig(PretrainedConfig):
    r"""
    [`AlignConfig`] is the configuration class to store the configuration of a [`AlignModel`]. It is used to
    instantiate a ALIGN model according to the specified arguments, defining the text model and vision model configs.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the ALIGN
    [kakaobrain/align-base](https://huggingface.co/kakaobrain/align-base) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`AlignTextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`AlignVisionConfig`].
        projection_dim (`int`, *optional*, defaults to 640):
            Dimentionality of text and vision projection layers.
        temperature_init_value (`float`, *optional*, defaults to 1.0):
            The inital value of the *temperature* paramter. Default is used as per the original ALIGN implementation.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```
    >>> from transformers import AlignConfig, AlignModel

    >>> # Initializing a AlignConfig with kakaobrain/align-base style configuration
    >>> configuration = AlignConfig()

    >>> # Initializing a AlignModel (with random weights) from the kakaobrain/align-base style configuration
    >>> model = AlignModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a AlignConfig from a AlignTextConfig and a AlignVisionConfig
    >>> from transformers import AlignTextConfig, AlignVisionConfig

    >>> # Initializing ALIGN Text and Vision configurations
    >>> config_text = AlignTextConfig()
    >>> config_vision = AlignVisionConfig()

    >>> config = AlignConfig.from_text_vision_configs(config_text, config_vision)
    ```"""
    
    # 类属性,标识模型类型为 "align"
    model_type = "align"

    # 初始化方法,接收多个参数来配置模型
    def __init__(
        self,
        text_config=None,               # 文本配置的字典,用于初始化 `AlignTextConfig`
        vision_config=None,             # 视觉配置的字典,用于初始化 `AlignVisionConfig`
        projection_dim=640,             # 文本和视觉投影层的维度
        temperature_init_value=1.0,     # 温度参数的初始值,默认为 1.0
        initializer_range=0.02,         # 所有权重矩阵初始化的截断正态分布标准差
        **kwargs,                       # 其他关键字参数
    ):
        super().__init__(**kwargs)
        # 调用父类的初始化方法,传入所有的关键字参数

        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the AlignTextConfig with default values.")
            # 如果文本配置为空,使用空字典,并记录日志,使用默认值初始化 AlignTextConfig

        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. Initializing the AlignVisionConfig with default values.")
            # 如果视觉配置为空,使用空字典,并记录日志,使用默认值初始化 AlignVisionConfig

        self.text_config = AlignTextConfig(**text_config)
        # 使用传入的文本配置参数初始化 AlignTextConfig 对象,并赋值给实例变量 self.text_config

        self.vision_config = AlignVisionConfig(**vision_config)
        # 使用传入的视觉配置参数初始化 AlignVisionConfig 对象,并赋值给实例变量 self.vision_config

        self.projection_dim = projection_dim
        # 将传入的 projection_dim 参数赋值给实例变量 self.projection_dim

        self.temperature_init_value = temperature_init_value
        # 将传入的 temperature_init_value 参数赋值给实例变量 self.temperature_init_value

        self.initializer_range = initializer_range
        # 将传入的 initializer_range 参数赋值给实例变量 self.initializer_range

    @classmethod
    def from_text_vision_configs(cls, text_config: AlignTextConfig, vision_config: AlignVisionConfig, **kwargs):
        r"""
        Instantiate a [`AlignConfig`] (or a derived class) from align text model configuration and align vision model
        configuration.

        Returns:
            [`AlignConfig`]: An instance of a configuration object
        """
        # 类方法:根据文本和视觉配置实例化一个 AlignConfig(或其派生类)对象

        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
        # 返回使用文本和视觉配置对象的字典形式初始化的 AlignConfig 对象,同时传入其他关键字参数

.\models\align\convert_align_tf_to_hf.py

# 设置文件编码为 UTF-8
# 版权声明,指明版权归 HuggingFace Inc. 团队所有,使用 Apache License, Version 2.0 许可
# 详细许可信息可在 http://www.apache.org/licenses/LICENSE-2.0 获取
# 根据适用法律和书面同意,在未经许可的情况下不得使用此文件
"""从原始存储库转换 ALIGN 检查点。"""

# 导入所需的库和模块
import argparse  # 用于解析命令行参数
import os  # 用于操作系统相关功能

import align  # 导入 align 模块
import numpy as np  # 用于数值计算
import requests  # 用于发出 HTTP 请求
import tensorflow as tf  # TensorFlow 深度学习框架
import torch  # PyTorch 深度学习框架
from PIL import Image  # Python Imaging Library,用于图像处理
from tokenizer import Tokenizer  # 导入自定义的 Tokenizer

# 导入 transformers 库的相关模块和函数
from transformers import (
    AlignConfig,  # ALIGN 模型的配置类
    AlignModel,  # ALIGN 模型类
    AlignProcessor,  # ALIGN 模型的处理器类
    BertConfig,  # BERT 模型的配置类
    BertTokenizer,  # BERT 模型的分词器类
    EfficientNetConfig,  # EfficientNet 模型的配置类
    EfficientNetImageProcessor,  # EfficientNet 图像处理器类
)

from transformers.utils import logging  # 导入 transformers 库的日志记录功能

# 设置日志记录的详细程度为 info
logging.set_verbosity_info()
# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)


# 对输入图像进行预处理,将其调整大小并进行裁剪
def preprocess(image):
    image = tf.image.resize(image, (346, 346))  # 调整图像大小为 346x346 像素
    image = tf.image.crop_to_bounding_box(image, (346 - 289) // 2, (346 - 289) // 2, 289, 289)
    return image
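
# In preprocess() above, (346 - 289) // 2 == 28, so the 346x346 resized image is center-cropped to the
# 289x289 window expected by the converted model (matching image_size = 289 in get_align_config below).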


# 获取 ALIGN 模型的配置
def get_align_config():
    # 使用预训练的 EfficientNet-B7 配置
    vision_config = EfficientNetConfig.from_pretrained("google/efficientnet-b7")
    vision_config.image_size = 289  # 设置图像输入大小为 289 像素
    vision_config.hidden_dim = 640  # 设置隐藏层维度为 640
    vision_config.id2label = {"0": "LABEL_0", "1": "LABEL_1"}  # 标签映射字典
    vision_config.label2id = {"LABEL_0": 0, "LABEL_1": 1}  # 反向标签映射字典
    vision_config.depthwise_padding = []  # 深度可分卷积填充方式为空列表

    text_config = BertConfig()  # 使用 BERT 配置
    # 根据文本和视觉配置创建 ALIGN 模型的配置对象,投影维度为 640
    config = AlignConfig.from_text_vision_configs(
        text_config=text_config, vision_config=vision_config, projection_dim=640
    )
    return config


# 准备图像数据,使用 COCO 数据集中的一张图像
def prepare_img():
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    im = Image.open(requests.get(url, stream=True).raw)  # 从 URL 打开图像文件
    return im


# 获取处理器,包括图像处理器和分词器
def get_processor():
    # 使用 EfficientNet 图像处理器进行图像预处理
    image_processor = EfficientNetImageProcessor(
        do_center_crop=True,
        rescale_factor=1 / 127.5,
        rescale_offset=True,
        do_normalize=False,
        include_top=False,
        resample=Image.BILINEAR,
    )
    tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")  # 使用 BERT 分词器
    tokenizer.model_max_length = 64  # 设置最大模型长度为 64
    # 创建 ALIGN 模型处理器,传入图像处理器和分词器
    processor = AlignProcessor(image_processor=image_processor, tokenizer=tokenizer)
    return processor


# 列出需要重命名的所有键(左边是原始名称,右边是我们的名称)
def rename_keys(original_param_names):
    # 获取 EfficientNet 图像编码器的块名称列表
    block_names = [v.split("_")[0].split("block")[1] for v in original_param_names if v.startswith("block")]
    block_names = list(set(block_names))  # 去重
    block_names = sorted(block_names)  # 排序
    num_blocks = len(block_names)  # 获取块的数量
    # 创建块名称到序号的映射字典
    block_name_mapping = {b: str(i) for b, i in zip(block_names, range(num_blocks))}
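
    # Illustrative mapping (block names shortened; EfficientNet-B7 has many more blocks):
    #   ["block1a_...", "block2a_...", "block2b_..."] -> block_names == ["1a", "2a", "2b"]
    #   -> block_name_mapping == {"1a": "0", "2a": "1", "2b": "2"}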
    # 创建一个空列表,用于存储需要重命名的键值对元组
    rename_keys = []
    # 添加元组到列表,将指定的模型权重名称映射到新的命名结构
    rename_keys.append(("stem_conv/kernel:0", "embeddings.convolution.weight"))
    rename_keys.append(("stem_bn/gamma:0", "embeddings.batchnorm.weight"))
    rename_keys.append(("stem_bn/beta:0", "embeddings.batchnorm.bias"))
    rename_keys.append(("stem_bn/moving_mean:0", "embeddings.batchnorm.running_mean"))
    rename_keys.append(("stem_bn/moving_variance:0", "embeddings.batchnorm.running_var"))

    # 遍历block_names列表中的每个元素,生成重命名后的键值对元组
    for b in block_names:
        hf_b = block_name_mapping[b]
        rename_keys.append((f"block{b}_expand_conv/kernel:0", f"encoder.blocks.{hf_b}.expansion.expand_conv.weight"))
        rename_keys.append((f"block{b}_expand_bn/gamma:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.weight"))
        rename_keys.append((f"block{b}_expand_bn/beta:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.bias"))
        rename_keys.append(
            (f"block{b}_expand_bn/moving_mean:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_mean")
        )
        rename_keys.append(
            (f"block{b}_expand_bn/moving_variance:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_var")
        )
        rename_keys.append(
            (f"block{b}_dwconv/depthwise_kernel:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_conv.weight")
        )
        rename_keys.append((f"block{b}_bn/gamma:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.weight"))
        rename_keys.append((f"block{b}_bn/beta:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.bias"))
        rename_keys.append(
            (f"block{b}_bn/moving_mean:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_mean")
        )
        rename_keys.append(
            (f"block{b}_bn/moving_variance:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_var")
        )

        rename_keys.append((f"block{b}_se_reduce/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.weight"))
        rename_keys.append((f"block{b}_se_reduce/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.bias"))
        rename_keys.append((f"block{b}_se_expand/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.weight"))
        rename_keys.append((f"block{b}_se_expand/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.bias"))
        rename_keys.append(
            (f"block{b}_project_conv/kernel:0", f"encoder.blocks.{hf_b}.projection.project_conv.weight")
        )
        rename_keys.append((f"block{b}_project_bn/gamma:0", f"encoder.blocks.{hf_b}.projection.project_bn.weight"))
        rename_keys.append((f"block{b}_project_bn/beta:0", f"encoder.blocks.{hf_b}.projection.project_bn.bias"))
        rename_keys.append(
            (f"block{b}_project_bn/moving_mean:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_mean")
        )
        rename_keys.append(
            (f"block{b}_project_bn/moving_variance:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_var")
        )

    # 创建一个空字典,用于存储键名的映射关系
    key_mapping = {}
    # 遍历重命名键列表中的每个项
    for item in rename_keys:
        # 如果当前项的第一个元素存在于原始参数名列表中
        if item[0] in original_param_names:
            # 将原始参数名映射到新的键,以"vision_model." + 第二个元素作为值
            key_mapping[item[0]] = "vision_model." + item[1]

    # BERT 文本编码器的重命名列表初始化为空
    rename_keys = []
    # 定义旧模型路径
    old = "tf_bert_model/bert"
    # 定义新模型路径
    new = "text_model"
    # 添加重命名对,将旧路径中的特定参数映射到新路径的对应参数上
    rename_keys.append((f"{old}/embeddings/word_embeddings/weight:0", f"{new}.embeddings.word_embeddings.weight"))
    rename_keys.append((f"{old}/embeddings/position_embeddings/embeddings:0", f"{new}.embeddings.position_embeddings.weight"))
    rename_keys.append((f"{old}/embeddings/token_type_embeddings/embeddings:0", f"{new}.embeddings.token_type_embeddings.weight"))
    rename_keys.append((f"{old}/embeddings/LayerNorm/gamma:0", f"{new}.embeddings.LayerNorm.weight"))
    rename_keys.append((f"{old}/embeddings/LayerNorm/beta:0", f"{new}.embeddings.LayerNorm.bias"))

    rename_keys.append((f"{old}/pooler/dense/kernel:0", f"{new}.pooler.dense.weight"))
    rename_keys.append((f"{old}/pooler/dense/bias:0", f"{new}.pooler.dense.bias"))
    rename_keys.append(("dense/kernel:0", "text_projection.weight"))
    rename_keys.append(("dense/bias:0", "text_projection.bias"))
    rename_keys.append(("dense/bias:0", "text_projection.bias"))
    rename_keys.append(("temperature:0", "temperature"))

    # 遍历重命名键列表中的每个项
    for item in rename_keys:
        # 如果当前项的第一个元素存在于原始参数名列表中
        if item[0] in original_param_names:
            # 将原始参数名映射到新的键,以当前项的第二个元素作为值
            key_mapping[item[0]] = item[1]
    # 返回最终的键映射字典
    return key_mapping
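
# Illustrative entries of the returned key_mapping (derived from the rules above):
#   "stem_conv/kernel:0"                        -> "vision_model.embeddings.convolution.weight"
#   "tf_bert_model/bert/pooler/dense/kernel:0"  -> "text_model.pooler.dense.weight"
#   "temperature:0"                             -> "temperature"
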
def convert_align_checkpoint(checkpoint_path, pytorch_dump_folder_path, save_model, push_to_hub):
    # Load the original TF model and its dependencies
    seq_length = 64
    tok = Tokenizer(seq_length)
    original_model = align.Align("efficientnet-b7", "bert-base", 640, seq_length, tok.get_vocab_size())
    original_model.compile()
    original_model.load_weights(checkpoint_path)

    # Collect the trainable and non-trainable TensorFlow parameters
    tf_params = original_model.trainable_variables
    tf_non_train_params = original_model.non_trainable_variables
    tf_params = {param.name: param.numpy() for param in tf_params}
    for param in tf_non_train_params:
        tf_params[param.name] = param.numpy()
    tf_param_names = list(tf_params.keys())

    # Load the HuggingFace model configuration and state dict
    config = get_align_config()
    hf_model = AlignModel(config).eval()
    hf_params = hf_model.state_dict()

    # Create the source-to-target parameter name mapping and copy the weights
    print("Converting parameters...")
    key_mapping = rename_keys(tf_param_names)
    replace_params(hf_params, tf_params, key_mapping)

    # Initialize the processor
    processor = get_processor()

    # Prepare the input data
    inputs = processor(
        images=prepare_img(),
        text="A picture of a cat",
        padding="max_length",
        max_length=64,
        return_tensors="pt"
    )

    # Run inference with the HuggingFace model
    hf_model.eval()
    with torch.no_grad():
        outputs = hf_model(**inputs)

    # Extract the image and text features from the HuggingFace model
    hf_image_features = outputs.image_embeds.detach().numpy()
    hf_text_features = outputs.text_embeds.detach().numpy()

    # Run inference with the original model
    original_model.trainable = False
    tf_image_processor = EfficientNetImageProcessor(
        do_center_crop=True,
        do_rescale=False,
        do_normalize=False,
        include_top=False,
        resample=Image.BILINEAR,
    )
    image = tf_image_processor(images=prepare_img(), return_tensors="tf", data_format="channels_last")["pixel_values"]
    text = tok(tf.constant(["A picture of a cat"]))
    # 使用原始模型的图像编码器生成图像特征,设置为不训练状态
    image_features = original_model.image_encoder(image, training=False)
    # 使用原始模型的文本编码器生成文本特征,设置为不训练状态
    text_features = original_model.text_encoder(text, training=False)

    # 对图像特征进行 L2 归一化
    image_features = tf.nn.l2_normalize(image_features, axis=-1)
    # 对文本特征进行 L2 归一化
    text_features = tf.nn.l2_normalize(text_features, axis=-1)

    # 检查原始模型和HF模型的输出是否匹配,使用 np.allclose 函数进行比较,允许的最大误差为 1e-3
    if not np.allclose(image_features, hf_image_features, atol=1e-3):
        raise ValueError("The predicted image features are not the same.")
    if not np.allclose(text_features, hf_text_features, atol=1e-3):
        raise ValueError("The predicted text features are not the same.")
    # 输出匹配成功的消息
    print("Model outputs match!")

    if save_model:
        # 如果需要保存模型,创建保存模型的文件夹
        if not os.path.isdir(pytorch_dump_folder_path):
            os.mkdir(pytorch_dump_folder_path)
        # 将转换后的 HF 模型和处理器保存到指定路径
        hf_model.save_pretrained(pytorch_dump_folder_path)
        processor.save_pretrained(pytorch_dump_folder_path)

    if push_to_hub:
        # 如果需要推送到 Hub,打印推送信息
        print("Pushing converted ALIGN to the hub...")
        # 将模型和处理器推送到 Hub 上,使用 "align-base" 作为标识
        processor.push_to_hub("align-base")
        hf_model.push_to_hub("align-base")
if __name__ == "__main__":
    # 如果当前脚本作为主程序运行,则执行以下代码块

    parser = argparse.ArgumentParser()
    # 创建参数解析器对象

    # 必需的参数
    parser.add_argument(
        "--checkpoint_path",
        default="./weights/model-weights",
        type=str,
        help="Path to the pretrained TF ALIGN checkpoint."
    )
    # 添加名为--checkpoint_path的参数,用于指定预训练的 TF ALIGN 检查点路径,默认为"./weights/model-weights"

    parser.add_argument(
        "--pytorch_dump_folder_path",
        default="hf_model",
        type=str,
        help="Path to the output PyTorch model directory."
    )
    # 添加名为--pytorch_dump_folder_path的参数,用于指定输出的 PyTorch 模型目录路径,默认为"hf_model"

    parser.add_argument("--save_model", action="store_true", help="Save model to local")
    # 添加一个标志参数--save_model,如果设置则表示要将模型保存到本地

    parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub")
    # 添加一个标志参数--push_to_hub,如果设置则表示要将模型和图像处理器推送到Hub

    # 解析命令行参数
    args = parser.parse_args()

    # 调用函数convert_align_checkpoint,传递解析后的参数作为参数传递给函数
    convert_align_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub)

.\models\align\modeling_align.py

# coding=utf-8
# Copyright 2023 The Google Research Team Authors and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch ALIGN model."""

import math  # 导入数学函数库
from dataclasses import dataclass  # 导入用于创建数据类的装饰器
from typing import Any, Optional, Tuple, Union  # 导入类型提示相关库

import torch  # 导入PyTorch深度学习库
import torch.utils.checkpoint  # 导入PyTorch的checkpoint模块
from torch import nn  # 导入神经网络模块

from ...activations import ACT2FN  # 导入激活函数相关定义
from ...modeling_outputs import (
    BaseModelOutputWithNoAttention,  # 导入无注意力机制的基础模型输出
    BaseModelOutputWithPastAndCrossAttentions,  # 导入带过去和交叉注意力的基础模型输出
    BaseModelOutputWithPoolingAndCrossAttentions,  # 导入带池化和交叉注意力的基础模型输出
    BaseModelOutputWithPoolingAndNoAttention,  # 导入带池化但无注意力机制的基础模型输出
)
from ...modeling_utils import PreTrainedModel  # 导入预训练模型工具函数
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer  # 导入PyTorch工具函数
from ...utils import (
    ModelOutput,  # 导入模型输出定义
    add_start_docstrings,  # 导入添加文档字符串的函数
    add_start_docstrings_to_model_forward,  # 导入添加模型前向传播文档字符串的函数
    logging,  # 导入日志记录功能
    replace_return_docstrings,  # 导入替换返回文档字符串的函数
)
from .configuration_align import AlignConfig, AlignTextConfig, AlignVisionConfig  # 导入配置类

logger = logging.get_logger(__name__)  # 获取当前模块的日志记录器

_CHECKPOINT_FOR_DOC = "kakaobrain/align-base"  # 预训练模型的检查点名称
_CONFIG_FOR_DOC = "AlignConfig"  # 配置类的名称


ALIGN_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "kakaobrain/align-base",  # 预训练模型存档列表,包括基础模型
    # 查看所有ALIGN模型:https://huggingface.co/models?filter=align
]


ALIGN_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`AlignConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

ALIGN_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            # 输入序列的词汇索引。默认情况下会忽略填充部分。
            # 可以使用 AutoTokenizer 获取索引。详见 PreTrainedTokenizer.encode 和 PreTrainedTokenizer.__call__。

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 遮罩,用于在填充的令牌索引上避免注意力操作。取值范围为 `[0, 1]`:

            - 1 表示**未遮罩**的令牌,
            - 0 表示**遮罩**的令牌。

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 每个输入序列令牌在位置嵌入中的位置索引。取值范围为 `[0, config.max_position_embeddings - 1]`。

            [What are position IDs?](../glossary#position-ids)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            # 分段令牌索引,用于指示输入的第一部分和第二部分。索引选取范围为 `[0, 1]`:

            - 0 对应 *句子 A* 的令牌,
            - 1 对应 *句子 B* 的令牌。

            [What are token type IDs?](../glossary#token-type-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            # 用于屏蔽自注意力模块中选定头部的掩码。取值范围为 `[0, 1]`:

            - 1 表示**未遮罩**的头部,
            - 0 表示**遮罩**的头部。

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            # 可选地,可以直接传递嵌入表示而不是传递 `input_ids`。如果您希望更精确地控制如何将 `input_ids` 索引转换为相关联的向量,则这很有用,而不是使用模型的内部嵌入查找矩阵。
        output_attentions (`bool`, *optional*):
            # 是否返回所有注意力层的注意力张量。详见返回的张量中的 `attentions` 获取更多细节。
        output_hidden_states (`bool`, *optional*):
            # 是否返回所有层的隐藏状态。详见返回的张量中的 `hidden_states` 获取更多细节。
        return_dict (`bool`, *optional*):
            # Whether to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# ALIGN_VISION_INPUTS_DOCSTRING 是一个原始字符串,用于描述 AlignVisionModel 类中 align_vision 模块的输入参数和返回值
ALIGN_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`EfficientNetImageProcessor.__call__`] for details.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# ALIGN_INPUTS_DOCSTRING documents the combined text + vision inputs of the full AlignModel
ALIGN_INPUTS_DOCSTRING = r"""
"""


@dataclass
class AlignVisionModelOutput(ModelOutput):
    """
    AlignVisionModelOutput 是一个数据类,用于存储视觉模型输出,包含图像嵌入和最后一层隐藏状态的汇总。

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class AlignTextModelOutput(ModelOutput):
    """
    AlignTextModelOutput 是一个数据类,用于存储文本模型的输出,包含最后一层隐藏状态的汇总。

    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
    Args:
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # Optional: Text embeddings of shape (batch_size, output_dim) if projection layer is used.
    text_embeds: Optional[torch.FloatTensor] = None
    # Required: Hidden states of shape (batch_size, sequence_length, hidden_size) from the last model layer.
    last_hidden_state: torch.FloatTensor = None
    # Optional: Tuple of hidden states from all layers, including embeddings if present, of shape (batch_size, sequence_length, hidden_size).
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Optional: Tuple of attention weights for each layer of shape (batch_size, num_heads, sequence_length, sequence_length).
    attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class AlignOutput(ModelOutput):
    """
    Output class for [`AlignModel`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The dot-product scores between the image embeddings and the text embeddings. This represents the
            image-text similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The dot-product scores between the text embeddings and the image embeddings. This represents the
            text-image similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The output of [`AlignVisionModel`].
        text_model_output (`BaseModelOutputWithPoolingAndCrossAttentions`):
            The output of [`AlignTextModel`], including pooling and cross attentions.
        vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
            The output of [`AlignVisionModel`], including pooling but no attentions.
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPoolingAndCrossAttentions = None
    vision_model_output: BaseModelOutputWithPoolingAndNoAttention = None

    def to_tuple(self) -> Tuple[Any]:
        """
        将对象转换为元组形式,用于序列化。

        Returns:
            Tuple[Any]: 对象的元组表示。
        """
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


# Contrastive loss function, adapted from
# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    """
    Compute the contrastive (cross-entropy) loss, treating the diagonal of `logits` as the positive pairs.

    Args:
        logits (torch.Tensor): Similarity logits of shape `(batch_size, batch_size)`.

    Returns:
        torch.Tensor: The contrastive loss.
    """
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device), label_smoothing=0.1)


def align_loss(similarity: torch.Tensor) -> torch.Tensor:
    """
    Compute the ALIGN loss as the average of the caption (text-to-image) and image (image-to-text) contrastive
    losses.

    Args:
        similarity (torch.Tensor): Image-text similarity matrix.

    Returns:
        torch.Tensor: The ALIGN loss.
    """
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0
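

# Illustrative sketch (not part of the original file): `align_loss` is a symmetric contrastive objective.
# Given a (batch, batch) similarity matrix whose diagonal holds the matching image-text pairs, the targets
# for both cross-entropy terms are simply `arange(batch)`. The sizes and values below are hypothetical.
def _example_align_loss() -> torch.Tensor:
    import torch
    from torch import nn

    # Two well-aligned pairs: large similarity on the diagonal, small elsewhere.
    sim = torch.tensor([[10.0, 0.0], [0.0, 10.0]])
    caption_loss = nn.functional.cross_entropy(sim, torch.arange(2), label_smoothing=0.1)
    image_loss = nn.functional.cross_entropy(sim.t(), torch.arange(2), label_smoothing=0.1)
    # Same value `align_loss(sim)` computes; a matrix with positives off the diagonal would yield a much larger loss.
    return (caption_loss + image_loss) / 2.0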


# Copied from transformers.models.efficientnet.modeling_efficientnet.round_filters with EfficientNet->AlignVision
def round_filters(config: AlignVisionConfig, num_channels: int):
    """
    Round the number of filters based on the width (depth) multiplier.

    Args:
        config (AlignVisionConfig): Configuration object providing `width_coefficient` and `depth_divisor`.
        num_channels (int): Current number of channels.

    Returns:
        int: The rounded number of channels.
    """
    divisor = config.depth_divisor
    num_channels *= config.width_coefficient
    new_dim = max(divisor, int(num_channels + divisor / 2) // divisor * divisor)

    # Make sure that rounding down does not reduce the channel count by more than 10%.
    if new_dim < 0.9 * num_channels:
        new_dim += divisor

    return int(new_dim)
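

# Illustrative sketch (not part of the original file): the rounding above with concrete, hypothetical numbers.
# With width_coefficient=1.4 and depth_divisor=8, a base of 32 channels is scaled to 44.8 and rounded to the
# nearest multiple of 8 (48); since 48 >= 0.9 * 44.8, no correction is added.
def _example_round_filters() -> int:
    width_coefficient, depth_divisor, base_channels = 1.4, 8, 32   # hypothetical config values
    scaled = base_channels * width_coefficient                     # 44.8
    new_dim = max(depth_divisor, int(scaled + depth_divisor / 2) // depth_divisor * depth_divisor)  # 48
    if new_dim < 0.9 * scaled:                                     # 48 >= 40.32, so this branch is skipped
        new_dim += depth_divisor
    return int(new_dim)                                            # 48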
# Copied from transformers.models.efficientnet.modeling_efficientnet.correct_pad
# 定义一个函数,用于计算深度卷积的填充值
def correct_pad(kernel_size: Union[int, Tuple], adjust: bool = True):
    """
    Utility function to get the tuple padding value for the depthwise convolution.

    Args:
        kernel_size (`int` or `tuple`):
            Kernel size of the convolution layers.
        adjust (`bool`, *optional*, defaults to `True`):
            Adjusts padding value to apply to right and bottom sides of the input.
    """
    # 如果 `kernel_size` 是 `int` 类型,则转换为元组形式
    if isinstance(kernel_size, int):
        kernel_size = (kernel_size, kernel_size)

    # 计算正确的填充值,使得深度卷积的输出尺寸与输入尺寸相同
    correct = (kernel_size[0] // 2, kernel_size[1] // 2)
    if adjust:
        # 如果需要调整填充值,则返回调整后的填充元组
        return (correct[1] - 1, correct[1], correct[0] - 1, correct[0])
    else:
        # 否则返回未调整的填充元组
        return (correct[1], correct[1], correct[0], correct[0])
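

# Illustrative sketch (not part of the original file): the tuples correct_pad produces for a 3x3 kernel,
# in the (left, right, top, bottom) order expected by nn.ZeroPad2d.
_example_pad_asymmetric = correct_pad(3, adjust=True)    # (0, 1, 0, 1): one extra pixel on right/bottom only
_example_pad_symmetric = correct_pad(3, adjust=False)    # (1, 1, 1, 1): the same padding on all sides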


# Copied from transformers.models.efficientnet.modeling_efficientnet.EfficientNetEmbeddings with EfficientNet->AlignVision
# 定义一个用于视觉对齐的嵌入模块,类似于原始实现中的干节点模块
class AlignVisionEmbeddings(nn.Module):
    """
    A module that corresponds to the stem module of the original work.
    """

    def __init__(self, config: AlignVisionConfig):
        super().__init__()

        # 计算输出维度,根据配置文件中的信息
        self.out_dim = round_filters(config, 32)
        # 添加零填充层,填充的方式为 (0, 1, 0, 1)
        self.padding = nn.ZeroPad2d(padding=(0, 1, 0, 1))
        # 定义二维卷积层,用于提取特征
        self.convolution = nn.Conv2d(
            config.num_channels, self.out_dim, kernel_size=3, stride=2, padding="valid", bias=False
        )
        # 批归一化层,用于规范化数据分布
        self.batchnorm = nn.BatchNorm2d(self.out_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum)
        # 激活函数,根据配置文件中指定的激活函数类型选择
        self.activation = ACT2FN[config.hidden_act]

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        # 对输入的像素值进行填充
        features = self.padding(pixel_values)
        # 应用卷积操作
        features = self.convolution(features)
        # 应用批归一化
        features = self.batchnorm(features)
        # 应用激活函数
        features = self.activation(features)

        return features


# Copied from transformers.models.efficientnet.modeling_efficientnet.EfficientNetDepthwiseConv2d with EfficientNet->AlignVision
# 定义一个深度卷积层,继承自 PyTorch 的二维卷积层
class AlignVisionDepthwiseConv2d(nn.Conv2d):
    def __init__(
        self,
        in_channels,
        depth_multiplier=1,
        kernel_size=3,
        stride=1,
        padding=0,
        dilation=1,
        bias=True,
        padding_mode="zeros",
    ):
        # 计算输出通道数,根据输入通道数和深度倍增因子
        out_channels = in_channels * depth_multiplier
        # 调用父类的初始化方法,定义深度卷积层
        super().__init__(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,  # 使用组卷积,每个输入通道对应一个卷积核
            bias=bias,
            padding_mode=padding_mode,
        )
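

# Illustrative sketch (not part of the original file): a depthwise convolution keeps channels independent
# by setting groups=in_channels, which is what the subclass above does. The sizes below are hypothetical.
def _example_depthwise_conv():
    import torch
    from torch import nn

    dw = nn.Conv2d(in_channels=8, out_channels=8, kernel_size=3, padding=1, groups=8, bias=False)
    x = torch.randn(1, 8, 16, 16)
    out = dw(x)
    # Each of the 8 input channels is convolved with its own 3x3 filter; the spatial size is preserved.
    assert out.shape == (1, 8, 16, 16)
    return out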


# Copied from transformers.models.efficientnet.modeling_efficientnet.EfficientNetExpansionLayer with EfficientNet->AlignVision
# 定义一个扩展层模块,对应原始实现中每个块的扩展阶段
class AlignVisionExpansionLayer(nn.Module):
    """
    This corresponds to the expansion phase of each block in the original implementation.
    """
    # 初始化函数,用于创建一个卷积神经网络模块
    def __init__(self, config: AlignVisionConfig, in_dim: int, out_dim: int, stride: int):
        super().__init__()
        # 定义一个1x1的卷积层,用于扩展输入通道数到输出通道数
        self.expand_conv = nn.Conv2d(
            in_channels=in_dim,
            out_channels=out_dim,
            kernel_size=1,
            padding="same",  # 设定填充方式为 "same",即保持输入输出尺寸相同
            bias=False,  # 不使用偏置项
        )
        # 定义扩展后的批归一化层,对输出通道数进行归一化处理
        self.expand_bn = nn.BatchNorm2d(num_features=out_dim, eps=config.batch_norm_eps)
        # 根据配置选择激活函数,ACT2FN 是一个预定义的激活函数字典
        self.expand_act = ACT2FN[config.hidden_act]

    # 前向传播函数
    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        # 执行扩展阶段的前向传播
        # 将输入的 hidden_states 通过扩展卷积层进行卷积操作
        hidden_states = self.expand_conv(hidden_states)
        # 对卷积结果进行扩展批归一化处理
        hidden_states = self.expand_bn(hidden_states)
        # 应用预定义的激活函数到批归一化后的结果
        hidden_states = self.expand_act(hidden_states)

        # 返回经过扩展阶段处理后的隐藏状态数据
        return hidden_states
# 从 EfficientNet 的模型定义中复制而来,用于实现 AlignVision 的深度卷积层
class AlignVisionDepthwiseLayer(nn.Module):
    r"""
    This corresponds to the depthwise convolution phase of each block in the original implementation.
    """

    def __init__(
        self,
        config: AlignVisionConfig,
        in_dim: int,
        stride: int,
        kernel_size: int,
        adjust_padding: bool,
    ):
        super().__init__()
        self.stride = stride
        # 根据步长选择是否使用 'valid' 或 'same' 的填充方式
        conv_pad = "valid" if self.stride == 2 else "same"
        # 计算正确的填充大小
        padding = correct_pad(kernel_size, adjust=adjust_padding)

        # 深度卷积层的零填充
        self.depthwise_conv_pad = nn.ZeroPad2d(padding=padding)
        # 深度卷积层的定义
        self.depthwise_conv = AlignVisionDepthwiseConv2d(
            in_dim, kernel_size=kernel_size, stride=stride, padding=conv_pad, bias=False
        )
        # 批归一化层
        self.depthwise_norm = nn.BatchNorm2d(
            num_features=in_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum
        )
        # 激活函数的选择
        self.depthwise_act = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        # 深度卷积操作
        if self.stride == 2:
            hidden_states = self.depthwise_conv_pad(hidden_states)

        hidden_states = self.depthwise_conv(hidden_states)
        hidden_states = self.depthwise_norm(hidden_states)
        hidden_states = self.depthwise_act(hidden_states)

        return hidden_states


# 从 EfficientNet 的模型定义中复制而来,用于实现 AlignVision 的挤压激活层
class AlignVisionSqueezeExciteLayer(nn.Module):
    r"""
    This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
    """

    def __init__(self, config: AlignVisionConfig, in_dim: int, expand_dim: int, expand: bool = False):
        super().__init__()
        # 根据是否扩展选择维度
        self.dim = expand_dim if expand else in_dim
        # 计算挤压激活层的输出维度
        self.dim_se = max(1, int(in_dim * config.squeeze_expansion_ratio))

        # 挤压阶段:全局平均池化
        self.squeeze = nn.AdaptiveAvgPool2d(output_size=1)
        # 激活阶段:降维卷积
        self.reduce = nn.Conv2d(
            in_channels=self.dim,
            out_channels=self.dim_se,
            kernel_size=1,
            padding="same",
        )
        # 激活阶段:扩展卷积
        self.expand = nn.Conv2d(
            in_channels=self.dim_se,
            out_channels=self.dim,
            kernel_size=1,
            padding="same",
        )
        # 降维卷积后的激活函数
        self.act_reduce = ACT2FN[config.hidden_act]
        # 扩展卷积后的激活函数
        self.act_expand = nn.Sigmoid()
    # 定义前向传播方法,接受隐藏状态作为输入并返回张量
    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        # 将输入赋给局部变量inputs
        inputs = hidden_states
        # 使用squeeze方法对隐藏状态进行压缩操作
        hidden_states = self.squeeze(hidden_states)
        # 使用reduce方法对压缩后的隐藏状态进行进一步处理
        hidden_states = self.reduce(hidden_states)
        # 对进一步处理后的隐藏状态应用激活函数
        hidden_states = self.act_reduce(hidden_states)

        # 使用expand方法对处理后的隐藏状态进行扩展操作
        hidden_states = self.expand(hidden_states)
        # 对扩展后的隐藏状态应用激活函数
        hidden_states = self.act_expand(hidden_states)
        # 将原始输入与扩展后的隐藏状态进行逐元素乘法操作
        hidden_states = torch.mul(inputs, hidden_states)

        # 返回经过处理后的隐藏状态张量
        return hidden_states
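

# Illustrative sketch (not part of the original file): the squeeze-and-excite branch produces a per-channel
# gate of shape (batch, channels, 1, 1) that is broadcast-multiplied with the (batch, channels, H, W) input.
# Hypothetical sizes: 32 channels with squeeze_expansion_ratio=0.25 would give dim_se = 8.
def _example_squeeze_excite_gate():
    import torch

    features = torch.randn(2, 32, 14, 14)              # feature map entering the block
    gate = torch.sigmoid(torch.randn(2, 32, 1, 1))     # stands in for squeeze -> reduce -> expand -> sigmoid
    gated = torch.mul(features, gate)                  # broadcasting over H and W, as in forward() above
    assert gated.shape == (2, 32, 14, 14)
    return gated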

# Copied from transformers.models.efficientnet.modeling_efficientnet.EfficientNetFinalBlockLayer with EfficientNet->AlignVision
class AlignVisionFinalBlockLayer(nn.Module):
    r"""
    This corresponds to the final (projection) phase of each block in the original implementation.
    """

    def __init__(
        self, config: AlignVisionConfig, in_dim: int, out_dim: int, stride: int, drop_rate: float, id_skip: bool
    ):
        # Call the parent class constructor
        super().__init__()

        self.apply_dropout = stride == 1 and not id_skip
        # 根据参数确定是否应用 dropout,条件是 stride 为 1 且 id_skip 为 False

        self.project_conv = nn.Conv2d(
            in_channels=in_dim,
            out_channels=out_dim,
            kernel_size=1,
            padding="same",
            bias=False,
        )
        # 创建一个卷积层,用于将输入通道数 in_dim 转换为输出通道数 out_dim,
        # 使用 1x1 的卷积核,padding 设置为 "same",不包含偏置项

        self.project_bn = nn.BatchNorm2d(
            num_features=out_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum
        )
        # 创建一个批归一化层,对输出通道数为 out_dim 的特征图进行批归一化,
        # 使用配置类 config 中指定的批归一化参数 eps 和 momentum

        self.dropout = nn.Dropout(p=drop_rate)
        # 创建一个 dropout 层,使用指定的 dropout rate drop_rate

    def forward(self, embeddings: torch.FloatTensor, hidden_states: torch.FloatTensor) -> torch.Tensor:
        # 前向传播函数,输入为 embeddings 和 hidden_states,输出为 torch.Tensor

        hidden_states = self.project_conv(hidden_states)
        # 将 hidden_states 通过之前定义的卷积层 project_conv 进行卷积操作

        hidden_states = self.project_bn(hidden_states)
        # 将卷积后的 hidden_states 通过批归一化层 project_bn 进行批归一化操作

        if self.apply_dropout:
            # 如果 apply_dropout 为 True,则执行以下操作
            hidden_states = self.dropout(hidden_states)
            # 对 hidden_states 应用 dropout 操作
            hidden_states = hidden_states + embeddings
            # 将 dropout 后的 hidden_states 与输入的 embeddings 相加

        return hidden_states
        # Return the processed hidden states


# Copied from transformers.models.efficientnet.modeling_efficientnet.EfficientNetBlock with EfficientNet->AlignVision
class AlignVisionBlock(nn.Module):
    r"""
    This corresponds to the expansion and depthwise convolution phase (MBConv block) of the original
    implementation.
    """

    def __init__(
        self,
        config: AlignVisionConfig,
        in_dim: int,
        out_dim: int,
        stride: int,
        expand_ratio: int,
        kernel_size: int,
        drop_rate: float,
        id_skip: bool,
        adjust_padding: bool,
    ):
        # Call the parent class constructor
        super().__init__()
        # 设置扩展比例
        self.expand_ratio = expand_ratio
        # 根据扩展比例确定是否需要扩展操作
        self.expand = True if self.expand_ratio != 1 else False
        # 计算扩展后的输入维度
        expand_in_dim = in_dim * expand_ratio

        # 如果需要扩展,则创建扩展层对象
        if self.expand:
            self.expansion = AlignVisionExpansionLayer(
                config=config, in_dim=in_dim, out_dim=expand_in_dim, stride=stride
            )

        # 创建深度卷积层对象
        self.depthwise_conv = AlignVisionDepthwiseLayer(
            config=config,
            in_dim=expand_in_dim if self.expand else in_dim,
            stride=stride,
            kernel_size=kernel_size,
            adjust_padding=adjust_padding,
        )
        # 创建挤压激活层对象
        self.squeeze_excite = AlignVisionSqueezeExciteLayer(
            config=config, in_dim=in_dim, expand_dim=expand_in_dim, expand=self.expand
        )
        # 创建投影层对象
        self.projection = AlignVisionFinalBlockLayer(
            config=config,
            in_dim=expand_in_dim if self.expand else in_dim,
            out_dim=out_dim,
            stride=stride,
            drop_rate=drop_rate,
            id_skip=id_skip,
        )

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        # 将输入的隐藏状态作为嵌入向量
        embeddings = hidden_states
        # 执行扩展和深度卷积阶段
        if self.expand_ratio != 1:
            hidden_states = self.expansion(hidden_states)
        hidden_states = self.depthwise_conv(hidden_states)

        # 执行挤压激活阶段
        hidden_states = self.squeeze_excite(hidden_states)
        # 执行投影阶段
        hidden_states = self.projection(embeddings, hidden_states)
        # 返回处理后的隐藏状态
        return hidden_states
class AlignVisionEncoder(nn.Module):
    r"""
    Forward propagates the embeddings through each vision encoder (EfficientNet) block.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
    """

    def __init__(self, config: AlignVisionConfig):
        super().__init__()
        self.depth_coefficient = config.depth_coefficient

        def round_repeats(repeats):
            # Round number of block repeats based on depth multiplier.
            return int(math.ceil(self.depth_coefficient * repeats))

        num_base_blocks = len(config.in_channels)
        num_blocks = sum(round_repeats(n) for n in config.num_block_repeats)

        curr_block_num = 0
        blocks = []
        for i in range(num_base_blocks):
            in_dim = round_filters(config, config.in_channels[i])
            out_dim = round_filters(config, config.out_channels[i])
            stride = config.strides[i]
            kernel_size = config.kernel_sizes[i]
            expand_ratio = config.expand_ratios[i]

            for j in range(round_repeats(config.num_block_repeats[i])):
                id_skip = True if j == 0 else False
                stride = 1 if j > 0 else stride
                in_dim = out_dim if j > 0 else in_dim
                adjust_padding = False if curr_block_num in config.depthwise_padding else True
                drop_rate = config.drop_connect_rate * curr_block_num / num_blocks

                block = AlignVisionBlock(
                    config=config,
                    in_dim=in_dim,
                    out_dim=out_dim,
                    stride=stride,
                    kernel_size=kernel_size,
                    expand_ratio=expand_ratio,
                    drop_rate=drop_rate,
                    id_skip=id_skip,
                    adjust_padding=adjust_padding,
                )
                blocks.append(block)
                curr_block_num += 1

        self.blocks = nn.ModuleList(blocks)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> BaseModelOutputWithPoolingAndNoAttention:
        all_hidden_states = (hidden_states,) if output_hidden_states else None

        # Iterate through each block and perform forward pass
        for block in self.blocks:
            hidden_states = block(hidden_states)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

        # Return output based on return_dict flag
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)

        return BaseModelOutputWithNoAttention(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
        )
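
# Illustrative sketch (not part of the original file): how `round_repeats` scales the number of blocks per
# stage. With a hypothetical depth_coefficient of 1.2 and base repeats [1, 2, 2, 3], each stage count is
# rounded up with `ceil`.
def _example_round_repeats():
    import math

    depth_coefficient = 1.2                     # hypothetical value; the real one comes from AlignVisionConfig
    base_repeats = [1, 2, 2, 3]
    scaled = [int(math.ceil(depth_coefficient * n)) for n in base_repeats]
    assert scaled == [2, 3, 3, 4]
    return scaled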


# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->AlignText
class AlignTextEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""
    # 初始化函数,接受一个配置对象 config
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()
        # 创建词嵌入层,使用 nn.Embedding 类,设置词汇表大小、隐藏大小,并指定填充标记的索引
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # 创建位置嵌入层,使用 nn.Embedding 类,设置最大位置编码和隐藏大小
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # 创建标记类型嵌入层,使用 nn.Embedding 类,设置标记类型词汇表大小和隐藏大小
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # 使用 nn.LayerNorm 创建层归一化层,设置隐藏大小和 epsilon 参数
        # self.LayerNorm 的命名方式不使用蛇形命名法,以保持与 TensorFlow 模型变量名的一致性,并能够加载任何 TensorFlow 检查点文件
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 创建 Dropout 层,设置隐藏单元的 dropout 概率
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        
        # 设置位置嵌入类型,默认为绝对位置编码
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # 注册一个缓冲区 tensor,存储从 0 到 config.max_position_embeddings-1 的整数序列,形状为 (1, max_position_embeddings)
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        # 注册一个缓冲区 tensor,存储全零的 token_type_ids,形状与 position_ids 相同,数据类型为 long 型
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    # 前向传播函数,接受多个输入参数,输出模型的前向传播结果
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,  # 输入的 token 序列的 ID,数据类型为 LongTensor,可选
        token_type_ids: Optional[torch.LongTensor] = None,  # token 类型的 ID,数据类型为 LongTensor,可选
        position_ids: Optional[torch.LongTensor] = None,  # 位置编码的 ID,数据类型为 LongTensor,可选
        inputs_embeds: Optional[torch.FloatTensor] = None,  # 输入的嵌入表示,数据类型为 FloatTensor,可选
        past_key_values_length: int = 0,  # 过去的键值对长度,整数类型,默认为 0
    ) -> torch.Tensor:
        # 如果给定了 input_ids,则获取其形状作为 input_shape
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            # 否则,从 inputs_embeds 获取形状,去除最后一个维度
            input_shape = inputs_embeds.size()[:-1]

        # 获取序列长度
        seq_length = input_shape[1]

        # 如果 position_ids 为 None,则从 self.position_ids 中获取一部分,匹配当前序列的长度
        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

        # 设置 token_type_ids 为构造函数中注册的缓冲区,通常是全零,用于在模型追踪时帮助用户,解决问题 #5664
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                # 否则,将 token_type_ids 初始化为全零张量,与输入形状匹配
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        # 如果 inputs_embeds 为 None,则使用 word_embeddings 对 input_ids 进行嵌入
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        # 根据 token_type_ids 获取 token_type_embeddings
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        # 将 inputs_embeds 和 token_type_embeddings 相加作为最终的 embeddings
        embeddings = inputs_embeds + token_type_embeddings

        # 如果使用绝对位置编码,则加上 position_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings

        # 对 embeddings 进行 LayerNorm 处理
        embeddings = self.LayerNorm(embeddings)
        # 对 embeddings 进行 dropout 处理
        embeddings = self.dropout(embeddings)

        # 返回最终的 embeddings
        return embeddings
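
# Illustrative sketch (not part of the original file): the three embeddings are summed element-wise before
# LayerNorm and dropout. Hypothetical sizes (vocab=30522, hidden=768, max positions=512) for a (2, 5) batch.
def _example_text_embeddings():
    import torch
    from torch import nn

    word = nn.Embedding(30522, 768)(torch.randint(0, 30522, (2, 5)))
    token_type = nn.Embedding(2, 768)(torch.zeros(2, 5, dtype=torch.long))
    position = nn.Embedding(512, 768)(torch.arange(5).unsqueeze(0))   # broadcasts over the batch dimension
    embeddings = nn.LayerNorm(768)(word + token_type + position)
    assert embeddings.shape == (2, 5, 768)
    return embeddings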
# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->AlignText
class AlignTextSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Linear transformation for query, key, and value projections
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # Dropout layer
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        
        # Position embedding type handling
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        # Flag indicating if the module is used as a decoder
        self.is_decoder = config.is_decoder

    # Reshape and permute the input tensor for attention scores computation
    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ):
        # The full forward pass (mirroring BertSelfAttention) is omitted in this excerpt: it projects the hidden
        # states to queries/keys/values, computes scaled dot-product attention scores (optionally with relative
        # position terms), applies the attention and head masks, and returns the weighted sum of the values.
        pass
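
# Illustrative sketch (not part of the original file): the core multi-head attention computation the omitted
# forward performs, ignoring masks, caching and relative position embeddings. Hypothetical sizes:
# batch=2, seq=4, heads=12, head_size=64 (hidden=768).
def _example_self_attention():
    import math
    import torch

    batch, seq, heads, head_size = 2, 4, 12, 64
    # These correspond to transpose_for_scores(self.query(hidden_states)) and friends.
    query = torch.randn(batch, heads, seq, head_size)
    key = torch.randn(batch, heads, seq, head_size)
    value = torch.randn(batch, heads, seq, head_size)

    scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(head_size)    # (batch, heads, seq, seq)
    probs = torch.softmax(scores, dim=-1)                                         # attention_probs (pre-dropout)
    context = torch.matmul(probs, value)                                          # (batch, heads, seq, head_size)
    context = context.permute(0, 2, 1, 3).reshape(batch, seq, heads * head_size)  # back to (batch, seq, hidden)
    assert context.shape == (2, 4, 768)
    return context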

# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->AlignText
class AlignTextSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Fully connected layer for output transformation
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Layer normalization
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout layer
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # Apply dense layer, dropout, layer normalization, and residual connection
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
# 从 transformers.models.bert.modeling_bert.BertAttention 复制并修改为 AlignTextAttention 类
class AlignTextAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # 初始化 AlignTextSelfAttention 层
        self.self = AlignTextSelfAttention(config, position_embedding_type=position_embedding_type)
        # 初始化 AlignTextSelfOutput 层
        self.output = AlignTextSelfOutput(config)
        # 存储被修剪的注意力头部的集合
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # 找到可修剪的注意力头部和相应索引
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # 修剪线性层
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # 更新超参数并存储修剪后的头部
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # 调用 self 层的 forward 方法,进行自注意力计算
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        # 将 self 输出传递给 output 层,并结合输入的 hidden_states 计算注意力输出
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # 如果需要输出注意力权重,则添加到输出中
        return outputs


# 从 transformers.models.bert.modeling_bert.BertIntermediate 复制并修改为 AlignTextIntermediate 类
class AlignTextIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 定义线性层,将隐藏状态映射到中间状态
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # 根据配置选择激活函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 经过线性映射
        hidden_states = self.dense(hidden_states)
        # 应用中间层的激活函数
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# 从 transformers.models.bert.modeling_bert.BertOutput 复制并修改为 AlignTextOutput 类
class AlignTextOutput(nn.Module):
    # 初始化函数,用于创建一个新的实例对象
    def __init__(self, config):
        # 调用父类的初始化方法
        super().__init__()
        # 创建一个全连接层,输入维度为config.intermediate_size,输出维度为config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # 创建一个LayerNorm层,对输入进行归一化,设置epsilon为config.layer_norm_eps
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 创建一个dropout层,以config.hidden_dropout_prob的概率随机将输入设置为0
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # 前向传播函数,处理输入的hidden_states和input_tensor,返回处理后的hidden_states
    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # 使用全连接层对hidden_states进行线性变换
        hidden_states = self.dense(hidden_states)
        # 对变换后的hidden_states进行dropout操作
        hidden_states = self.dropout(hidden_states)
        # 对dropout后的hidden_states和input_tensor进行残差连接,并进行LayerNorm归一化处理
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        # 返回处理后的hidden_states作为最终的输出结果
        return hidden_states
# 从transformers.models.bert.modeling_bert.BertLayer复制的代码,将Bert->AlignText
class AlignTextLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 设置前馈传递的块大小
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # 序列长度的维度设定为1
        self.seq_len_dim = 1
        # 初始化AlignTextAttention模块
        self.attention = AlignTextAttention(config)
        # 是否作为解码器使用
        self.is_decoder = config.is_decoder
        # 是否添加跨注意力
        self.add_cross_attention = config.add_cross_attention
        # 如果添加跨注意力,确保作为解码器模型使用
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            # 使用绝对位置嵌入类型初始化跨注意力模块
            self.crossattention = AlignTextAttention(config, position_embedding_type="absolute")
        # 初始化AlignTextIntermediate模块
        self.intermediate = AlignTextIntermediate(config)
        # 初始化AlignTextOutput模块
        self.output = AlignTextOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # Perform self-attention computation using the given inputs and past key/values
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        # Retrieve the attention output from the self-attention computation
        attention_output = self_attention_outputs[0]

        # if decoder, the last output is tuple of self-attn cache
        if self.is_decoder:
            # Extract outputs excluding the first (attention_output) and last (present_key_value)
            outputs = self_attention_outputs[1:-1]
            # Retrieve the present key/values from self-attention computation
            present_key_value = self_attention_outputs[-1]
        else:
            # Include self attentions in outputs if we output attention weights
            outputs = self_attention_outputs[1:]

        cross_attn_present_key_value = None
        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            # Perform cross-attention computation using the given inputs and past key/values
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            # Retrieve the attention output from the cross-attention computation
            attention_output = cross_attention_outputs[0]
            # Add cross attentions to outputs if we output attention weights
            outputs = outputs + cross_attention_outputs[1:-1]

            # Append cross-attn cache to present_key_value tuple
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        # Apply chunking to forward computation for feed forward layer
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        # Prepare final outputs including layer output
        outputs = (layer_output,) + outputs

        # if decoder, return the attn key/values as the last output
        if self.is_decoder:
            outputs = outputs + (present_key_value,)

        # Return the computed outputs
        return outputs

    def feed_forward_chunk(self, attention_output):
        # Compute intermediate output using the attention output
        intermediate_output = self.intermediate(attention_output)
        # Compute final layer output using intermediate output and attention output
        layer_output = self.output(intermediate_output, attention_output)
        # Return the final layer output
        return layer_output
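
# Illustrative sketch (not part of the original file): when chunk_size_feed_forward > 0,
# apply_chunking_to_forward splits the tensor along the sequence dimension (seq_len_dim=1), runs the
# feed-forward on each chunk, and concatenates the results. A hypothetical equivalent:
def _example_chunked_feed_forward(forward_fn, chunk_size, attention_output):
    import torch

    if chunk_size == 0:
        return forward_fn(attention_output)
    chunks = attention_output.split(chunk_size, dim=1)               # chunks along the sequence dimension
    return torch.cat([forward_fn(chunk) for chunk in chunks], dim=1)

# e.g. _example_chunked_feed_forward(lambda t: t * 2, 2, torch.randn(1, 5, 8)) has shape (1, 5, 8)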
# 从transformers.models.bert.modeling_bert.BertEncoder复制代码,并将Bert->AlignText
class AlignTextEncoder(nn.Module):
    # 初始化函数,接受配置参数config
    def __init__(self, config):
        super().__init__()
        # 将配置参数保存在实例中
        self.config = config
        # 创建一个包含多个AlignTextLayer模块的层列表,列表长度为config.num_hidden_layers
        self.layer = nn.ModuleList([AlignTextLayer(config) for _ in range(config.num_hidden_layers)])
        # 梯度检查点标志默认为False
        self.gradient_checkpointing = False

    # 前向传播函数,接受多个输入参数
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        # 如果不需要输出隐藏状态,则初始化空元组;否则为 None
        all_hidden_states = () if output_hidden_states else None
        # 如果不需要输出注意力权重,则初始化空元组;否则为 None
        all_self_attentions = () if output_attentions else None
        # 如果不需要输出跨层注意力权重或者模型配置未开启跨层注意力,则初始化空元组;否则为 None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        # 如果启用了梯度检查点并且处于训练模式下
        if self.gradient_checkpointing and self.training:
            # 如果 use_cache 为 True,警告并设置为 False
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # 如果 use_cache 为 True,则初始化空元组;否则为 None
        next_decoder_cache = () if use_cache else None
        # 遍历每个解码器层
        for i, layer_module in enumerate(self.layer):
            # 如果需要输出隐藏状态,则将当前隐藏状态添加到 all_hidden_states 中
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 如果存在头部掩码,则获取当前层的头部掩码;否则为 None
            layer_head_mask = head_mask[i] if head_mask is not None else None
            # 如果存在过去的键值对,则获取当前层的过去键值对;否则为 None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            # 如果启用了梯度检查点并且处于训练模式下
            if self.gradient_checkpointing and self.training:
                # 使用梯度检查点函数来计算当前层的输出
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                # 否则直接调用当前层的前向传播函数来计算当前层的输出
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            # 更新隐藏状态为当前层的输出的第一个元素
            hidden_states = layer_outputs[0]
            # 如果 use_cache 为 True,则将当前层的缓存状态加入 next_decoder_cache
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            # 如果需要输出注意力权重,则将当前层的自注意力权重加入 all_self_attentions
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                # 如果模型配置开启了跨层注意力,则将当前层的跨层注意力权重加入 all_cross_attentions
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # 如果需要输出隐藏状态,则将最终隐藏状态加入 all_hidden_states
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # 如果不使用返回字典形式,则返回一个元组,包含需要的输出项
        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        # 否则返回一个包含所有输出的对象
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert -> AlignText
# 定义一个池化层的模块,用于处理ALIGN模型的隐藏状态
class AlignTextPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 定义一个全连接层,输入和输出大小都为config.hidden_size
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # 定义激活函数为双曲正切函数
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # 从隐藏状态中取出第一个标记对应的隐藏状态张量
        first_token_tensor = hidden_states[:, 0]
        # 将第一个标记的隐藏状态张量通过全连接层进行线性变换
        pooled_output = self.dense(first_token_tensor)
        # 将线性变换后的结果应用双曲正切激活函数
        pooled_output = self.activation(pooled_output)
        # 返回池化后的输出张量
        return pooled_output


class AlignPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """
    
    config_class = AlignConfig
    base_model_prefix = "align"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # 对线性层和卷积层的权重进行正态分布初始化
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果存在偏置项,则将偏置项初始化为零
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, AlignModel):
            # 对AlignModel模块的text_projection部分进行权重初始化(使用Xavier均匀分布)
            nn.init.xavier_uniform_(module.text_projection.weight)
            # 将text_projection的偏置项初始化为零
            module.text_projection.bias.data.zero_()
            # 设置_is_hf_initialized标志为True,表示已经初始化
            module.text_projection._is_hf_initialized = True
        elif isinstance(module, nn.Embedding):
            # 对嵌入层的权重进行正态分布初始化
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # 如果存在padding_idx,则将对应位置的权重初始化为零
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        if isinstance(module, nn.LayerNorm):
            # 对LayerNorm层的偏置项初始化为零
            module.bias.data.zero_()
            # 对LayerNorm层的权重初始化为全1
            module.weight.data.fill_(1.0)


@add_start_docstrings(
    """The text model from ALIGN without any head or projection on top.""",
    ALIGN_START_DOCSTRING,
)
class AlignTextModel(AlignPreTrainedModel):
    config_class = AlignTextConfig

    def __init__(self, config: AlignTextConfig, add_pooling_layer: bool = True):
        super().__init__(config)
        # 初始化AlignTextModel,包括嵌入层和编码器
        self.config = config

        self.embeddings = AlignTextEmbeddings(config)
        self.encoder = AlignTextEncoder(config)

        # 如果需要添加池化层,则初始化池化层
        self.pooler = AlignTextPooler(config) if add_pooling_layer else None

        # 初始化权重并进行最终处理
        self.post_init()

    def get_input_embeddings(self):
        # 返回嵌入层的词嵌入
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        # 设置嵌入层的词嵌入为给定的值
        self.embeddings.word_embeddings = value

    @add_start_docstrings_to_model_forward(ALIGN_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=AlignTextConfig)
    # Forward pass of the text model; the method body is omitted in this excerpt.
    def forward(
        self,
        # Token IDs of the input sequence
        input_ids: Optional[torch.Tensor] = None,
        # Attention mask marking positions the model should ignore when computing attention
        attention_mask: Optional[torch.Tensor] = None,
        # Token type IDs, e.g. to distinguish two sentences
        token_type_ids: Optional[torch.Tensor] = None,
        # Position of each token in the sequence
        position_ids: Optional[torch.Tensor] = None,
        # Head mask specifying which attention heads to mask out
        head_mask: Optional[torch.Tensor] = None,
        # Pre-computed embeddings passed directly instead of looking up `input_ids`
        inputs_embeds: Optional[torch.Tensor] = None,
        # Whether to return the attention weights
        output_attentions: Optional[bool] = None,
        # Whether to return the hidden states of all layers
        output_hidden_states: Optional[bool] = None,
        # Whether to return the outputs as a dict-like `ModelOutput`
        return_dict: Optional[bool] = None,
# 使用装饰器添加文档字符串,描述这是一个来自ALIGN模型的视觉模型,不含任何头部或顶部投影
@add_start_docstrings(
    """The vision model from ALIGN without any head or projection on top.""",
    ALIGN_START_DOCSTRING,
)
# 定义AlignVisionModel类,继承自AlignPreTrainedModel类
class AlignVisionModel(AlignPreTrainedModel):
    # 指定配置类为AlignVisionConfig
    config_class = AlignVisionConfig
    # 定义主要输入名称为"pixel_values"
    main_input_name = "pixel_values"
    # 不支持梯度检查点
    supports_gradient_checkpointing = False

    # 初始化方法,接受一个AlignVisionConfig类型的config参数
    def __init__(self, config: AlignVisionConfig):
        # 调用父类的初始化方法
        super().__init__(config)
        # 将config保存到实例中
        self.config = config
        # 初始化嵌入层对象AlignVisionEmbeddings
        self.embeddings = AlignVisionEmbeddings(config)
        # 初始化编码器对象AlignVisionEncoder
        self.encoder = AlignVisionEncoder(config)

        # 最终的池化层
        if config.pooling_type == "mean":
            # 如果配置中的池化类型为均值池化,则使用AvgPool2d进行初始化
            self.pooler = nn.AvgPool2d(config.hidden_dim, ceil_mode=True)
        elif config.pooling_type == "max":
            # 如果配置中的池化类型为最大池化,则使用MaxPool2d进行初始化
            self.pooler = nn.MaxPool2d(config.hidden_dim, ceil_mode=True)
        else:
            # If the pooling type is neither 'mean' nor 'max', raise a ValueError
            raise ValueError(f"config.pooling_type must be one of ['mean', 'max'] got {config.pooling_type}")

        # 初始化权重并应用最终处理
        self.post_init()

    # Return the convolution layer of the vision embeddings as the model's input embeddings
    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.convolution

    # 覆盖模型的forward方法,接受pixel_values、output_hidden_states和return_dict作为参数
    @add_start_docstrings_to_model_forward(ALIGN_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndNoAttention, config_class=AlignVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]:
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 如果用户未指定 output_hidden_states,则使用模型配置中的默认值

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 如果用户未指定 return_dict,则使用模型配置中的默认设置

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")
        # 如果未提供 pixel_values,则抛出数值错误异常

        embedding_output = self.embeddings(pixel_values)
        # 将像素值输入到嵌入层中进行嵌入编码

        encoder_outputs = self.encoder(
            embedding_output,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # 使用编码器处理嵌入输出,可选地返回隐藏状态和字典格式输出

        # 应用池化操作
        last_hidden_state = encoder_outputs[0]
        # 取编码器输出的第一个元素作为最终的隐藏状态

        pooled_output = self.pooler(last_hidden_state)
        # Apply the pooling layer (2D average or max pooling over the spatial dimensions) to the final hidden state

        # 重新调整形状 (batch_size, projection_dim, 1 , 1) -> (batch_size, projection_dim)
        pooled_output = pooled_output.reshape(pooled_output.shape[:2])

        if not return_dict:
            # 如果未设置返回字典格式,则返回一个元组
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndNoAttention(
            # 如果设置了返回字典格式,则返回包含所有信息的特定输出对象
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
        )
# 定义 AlignModel 类,继承自 AlignPreTrainedModel
@add_start_docstrings(ALIGN_START_DOCSTRING)
class AlignModel(AlignPreTrainedModel):
    # 指定配置类为 AlignConfig
    config_class = AlignConfig

    # 初始化函数,接受一个 AlignConfig 类型的 config 对象作为参数
    def __init__(self, config: AlignConfig):
        # 调用父类的初始化方法
        super().__init__(config)

        # 检查 config.text_config 是否为 AlignTextConfig 类型,否则抛出 ValueError 异常
        if not isinstance(config.text_config, AlignTextConfig):
            raise ValueError(
                "config.text_config is expected to be of type AlignTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        # 检查 config.vision_config 是否为 AlignVisionConfig 类型,否则抛出 ValueError 异常
        if not isinstance(config.vision_config, AlignVisionConfig):
            raise ValueError(
                "config.vision_config is expected to be of type AlignVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        # 将 text_config 和 vision_config 存储在局部变量中
        text_config = config.text_config
        vision_config = config.vision_config

        # 存储配置中的 projection_dim 和 text_embed_dim 到当前对象的属性中
        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size

        # 创建 AlignTextModel 和 AlignVisionModel 的实例,分别使用 text_config 和 vision_config 作为参数
        self.text_model = AlignTextModel(text_config)
        self.vision_model = AlignVisionModel(vision_config)

        # 创建一个线性层,用于文本嵌入维度到投影维度的转换
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim)
        
        # 创建一个可学习的参数 temperature,并使用 config 中的 temperature_init_value 进行初始化
        self.temperature = nn.Parameter(torch.tensor(self.config.temperature_init_value))

        # 调用 post_init 方法,用于初始化权重和应用最终处理
        self.post_init()

    # 在模型前向传播时,添加文档字符串说明,描述输入参数的作用
    @add_start_docstrings_to_model_forward(ALIGN_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`AlignTextModel`].

        Examples:

        ```
        >>> from transformers import AutoTokenizer, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use ALIGN model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Pass input parameters to the text_model of ALIGN model and retrieve outputs
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Take the hidden state of the first token ([CLS]) from the text model's last hidden state
        last_hidden_state = text_outputs[0][:, 0, :]
        # Apply text projection layer to obtain text features
        text_features = self.text_projection(last_hidden_state)

        # Return the computed text features
        return text_features

    @add_start_docstrings_to_model_forward(ALIGN_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`AlignVisionModel`].

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use ALIGN model's config for some fields (if specified) instead of those of vision & text components.
        # 设置是否返回隐藏状态,默认为 ALIGN 模型配置中的设定
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 设置是否返回字典格式的输出,默认为 ALIGN 模型配置中的设定
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用视觉模型来获取视觉输出
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 从视觉输出中取出汇总输出作为图像特征
        image_features = vision_outputs[1]  # pooled_output

        # 返回图像特征
        return image_features

    @add_start_docstrings_to_model_forward(ALIGN_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=AlignOutput, config_class=AlignConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,

.\models\align\processing_align.py

"""
Image/Text processor class for ALIGN
"""

# Import the required processing utilities
from ...processing_utils import ProcessorMixin  # Mixin providing the shared processor plumbing
from ...tokenization_utils_base import BatchEncoding  # Container class for batched tokenizer outputs


class AlignProcessor(ProcessorMixin):
    r"""
    Constructs an ALIGN processor which wraps [`EfficientNetImageProcessor`] and
    [`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that inherits both the image processor and
    tokenizer functionalities. See the [`~AlignProcessor.__call__`] and [`~AlignProcessor.decode`] for more
    information.

    Args:
        image_processor ([`EfficientNetImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`BertTokenizer`, `BertTokenizerFast`]):
            The tokenizer is a required input.
    """

    # Class attributes consumed by ProcessorMixin
    attributes = ["image_processor", "tokenizer"]  # Names of the wrapped attributes
    image_processor_class = "EfficientNetImageProcessor"  # Image processor class name
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")  # Accepted tokenizer class names (slow and fast)

    def __init__(self, image_processor, tokenizer):
        """
        Initialize the AlignProcessor with image_processor and tokenizer.

        Args:
            image_processor: Instance of EfficientNetImageProcessor.
            tokenizer: Instance of BertTokenizer or BertTokenizerFast.
        """
        super().__init__(image_processor, tokenizer)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`].
        Please refer to the docstring of this method for more information.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            List[str]: Decoded texts.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`].
        Please refer to the docstring of this method for more information.

        Args:
            *args: Variable length argument list.
            **kwargs: Arbitrary keyword arguments.

        Returns:
            str: Decoded text.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """
        Property that combines model input names from both tokenizer and image processor.

        Returns:
            list: List of unique model input names.
        """
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
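
# --- Annotator's example (not part of the original file) ---------------------------------
# A short usage sketch: one AlignProcessor call drives both the wrapped EfficientNet image
# processor and the BERT tokenizer. Assumes the public "kakaobrain/align-base" checkpoint.
import requests
from PIL import Image
from transformers import AlignProcessor

processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)

batch = processor(text=["a photo of a cat"], images=image, padding=True, return_tensors="pt")

# model_input_names merges the tokenizer and image-processor input names, so the batch
# carries both the text tensors and pixel_values.
print(processor.model_input_names)
print({name: tensor.shape for name, tensor in batch.items()})
# ------------------------------------------------------------------------------------------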

.\models\align\__init__.py

# Copyright notice and license information
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Import the TYPE_CHECKING constant
from typing import TYPE_CHECKING

# Import the required exceptions and helpers from utils
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
)

# Define the module's import structure
_import_structure = {
    "configuration_align": [
        "ALIGN_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "AlignConfig",
        "AlignTextConfig",
        "AlignVisionConfig",
    ],
    "processing_align": ["AlignProcessor"],
}

# Check whether torch is available; raise OptionalDependencyNotAvailable if it is not
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If torch is available, register the model-related imports
    _import_structure["modeling_align"] = [
        "ALIGN_PRETRAINED_MODEL_ARCHIVE_LIST",
        "AlignModel",
        "AlignPreTrainedModel",
        "AlignTextModel",
        "AlignVisionModel",
    ]

# During static type checking, import the concrete configuration and model classes
if TYPE_CHECKING:
    from .configuration_align import (
        ALIGN_PRETRAINED_CONFIG_ARCHIVE_MAP,
        AlignConfig,
        AlignTextConfig,
        AlignVisionConfig,
    )
    from .processing_align import AlignProcessor

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_align import (
            ALIGN_PRETRAINED_MODEL_ARCHIVE_LIST,
            AlignModel,
            AlignPreTrainedModel,
            AlignTextModel,
            AlignVisionModel,
        )

# At runtime (outside type checking), set up lazy imports instead
else:
    import sys

    # Replace the current module with a _LazyModule so submodules are loaded lazily
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
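
# --- Annotator's example (not part of the original file) ---------------------------------
# What the _LazyModule registration buys us: importing the package is cheap, and the
# torch-dependent classes are only resolved on first attribute access (and only exist when
# torch is installed).
import importlib

align_pkg = importlib.import_module("transformers.models.align")

config_cls = align_pkg.AlignConfig                   # always registered
model_cls = getattr(align_pkg, "AlignModel", None)   # None if torch is not installed
print(config_cls.__name__, model_cls.__name__ if model_cls is not None else "torch unavailable")
# ------------------------------------------------------------------------------------------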

.\models\altclip\configuration_altclip.py

# coding=utf-8
# Copyright 2022 WenXiang ZhongzhiCheng LedellWu LiuGuang BoWenZhang and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
AltCLIP model configuration
"""
import os  # Standard library module for interacting with the operating system
from typing import Union  # Union type used in type hints

from ...configuration_utils import PretrainedConfig  # Base class for pretrained configurations
from ...utils import logging  # Logging utilities

logger = logging.get_logger(__name__)  # Logger instance for this module

# Map from AltCLIP pretrained model identifiers to their configuration files
ALTCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "BAAI/AltCLIP": "https://huggingface.co/BAAI/AltCLIP/resolve/main/config.json",
    # See all AltCLIP models at https://huggingface.co/models?filter=altclip
}


class AltCLIPTextConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`AltCLIPTextModel`]. It is used to instantiate a
    AltCLIP text model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the AltCLIP
    [BAAI/AltCLIP](https://huggingface.co/BAAI/AltCLIP) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Examples:

    ```
    >>> from transformers import AltCLIPTextModel, AltCLIPTextConfig

    >>> # Initializing a AltCLIPTextConfig with BAAI/AltCLIP style configuration
    >>> configuration = AltCLIPTextConfig()

    >>> # Initializing a AltCLIPTextModel (with random weights) from the BAAI/AltCLIP style configuration
    >>> model = AltCLIPTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "altclip_text_model"

    def __init__(
        self,
        vocab_size=250002,
        hidden_size=1024,
        num_hidden_layers=24,
        num_attention_heads=16,
        intermediate_size=4096,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=514,
        type_vocab_size=1,
        initializer_range=0.02,
        initializer_factor=0.02,
        layer_norm_eps=1e-05,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        position_embedding_type="absolute",
        use_cache=True,
        project_dim=768,
        **kwargs,
    ):
        # Call the parent constructor, forwarding the special token ids and any extra keyword arguments
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        # Store the text-model hyperparameters on the config instance
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.project_dim = project_dim
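
# --- Annotator's example (not part of the original file) ---------------------------------
# A small sketch: overriding a few of the defaults above and reading the stored attributes
# back from the config object.
from transformers import AltCLIPTextConfig

cfg = AltCLIPTextConfig(hidden_size=512, num_hidden_layers=6, project_dim=256)
print(cfg.hidden_size, cfg.num_hidden_layers, cfg.project_dim)  # 512 6 256
print(cfg.vocab_size)  # 250002 (default)
# ------------------------------------------------------------------------------------------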
# AltCLIPVisionConfig: inherits from PretrainedConfig and stores the configuration of the AltCLIP vision model
class AltCLIPVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`AltCLIPModel`]. It is used to instantiate an
    AltCLIP model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the AltCLIP
    [BAAI/AltCLIP](https://huggingface.co/BAAI/AltCLIP) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of the text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```
    >>> from transformers import AltCLIPVisionConfig, AltCLIPVisionModel

    >>> # Initializing a AltCLIPVisionConfig with BAAI/AltCLIP style configuration
    >>> configuration = AltCLIPVisionConfig()

    >>> # Initializing a AltCLIPVisionModel (with random weights) from the BAAI/AltCLIP style configuration
    >>> model = AltCLIPVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "altclip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="quick_gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        # Call the parent constructor
        super().__init__(**kwargs)

        # Store the vision-model hyperparameters on the config instance
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        # Propagate the auth token into kwargs
        cls._set_token_in_kwargs(kwargs)

        # Fetch the configuration dict of the pretrained model plus the remaining kwargs
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # If loading from a full AltCLIPConfig, keep only its vision sub-config
        if config_dict.get("model_type") == "altclip":
            config_dict = config_dict["vision_config"]

        # Warn if the stored model type does not match this configuration class
        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        # Build the configuration instance from the dict
        return cls.from_dict(config_dict, **kwargs)
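
# --- Annotator's example (not part of the original file) ---------------------------------
# Sketch of the branch above: loading the vision sub-config directly from a full AltCLIP
# checkpoint. Because the checkpoint's config has model_type == "altclip", only its
# "vision_config" dict is kept. Assumes network access to the BAAI/AltCLIP repository.
from transformers import AltCLIPVisionConfig

vision_cfg = AltCLIPVisionConfig.from_pretrained("BAAI/AltCLIP")
print(vision_cfg.model_type)                         # "altclip_vision_model"
print(vision_cfg.image_size, vision_cfg.patch_size)
# ------------------------------------------------------------------------------------------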
class AltCLIPConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`AltCLIPModel`]. It is used to instantiate an
    AltCLIP model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the AltCLIP
    [BAAI/AltCLIP](https://huggingface.co/BAAI/AltCLIP) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`AltCLIPTextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`AltCLIPVisionConfig`].
        projection_dim (`int`, *optional*, defaults to 768):
            Dimensionality of the text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter. The default is used as per the original CLIP implementation.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```
    >>> from transformers import AltCLIPConfig, AltCLIPModel

    >>> # Initializing a AltCLIPConfig with BAAI/AltCLIP style configuration
    >>> configuration = AltCLIPConfig()

    >>> # Initializing a AltCLIPModel (with random weights) from the BAAI/AltCLIP style configuration
    >>> model = AltCLIPModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a AltCLIPConfig from a AltCLIPTextConfig and a AltCLIPVisionConfig

    >>> # Initializing a AltCLIPText and AltCLIPVision configuration
    >>> config_text = AltCLIPTextConfig()
    >>> config_vision = AltCLIPVisionConfig()

    >>> config = AltCLIPConfig.from_text_vision_configs(config_text, config_vision)
    ```"""

    model_type = "altclip"

    def __init__(
        self, text_config=None, vision_config=None, projection_dim=768, logit_scale_init_value=2.6592, **kwargs
    ):
        # Initializer that stores the attributes of the AltCLIPConfig object
        super().__init__(**kwargs)
        # Dictionary holding the text sub-configuration
        self.text_config = text_config
        # Dictionary holding the vision sub-configuration
        self.vision_config = vision_config
        # Dimensionality of the projection layers (defaults to 768)
        self.projection_dim = projection_dim
        # Initial value of the logit_scale parameter (defaults to 2.6592)
        self.logit_scale_init_value = logit_scale_init_value

    @classmethod
    def from_text_vision_configs(cls, text_config: AltCLIPTextConfig, vision_config: AltCLIPVisionConfig, **kwargs):
        r"""
        Instantiate a [`AltCLIPConfig`] (or a derived class) from altclip text model configuration and altclip vision
        model configuration.

        Returns:
            [`AltCLIPConfig`]: An instance of a configuration object
        """
        # Instantiate an AltCLIPConfig from the given AltCLIPTextConfig and AltCLIPVisionConfig instances
        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)

.\models\altclip\modeling_altclip.py

# math for a few numeric operations
import math
# dataclass for declaring simple output containers without boilerplate
from dataclasses import dataclass
# Typing helpers for annotations
from typing import Any, List, Optional, Tuple, Union

# PyTorch imports
import torch
import torch.nn as nn
import torch.utils.checkpoint

# Mapping from activation-function names to implementations
from ...activations import ACT2FN
# Model output classes
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
    BaseModelOutputWithPoolingAndProjection,
)
# Base class for pretrained models
from ...modeling_utils import PreTrainedModel
# PyTorch utilities for chunked feed-forward passes and head pruning
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
# Miscellaneous helpers
from ...utils import ModelOutput, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
# AltCLIP configuration classes
from .configuration_altclip import AltCLIPConfig, AltCLIPTextConfig, AltCLIPVisionConfig

# Module logger
logger = logging.get_logger(__name__)

# Default checkpoint and configuration referenced in the docstrings
_CHECKPOINT_FOR_DOC = "BAAI/AltCLIP"
_CONFIG_FOR_DOC = "AltCLIPConfig"

# List of pretrained model archives
ALTCLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "BAAI/AltCLIP",
    # See all AltCLIP models at https://huggingface.co/models?filter=altclip
]

# Start docstring for AltCLIP models: notes the inheritance from PreTrainedModel / nn.Module and documents the config parameter
ALTCLIP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Docstring describing the text inputs accepted by AltCLIP models
ALTCLIP_TEXT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Docstring describing the vision (pixel) inputs accepted by AltCLIP models
ALTCLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Docstring describing the combined text and vision inputs accepted by AltCLIP models
ALTCLIP_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default.
            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# Contrastive loss function, adapted from
# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    # Cross-entropy against the diagonal: the target label for row i is i
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    # Caption-to-image contrastive loss
    caption_loss = contrastive_loss(similarity)
    # Image-to-caption contrastive loss on the transposed similarity matrix
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0
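
# --- Annotator's example (not part of the original file) ---------------------------------
# A tiny numeric check of the two helpers above: for a well-aligned batch the similarity
# matrix is largest on its diagonal, and clip_loss averages the caption->image and
# image->caption cross-entropy terms, so the loss is small.
demo_similarity = torch.tensor(
    [[4.0, 0.1, 0.2],
     [0.0, 3.5, 0.3],
     [0.2, 0.1, 5.0]]
)
print(clip_loss(demo_similarity))  # small (~0.04); shuffling the rows makes it grow
# ------------------------------------------------------------------------------------------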


@dataclass
# Copied from transformers.models.clip.modeling_clip.CLIPOutput, with CLIP renamed to AltCLIP
class AltCLIPOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`AltCLIPTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`AltCLIPVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`AltCLIPTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`AltCLIPVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        # Convert to a tuple; `text_model_output` and `vision_model_output` are converted via their own `to_tuple()`
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings, with Roberta renamed to AltRoberta
class AltRobertaEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
    def __init__(self, config):
        super().__init__()
        # Word embeddings: vocab_size x hidden_size, with the pad token as padding_idx
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        # Position embeddings: max_position_embeddings x hidden_size
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # Token type embeddings: type_vocab_size x hidden_size
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable names and to be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout applied to the embeddings
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Position embedding type, "absolute" by default
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        # Buffer holding the position ids 0 .. max_position_embeddings - 1
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        # Buffer of zero token type ids with the same shape as position_ids
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        # End copy
        # Padding index, taken from config.pad_token_id
        self.padding_idx = config.pad_token_id
        # Re-create the position embeddings with the padding index set
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    # Forward pass of the embedding module
    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids; padded tokens stay at padding_idx
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                # No input ids available, so derive sequential position ids from the input embeddings
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # Set token_type_ids to the registered zero buffer when not provided. This helps users trace the model
        # without passing token_type_ids explicitly (see issue #5664).
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                # Slice the registered buffer and expand it to the input shape
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                # No registered buffer: fall back to an all-zero tensor
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            # Look up word embeddings from the input ids
            inputs_embeds = self.word_embeddings(input_ids)
        # Look up token type embeddings
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        # Sum the input embeddings with the token type embeddings, and with the position embeddings when the
        # position embedding type is "absolute"
        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        # Normalize with LayerNorm
        embeddings = self.LayerNorm(embeddings)
        # Apply dropout regularization
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which tokens are padded, so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        # Sequential position ids starting at padding_idx + 1
        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)
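
# --- Annotator's example (not part of the original file) ---------------------------------
# Illustration of the position-id convention used above. The real
# create_position_ids_from_input_ids helper is defined elsewhere in modeling_altclip.py;
# _demo_position_ids below re-implements its documented behaviour (without
# past_key_values_length) purely for demonstration: padded positions keep padding_idx,
# real tokens get consecutive ids starting at padding_idx + 1.
import torch

def _demo_position_ids(input_ids: torch.Tensor, padding_idx: int) -> torch.Tensor:
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = torch.cumsum(mask, dim=1) * mask
    return incremental_indices.long() + padding_idx

print(_demo_position_ids(torch.tensor([[5, 6, 7, 1, 1]]), padding_idx=1))  # tensor([[2, 3, 4, 1, 1]])
# ------------------------------------------------------------------------------------------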
# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->AltRoberta
class AltRobertaSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # Raise an error if the hidden size is not a multiple of the number of attention heads
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        # Number of attention heads and the size of each head
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # Linear projections producing the queries, keys and values
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        # Dropout applied to the attention probabilities
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        # Position embedding type, "absolute" by default
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        # Relative position encodings need an additional distance embedding table
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        # Whether this module is used inside a decoder
        self.is_decoder = config.is_decoder

    # Reshape the projected tensor to (batch, num_heads, seq_len, head_size) for attention score computation
    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)
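
# --- Annotator's example (not part of the original file) ---------------------------------
# Shape sketch for transpose_for_scores: (batch, seq_len, all_head_size) is reshaped to
# (batch, num_heads, seq_len, head_size) so attention scores can be computed per head in a
# single batched matmul.
import torch

b, s, n_heads, d_head = 2, 7, 16, 64
x = torch.randn(b, s, n_heads * d_head)
x = x.view(b, s, n_heads, d_head).permute(0, 2, 1, 3)
print(x.shape)  # torch.Size([2, 16, 7, 64])
# ------------------------------------------------------------------------------------------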

    # Forward pass signature
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ):
        # Only the signature and its parameters are shown here; the forward body is omitted in this walkthrough
        pass

# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfOutput
class AltRobertaSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Dense projection layer
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # LayerNorm layer
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout layer
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # Forward pass: dense -> dropout -> residual add -> LayerNorm
    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # Linear projection
        hidden_states = self.dense(hidden_states)
        # Dropout
        hidden_states = self.dropout(hidden_states)
        # LayerNorm
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention, adapted as AltRobertaAttention
class AltRobertaAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        # Self-attention sub-module
        self.self = AltRobertaSelfAttention(config, position_embedding_type=position_embedding_type)
        # Output sub-module that post-processes the self-attention output
        self.output = AltRobertaSelfOutput(config)
        # Set of attention heads that have already been pruned
        self.pruned_heads = set()

    # Prune the given attention heads
    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        # Find the prunable heads and their indices
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune the linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update the hyperparameters and remember the pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    # Forward pass
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        # Run self-attention
        self_outputs = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        # Post-process the self-attention output through the output sub-module
        attention_output = self.output(self_outputs[0], hidden_states)
        # Keep the attention weights in the outputs if they were requested
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


# Copied from transformers.models.roberta.modeling_roberta.RobertaIntermediate, adapted as AltRobertaIntermediate
class AltRobertaIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Dense layer mapping the hidden states to the intermediate size
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # Intermediate activation function, selected from the config
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    # Forward pass
    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Linear projection
        hidden_states = self.dense(hidden_states)
        # Apply the intermediate activation function
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


# Copied from transformers.models.roberta.modeling_roberta.RobertaOutput, adapted as AltRobertaOutput
class AltRobertaOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Dense layer mapping the intermediate size back to the hidden size
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        # LayerNorm over the hidden states, using the config's epsilon
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # Dropout with the config's hidden dropout probability
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # Forward pass: dense -> dropout -> residual add -> LayerNorm
    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # Linear projection of the hidden states
        hidden_states = self.dense(hidden_states)
        # Apply dropout
        hidden_states = self.dropout(hidden_states)
        # Residual connection with the input tensor followed by LayerNorm
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        # Return the processed hidden states
        return hidden_states
# Copied from transformers.models.roberta.modeling_roberta.RobertaLayer, with Roberta renamed to AltRoberta
class AltRobertaLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Chunk size used for the feed-forward pass
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # The sequence-length dimension is dimension 1
        self.seq_len_dim = 1
        # Self-attention block
        self.attention = AltRobertaAttention(config)
        # Whether this layer is used as a decoder
        self.is_decoder = config.is_decoder
        # Whether cross-attention is added
        self.add_cross_attention = config.add_cross_attention
        # Cross-attention requires the layer to be used as a decoder
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            # Cross-attention block with absolute position embeddings
            self.crossattention = AltRobertaAttention(config, position_embedding_type="absolute")
        # Intermediate (feed-forward) block
        self.intermediate = AltRobertaIntermediate(config)
        # Output block
        self.output = AltRobertaOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False,
        ) -> Tuple[torch.Tensor]:
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # Perform self-attention on the input hidden_states using the attention module
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past_key_value,
        )
        # Retrieve the attention output from self_attention_outputs
        attention_output = self_attention_outputs[0]

        # if decoder, the last output is tuple of self-attn cache
        if self.is_decoder:
            # Exclude the first and last elements from self_attention_outputs
            outputs = self_attention_outputs[1:-1]
            # Retrieve the present key/value tuple from self_attention_outputs
            present_key_value = self_attention_outputs[-1]
        else:
            # Include all elements except the first one from self_attention_outputs
            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        cross_attn_present_key_value = None
        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
                    " by setting `config.add_cross_attention=True`"
                )

            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            # Perform cross-attention using crossattention module
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past_key_value,
                output_attentions,
            )
            # Retrieve the attention output from cross_attention_outputs
            attention_output = cross_attention_outputs[0]
            # Combine outputs with cross attentions from cross_attention_outputs
            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights

            # add cross-attn cache to positions 3,4 of present_key_value tuple
            cross_attn_present_key_value = cross_attention_outputs[-1]
            present_key_value = present_key_value + cross_attn_present_key_value

        # Apply chunking to the forward pass of feed_forward_chunk method
        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        # Include layer_output in the outputs tuple
        outputs = (layer_output,) + outputs

        # if decoder, return the attn key/values as the last output
        if self.is_decoder:
            # Append present_key_value to outputs tuple
            outputs = outputs + (present_key_value,)

        return outputs

    def feed_forward_chunk(self, attention_output):
        # Pass attention_output through intermediate and output layers
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
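
# --- Annotator's example (not part of the original file) ---------------------------------
# Quick check of apply_chunking_to_forward as used in feed_forward_chunk above: the input is
# split along the sequence dimension, each chunk goes through the forward fn, and the results
# are concatenated, which matches the un-chunked call (up to float error) for a position-wise MLP.
import torch
from transformers.pytorch_utils import apply_chunking_to_forward

dense = torch.nn.Linear(8, 8)

def position_wise_ffn(x):
    return dense(x)

inputs = torch.randn(2, 6, 8)
full = position_wise_ffn(inputs)
chunked = apply_chunking_to_forward(position_wise_ffn, 2, 1, inputs)  # chunk_size=2, seq dim=1
print(torch.allclose(full, chunked, atol=1e-6))  # True
# ------------------------------------------------------------------------------------------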
# Copied from transformers.models.roberta.modeling_roberta.RobertaEncoder, with Roberta renamed to AltRoberta
class AltRobertaEncoder(nn.Module):
    # Initializer: stores the config and builds the list of layers
    def __init__(self, config):
        super().__init__()
        # Keep a reference to the model config
        self.config = config
        # Stack of AltRobertaLayer modules; the depth comes from the config
        self.layer = nn.ModuleList([AltRobertaLayer(config) for _ in range(config.num_hidden_layers)])
        # Gradient checkpointing flag, off by default
        self.gradient_checkpointing = False

    # Forward pass, taking several inputs and control flags
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    # Closing of the signature; the annotation specifies the return type
    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
        # Collect hidden states only if requested
        all_hidden_states = () if output_hidden_states else None
        # Collect self-attention weights only if requested
        all_self_attentions = () if output_attentions else None
        # Collect cross-attention weights only if requested and the config enables cross-attention
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        # Gradient checkpointing is incompatible with use_cache during training: warn and disable the cache
        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # Next decoder cache: an empty tuple when use_cache is enabled, otherwise None
        next_decoder_cache = () if use_cache else None
        # Iterate over the decoder layers
        for i, layer_module in enumerate(self.layer):
            # Append the current hidden states to the collected hidden states if requested
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # Head mask for the current layer, if any
            layer_head_mask = head_mask[i] if head_mask is not None else None
            # Past key/value pair for the current layer, if any
            past_key_value = past_key_values[i] if past_key_values is not None else None

            # With gradient checkpointing during training, run the layer through the checkpointing helper
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )
            else:
                # Otherwise call the layer's forward directly
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            # The new hidden states are the first element of the layer output
            hidden_states = layer_outputs[0]
            # When caching, append the layer's present key/value (last element) to the decoder cache
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            # If attention outputs are requested, append the layer's self-attention weights
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                # ... and its cross-attention weights when the config enables cross-attention
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # Append the final hidden states if hidden states are being collected
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # If a plain tuple is requested, return the non-None elements
        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        # Otherwise wrap everything in a BaseModelOutputWithPastAndCrossAttentions
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
# Copied from transformers.models.roberta.modeling_roberta.RobertaPooler
class AltRobertaPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Dense layer with hidden_size inputs and outputs
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        # Tanh activation
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Take the hidden state of the first token as the pooled output
        first_token_tensor = hidden_states[:, 0]
        # Pass it through the dense layer
        pooled_output = self.dense(first_token_tensor)
        # Apply the tanh activation
        pooled_output = self.activation(pooled_output)
        # Return the pooled output
        return pooled_output


# Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->AltCLIP
class AltCLIPAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: "
                f"{self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        # Linear layers projecting to queries (Q), keys (K) and values (V)
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        # Output projection layer
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        # Reshape the input tensor for multi-head attention score computation
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
        # ...
    ):
        # The remainder of the forward method, which implements the multi-head attention computation, is omitted here


# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->AltCLIP
class AltCLIPMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        # Activation function
        self.activation_fn = ACT2FN[config.hidden_act]
        # First dense layer: hidden_size -> intermediate_size
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        # Second dense layer: intermediate_size -> hidden_size
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # First dense layer
        hidden_states = self.fc1(hidden_states)
        # Apply the activation function
        hidden_states = self.activation_fn(hidden_states)
        # Second dense layer
        hidden_states = self.fc2(hidden_states)
        # Return the final hidden states
        return hidden_states


# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->AltCLIP
class AltCLIPEncoderLayer(nn.Module):
    # This class defines one AltCLIP encoder layer: self-attention plus a feed-forward MLP,
    # each wrapped with pre-LayerNorm and a residual connection; the methods follow below.
    pass
    # Initializer taking an AltCLIPConfig object
    def __init__(self, config: AltCLIPConfig):
        # Call the parent initializer
        super().__init__()
        # Use the hidden size as the embedding dimension of the self-attention module
        self.embed_dim = config.hidden_size
        # Self-attention layer
        self.self_attn = AltCLIPAttention(config)
        # First LayerNorm, applied before self-attention
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        # MLP block
        self.mlp = AltCLIPMLP(config)
        # Second LayerNorm, applied before the MLP
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    # Forward pass: takes several tensors as input and returns a tuple of tensors
    def forward(
        self,
        hidden_states: torch.Tensor,  # input hidden states of shape (batch, seq_len, embed_dim)
        attention_mask: torch.Tensor,  # attention mask of shape (batch, 1, tgt_len, src_len)
        causal_attention_mask: torch.Tensor,  # causal attention mask with the same shape
        output_attentions: Optional[bool] = False,  # whether to return the attention tensors of all layers
    ) -> Tuple[torch.FloatTensor]:  # returns a tuple containing float tensors
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
        """
        # Keep the original hidden states for the residual connection
        residual = hidden_states

        # First LayerNorm
        hidden_states = self.layer_norm1(hidden_states)
        # Self-attention, also returning the attention weights
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        # Add the residual back to the hidden states
        hidden_states = residual + hidden_states

        # Keep the updated hidden states for the second residual connection
        residual = hidden_states
        # Second LayerNorm
        hidden_states = self.layer_norm2(hidden_states)
        # Non-linear transformation through the MLP
        hidden_states = self.mlp(hidden_states)
        # Add the residual back to the hidden states
        hidden_states = residual + hidden_states

        # Prepare the outputs, containing only the updated hidden states for now
        outputs = (hidden_states,)

        # Append the attention weights if they were requested
        if output_attentions:
            outputs += (attn_weights,)

        # Return the output tuple
        return outputs
# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->AltCLIP
class AltCLIPEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`AltCLIPEncoderLayer`].

    Args:
        config: AltCLIPConfig
    """

    def __init__(self, config: AltCLIPConfig):
        super().__init__()
        self.config = config
        # List of config.num_hidden_layers AltCLIPEncoderLayer modules
        self.layers = nn.ModuleList([AltCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # Forward pass of AltCLIPEncoder, taking the embedded inputs, attention masks and output flags
        pass  # The concrete forward logic is omitted in this walkthrough

# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->AltCLIP
class AltCLIPVisionEmbeddings(nn.Module):
    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Learnable class embedding parameter
        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        # Patch embedding as a convolution over the input channels
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        # Number of patches and positions in the image, plus the position embedding table
        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        # Embed the pixel values into patches, then flatten and transpose
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        # Expand the class embedding, concatenate it with the patch embeddings, and add position embeddings
        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings
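
A quick shape walk-through of the embedding step above, with toy sizes (the real AltCLIP vision tower uses a much larger hidden size, so the numbers below are only illustrative); the arithmetic mirrors the module: a strided convolution turns the image into a grid of patch tokens, a [CLS] token is prepended, and position embeddings are added.

import torch
from torch import nn

image_size, patch_size, embed_dim, batch = 224, 14, 64, 2  # toy embed_dim
patch_embedding = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size, bias=False)
class_embedding = nn.Parameter(torch.randn(embed_dim))

pixel_values = torch.randn(batch, 3, image_size, image_size)
patches = patch_embedding(pixel_values)            # (2, 64, 16, 16): a 16x16 grid of patches
patches = patches.flatten(2).transpose(1, 2)       # (2, 256, 64): one token per patch
cls = class_embedding.expand(batch, 1, -1)         # (2, 1, 64)
tokens = torch.cat([cls, patches], dim=1)          # (2, 257, 64): 256 patches + 1 [CLS]

num_positions = (image_size // patch_size) ** 2 + 1
position_embedding = nn.Embedding(num_positions, embed_dim)
tokens = tokens + position_embedding(torch.arange(num_positions))
print(tokens.shape)  # torch.Size([2, 257, 64])
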


class AltCLIPPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 指定使用的配置类和模型前缀
    config_class = AltCLIPConfig
    base_model_prefix = "altclip"
    supports_gradient_checkpointing = True
    def _init_weights(self, module):
        """Initialize the weights"""
        # 从配置中获取初始化因子
        factor = self.config.initializer_factor
        
        # 如果模块是 AltCLIPVisionEmbeddings 类型
        if isinstance(module, AltCLIPVisionEmbeddings):
            # 重新设置初始化因子
            factor = self.config.initializer_factor
            
            # Initialize class_embedding with mean 0 and std = embed_dim**-0.5 * factor
            # (the inverse square root of embed_dim times the factor)
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            
            # 初始化 patch_embedding 的权重,标准差为 initializer_range 乘以 factor
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            
            # 初始化 position_embedding 的权重,标准差为 initializer_range 乘以 factor
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        
        # 如果模块是 AltCLIPAttention 类型
        elif isinstance(module, AltCLIPAttention):
            # 重新设置初始化因子
            factor = self.config.initializer_factor
            
            # 初始化 q_proj 的权重,标准差为 in_proj_std
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            
            # 初始化 k_proj 的权重,标准差为 in_proj_std
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            
            # 初始化 v_proj 的权重,标准差为 in_proj_std
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            
            # 初始化 out_proj 的权重,标准差为 out_proj_std
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        
        # 如果模块是 AltCLIPMLP 类型
        elif isinstance(module, AltCLIPMLP):
            # 重新设置初始化因子
            factor = self.config.initializer_factor
            
            # 初始化 fc1 的权重,标准差为 fc_std
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            
            # 初始化 fc2 的权重,标准差为 in_proj_std
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        
        # 如果模块是 AltCLIPModel 类型
        elif isinstance(module, AltCLIPModel):
            # Initialize the text_projection weights with std = text_embed_dim**-0.5 * factor
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            # 将 _is_hf_initialized 设置为 True,表示已经初始化
            module.text_projection._is_hf_initialized = True
            
            # Initialize the visual_projection weights with std = vision_embed_dim**-0.5 * factor
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )
            # 将 _is_hf_initialized 设置为 True,表示已经初始化
            module.visual_projection._is_hf_initialized = True
        
        # 如果模块是 nn.LayerNorm 类型
        elif isinstance(module, nn.LayerNorm):
            # 将偏置数据设为零
            module.bias.data.zero_()
            
            # 将权重数据填充为 1.0
            module.weight.data.fill_(1.0)
        
        # 如果模块是 nn.Linear 类型
        elif isinstance(module, nn.Linear):
            # 初始化权重数据,均值为 0,标准差为初始化因子
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor)
            
            # 如果存在偏置数据,将其设为零
            if module.bias is not None:
                module.bias.data.zero_()
        
        # 如果模块是 nn.Embedding 类型
        elif isinstance(module, nn.Embedding):
            # 初始化权重数据,均值为 0,标准差为初始化因子
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor)
            
            # 如果有 padding_idx,将对应索引的权重数据设为零
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
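
For intuition, the scaled standard deviations used above can be computed by hand. The snippet below plugs in small made-up numbers (not the real AltCLIP config values) to show how the three formulas from _init_weights relate to each other.

# Toy numbers standing in for a real config (hypothetical values, not AltCLIP defaults).
embed_dim, hidden_size, num_hidden_layers, factor = 64, 64, 4, 1.0

in_proj_std = (embed_dim ** -0.5) * ((2 * num_hidden_layers) ** -0.5) * factor   # q/k/v projections
out_proj_std = (embed_dim ** -0.5) * factor                                      # attention output
fc_std = (2 * hidden_size) ** -0.5 * factor                                      # first MLP layer

print(f"in_proj_std={in_proj_std:.4f}, out_proj_std={out_proj_std:.4f}, fc_std={fc_std:.4f}")
# in_proj_std=0.0442, out_proj_std=0.1250, fc_std=0.0884
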
# 从transformers.models.clip.modeling_clip.CLIPVisionTransformer复制而来的AltCLIPVisionTransformer类定义,
# 重命名了一些类和常量,以适应当前环境
class AltCLIPVisionTransformer(nn.Module):
    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        # 使用配置初始化视觉嵌入层对象
        self.embeddings = AltCLIPVisionEmbeddings(config)
        # 应用 LayerNorm 到嵌入层输出,使用配置中的 epsilon 值
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        # 初始化 CLIPEncoder 对象,处理视觉输入
        self.encoder = AltCLIPEncoder(config)
        # 应用 LayerNorm 到编码器输出,使用配置中的 epsilon 值
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AltCLIPVisionConfig)
    # 定义前向传播函数,接受像素值和其他参数,并返回模型输出
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        # 如果未提供像素值,则引发错误
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # 将像素值转换为嵌入表示
        hidden_states = self.embeddings(pixel_values)
        # 应用预层归一化到嵌入表示
        hidden_states = self.pre_layrnorm(hidden_states)

        # 调用编码器处理嵌入表示,传递其他参数和返回类型
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Take the last hidden state from the encoder outputs
        last_hidden_state = encoder_outputs[0]
        # Pooled output: the hidden state of the first ([CLS]) token
        pooled_output = last_hidden_state[:, 0, :]
        # 应用后层归一化到池化输出
        pooled_output = self.post_layernorm(pooled_output)

        # 如果不要求返回字典,则返回一个元组
        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        # 否则,返回一个包含池化输出和编码器状态的 BaseModelOutputWithPooling 对象
        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class AltCLIPVisionModel(AltCLIPPreTrainedModel):
    # 设置配置类为 AltCLIPVisionConfig
    config_class = AltCLIPVisionConfig
    # 主要输入名称为 "pixel_values"
    main_input_name = "pixel_values"

    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__(config)
        # 初始化 AltCLIPVisionTransformer 对象作为视觉模型
        self.vision_model = AltCLIPVisionTransformer(config)
        # 初始化权重并应用最终处理
        self.post_init()
    # 返回当前模型的视觉嵌入层模块
    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    # 将此方法的返回值的文档字符串添加到模型的前向传播方法中
    @add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING)
    # 替换返回的文档字符串的输出类型,并使用指定的配置类
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AltCLIPVisionConfig)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        返回模型的前向传播结果。

        返回:
            返回一个包含模型输出的元组或BaseModelOutputWithPooling对象。

        示例:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPVisionModel

        >>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        # 如果return_dict为None,则使用配置中的默认值
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用视觉模型的前向传播方法,传递输入参数并返回结果
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
class AltRobertaModel(AltCLIPPreTrainedModel):
    """

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762

    """

    config_class = AltCLIPTextConfig

    # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->AltRoberta
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        # Initialize the embeddings module for the model based on provided configuration
        self.embeddings = AltRobertaEmbeddings(config)
        
        # Initialize the encoder module for the model based on provided configuration
        self.encoder = AltRobertaEncoder(config)

        # Initialize the pooling layer if `add_pooling_layer` is set to `True`
        self.pooler = AltRobertaPooler(config) if add_pooling_layer else None

        # Initialize model weights and perform final setup
        self.post_init()

    # Retrieve the input word embeddings from the model
    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    # Set the input word embeddings for the model
    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    # Prune heads of the model based on the provided `heads_to_prune` dictionary
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    # Copied from transformers.models.bert.modeling_bert.BertModel.forward
    # Forward pass for the model with detailed argument descriptions
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # The forward body (copied from transformers.models.bert.modeling_bert.BertModel.forward)
        # is elided in this excerpt.


# (class statement elided in the excerpt) The methods below belong to AltCLIPTextModel, which
# wraps AltRobertaModel and adds a projection head on top of it.
class AltCLIPTextModel(AltCLIPPreTrainedModel):
    config_class = AltCLIPTextConfig

    # Constructor: take the config and call the parent class's __init__
    def __init__(self, config):
        super().__init__(config)
        # 使用 AltRobertaModel 创建 self.roberta 对象,不添加池化层
        self.roberta = AltRobertaModel(config, add_pooling_layer=False)
        # 创建一个线性变换层,从 config.hidden_size 到 config.project_dim
        self.transformation = nn.Linear(config.hidden_size, config.project_dim)
        # 创建一个 LayerNorm 层,对隐藏状态进行归一化,使用 config.layer_norm_eps 作为 epsilon
        self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 执行初始化后处理
        self.post_init()

    # 获取输入嵌入的方法,返回 self.roberta.embeddings.word_embeddings 模块
    def get_input_embeddings(self) -> nn.Module:
        return self.roberta.embeddings.word_embeddings

    # 设置输入嵌入的方法,将 value 赋值给 self.roberta.embeddings.word_embeddings
    def set_input_embeddings(self, value: nn.Embedding) -> None:
        self.roberta.embeddings.word_embeddings = value

    # 调整 token 嵌入的大小的方法,调用父类的 resize_token_embeddings 方法
    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding:
        return super().resize_token_embeddings(new_num_tokens)

    # 前向传播方法,接受多个输入参数并按照特定的顺序返回结果
    @add_start_docstrings_to_model_forward(ALTCLIP_TEXT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndProjection, config_class=AltCLIPTextConfig)
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPoolingAndProjection]:
        # Use the default from the config if return_dict is not specified
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 使用给定的输入参数调用 RoBERTa 模型进行前向传播
        outputs = self.roberta(
            input_ids=input_ids,                   # 输入的词语 ID
            attention_mask=attention_mask,         # 注意力遮罩,指示哪些元素应该被忽略
            token_type_ids=token_type_ids,         # 用于区分句子 A 和句子 B 的标识
            position_ids=position_ids,             # 位置 ID,指示每个词语的位置
            head_mask=head_mask,                   # 头部遮罩,用于屏蔽特定的注意力头部
            inputs_embeds=inputs_embeds,           # 输入的嵌入表示
            encoder_hidden_states=encoder_hidden_states,     # 编码器的隐藏状态
            encoder_attention_mask=encoder_attention_mask,   # 编码器的注意力遮罩
            output_attentions=output_attentions,   # 是否输出注意力权重
            output_hidden_states=output_hidden_states,       # 是否输出所有隐藏状态
            return_dict=return_dict,               # 是否返回字典格式的输出
        )

        # 获取模型的序列输出(通常是最后一层的隐藏状态)
        sequence_output = outputs[0]

        # 应用预层标准化(LayerNorm)处理序列输出
        sequence_output = self.pre_LN(sequence_output)

        # 应用变换层对处理后的序列输出进行投影
        projection_state = self.transformation(sequence_output)

        # 提取池化输出,通常是投影后的第一个位置
        pooler_output = projection_state[:, 0]

        # 如果不需要返回字典,则返回投影状态、池化输出以及其他输出状态和注意力权重
        if not return_dict:
            return (projection_state, pooler_output) + outputs[2:4]

        # 返回包含池化输出和投影状态的字典格式的输出
        return BaseModelOutputWithPoolingAndProjection(
            last_hidden_state=projection_state,    # 最后的隐藏状态,通常是投影状态
            pooler_output=pooler_output,           # 池化输出,通常是投影后的第一个位置
            hidden_states=outputs.hidden_states,   # 所有隐藏状态的列表
            attentions=outputs.attentions,         # 所有注意力权重的列表
        )
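
The projection path of this text model (LayerNorm on the RoBERTa sequence output, a linear projection to project_dim, then first-token pooling) can be reproduced with plain tensors. The sketch below uses toy sizes, not the real config values, and is purely illustrative.

import torch
from torch import nn

hidden_size, project_dim, batch, seq_len = 16, 8, 2, 5  # toy sizes

sequence_output = torch.randn(batch, seq_len, hidden_size)   # what self.roberta returns
pre_LN = nn.LayerNorm(hidden_size)
transformation = nn.Linear(hidden_size, project_dim)

projection_state = transformation(pre_LN(sequence_output))   # (2, 5, 8)
pooler_output = projection_state[:, 0]                       # first-token ("<s>") pooling -> (2, 8)
print(projection_state.shape, pooler_output.shape)
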
class AltCLIPModel(AltCLIPPreTrainedModel):
    # Use AltCLIPConfig as the configuration class
    config_class = AltCLIPConfig

    def __init__(self, config: AltCLIPConfig):
        # Call the parent constructor to initialize the model
        super().__init__(config)

        # Check that config.vision_config is an AltCLIPVisionConfig
        if not isinstance(config.vision_config, AltCLIPVisionConfig):
            # Raise a ValueError if config.vision_config has the wrong type
            raise ValueError(
                "config.vision_config is expected to be of type AltCLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )
        # Check that config.text_config is an AltCLIPTextConfig
        if not isinstance(config.text_config, AltCLIPTextConfig):
            # Raise a ValueError if config.text_config has the wrong type
            raise ValueError(
                "config.text_config is expected to be of type AltCLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        # Grab the text_config and vision_config objects
        text_config = config.text_config
        vision_config = config.vision_config

        # Projection dimension, text embedding dimension and vision embedding dimension
        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.project_dim
        self.vision_embed_dim = vision_config.hidden_size

        # Initialize the text model and the vision model
        self.text_model = AltCLIPTextModel(text_config)
        self.vision_model = AltCLIPVisionTransformer(vision_config)

        # Visual and text projection layers, without bias
        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        # Logit scale parameter
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Run post-initialization
        self.post_init()

    @add_start_docstrings_to_model_forward(ALTCLIP_TEXT_INPUTS_DOCSTRING)
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        token_type_ids=None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`AltCLIPTextModel`].

        Examples:

        ```
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # Use `self.config.output_hidden_states` if `output_hidden_states` is not provided.
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # Use `self.config.use_return_dict` if `return_dict` is not provided.
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Pass input arguments to the `text_model` of the AltCLIP model and retrieve text outputs.
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Extract the pooled output from `text_outputs`.
        pooled_output = text_outputs[1]
        # Project the pooled output to obtain text features.
        text_features = self.text_projection(pooled_output)

        # Return the computed text features.
        return text_features

    @add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING)
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`AltCLIPVisionModel`].

        Examples:

        ```
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")
        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # 使用 AltCLIP 模型的配置来设置一些字段(如果指定了),而不是使用视觉和文本组件的配置。
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用视觉模型,传入像素值、注意力输出、隐藏状态输出和返回字典等参数
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 从视觉模型的输出中获取池化后的特征向量
        pooled_output = vision_outputs[1]  # pooled_output
        # 将池化后的特征向量投影到特征空间
        image_features = self.visual_projection(pooled_output)

        # 返回图像特征向量
        return image_features

    @add_start_docstrings_to_model_forward(ALTCLIP_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=AltCLIPOutput, config_class=AltCLIPConfig)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, AltCLIPOutput]:
        # The forward body is elided in this excerpt.


# Create position ids from input ids; copied from
# transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    用输入标识替换非填充符号的位置编号。位置编号从padding_idx + 1开始,填充符号将被忽略。这是根据fairseq的`utils.make_positions`修改的。

    Args:
        input_ids: 输入的标识序列,torch.Tensor类型
        padding_idx: 填充标识的索引,用来确定哪些是填充符号
        past_key_values_length: 过去键值对的长度,用于计算增量索引,默认为0

    Returns:
        incremental_indices: torch.Tensor,包含输入标识的位置标识
    """

    # 创建一个掩码,标记不是填充符号的位置为1,其余为0
    mask = input_ids.ne(padding_idx).int()
    
    # 计算累积和,再加上过去键值对的长度,乘以掩码确保只在非填充符号位置生效
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    
    # 最后将计算出的位置索引转换为长整型,并加上填充索引,以得到最终的位置标识
    return incremental_indices.long() + padding_idx
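
A worked example of this position-id arithmetic, assuming a RoBERTa-style padding_idx of 1:

import torch

padding_idx = 1
input_ids = torch.tensor([[0, 50, 60, 2, 1, 1]])   # <s> w1 w2 </s> <pad> <pad>

mask = input_ids.ne(padding_idx).int()                                   # [[1, 1, 1, 1, 0, 0]]
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + 0) * mask
print(incremental_indices.long() + padding_idx)                          # tensor([[2, 3, 4, 5, 1, 1]])

Non-padding tokens get positions 2, 3, 4, 5 while the padding positions stay at padding_idx itself.
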

.\models\altclip\processing_altclip.py

# coding=utf-8
# Copyright 2022 WenXiang ZhongzhiCheng LedellWu LiuGuang BoWenZhang The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Image/Text processor class for AltCLIP
"""
import warnings

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding

class AltCLIPProcessor(ProcessorMixin):
    r"""
    Constructs a AltCLIP processor which wraps a CLIP image processor and a XLM-Roberta tokenizer into a single
    processor.

    [`AltCLIPProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`XLMRobertaTokenizerFast`]. See
    the [`~AltCLIPProcessor.__call__`] and [`~AltCLIPProcessor.decode`] for more information.

    Args:
        image_processor ([`CLIPImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`XLMRobertaTokenizerFast`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        # Deprecated feature_extractor handling
        feature_extractor = None
        if "feature_extractor" in kwargs:
            warnings.warn(
                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
                " instead.",
                FutureWarning,
            )
            feature_extractor = kwargs.pop("feature_extractor")

        # Set image_processor to feature_extractor if image_processor is None
        image_processor = image_processor if image_processor is not None else feature_extractor
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")

        # Initialize with image_processor and tokenizer
        super().__init__(image_processor, tokenizer)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to XLMRobertaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`].
        Please refer to the docstring of this method for more information.
        """
        # Delegate batch decoding to tokenizer
        return self.tokenizer.batch_decode(*args, **kwargs)
    # 将所有参数转发给 XLMRobertaTokenizerFast 的 `~PreTrainedTokenizer.decode` 方法
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to XLMRobertaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please
        refer to the docstring of this method for more information.
        """
        # 调用 tokenizer 对象的 decode 方法,将参数传递给它,并返回结果
        return self.tokenizer.decode(*args, **kwargs)

    # 返回模型输入的名称列表,合并并去重 tokenizer 和 image_processor 的输入名称
    @property
    def model_input_names(self):
        # 获取 tokenizer 的模型输入名称列表
        tokenizer_input_names = self.tokenizer.model_input_names
        # 获取 image_processor 的模型输入名称列表
        image_processor_input_names = self.image_processor.model_input_names
        # 合并两个列表,使用字典去重后转换为列表,并返回结果
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
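
The dict.fromkeys trick above merges the two name lists while removing duplicates and preserving order; a tiny standalone example with made-up input names:

tokenizer_input_names = ["input_ids", "attention_mask"]
image_processor_input_names = ["pixel_values", "attention_mask"]  # overlapping name, for illustration only

merged = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
print(merged)  # ['input_ids', 'attention_mask', 'pixel_values']
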

.\models\altclip\__init__.py

# 版权声明和许可证信息,指明版权归 The HuggingFace Team 所有,使用 Apache License, Version 2.0 许可
#
# 引入必要的模块和函数
from typing import TYPE_CHECKING
# 从 ...utils 中导入相关模块,处理依赖未安装的情况
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available

# 定义模块的导入结构,包括配置、处理和模型的列表
_import_structure = {
    "configuration_altclip": [
        "ALTCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "AltCLIPConfig",
        "AltCLIPTextConfig",
        "AltCLIPVisionConfig",
    ],
    "processing_altclip": ["AltCLIPProcessor"],
}

# 检查是否安装了 torch,若未安装则引发 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果安装了 torch,则将 modeling_altclip 模块添加到导入结构中
    _import_structure["modeling_altclip"] = [
        "ALTCLIP_PRETRAINED_MODEL_ARCHIVE_LIST",
        "AltCLIPPreTrainedModel",
        "AltCLIPModel",
        "AltCLIPTextModel",
        "AltCLIPVisionModel",
    ]

# 如果是类型检查阶段,则从相应模块中导入具体的类和常量
if TYPE_CHECKING:
    from .configuration_altclip import (
        ALTCLIP_PRETRAINED_CONFIG_ARCHIVE_MAP,
        AltCLIPConfig,
        AltCLIPTextConfig,
        AltCLIPVisionConfig,
    )
    from .processing_altclip import AltCLIPProcessor

    # 再次检查是否安装了 torch,若未安装则忽略
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果安装了 torch,则从 modeling_altclip 模块导入相关类和常量
        from .modeling_altclip import (
            ALTCLIP_PRETRAINED_MODEL_ARCHIVE_LIST,
            AltCLIPModel,
            AltCLIPPreTrainedModel,
            AltCLIPTextModel,
            AltCLIPVisionModel,
        )

# 如果不是类型检查阶段,则定义一个 LazyModule 并将其设置为当前模块的代理
else:
    import sys

    # 将当前模块的 sys.modules 设置为 LazyModule 对象,用于延迟加载
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
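
The lazy-loading idea behind _LazyModule can be approximated with a PEP 562 module-level __getattr__. The sketch below is a simplified stand-in for a hypothetical package `lazy_pkg`, not the actual transformers implementation: submodules are only imported when one of their attributes is first requested.

# lazy_pkg/__init__.py  (hypothetical package, illustrating the idea only)
import importlib
from typing import TYPE_CHECKING

_import_structure = {"heavy_module": ["HeavyClass"]}

if TYPE_CHECKING:
    from .heavy_module import HeavyClass  # noqa: F401  (static type checkers see the real import)
else:
    def __getattr__(name):
        # Import the submodule only when an attribute is first requested (PEP 562).
        for module_name, attrs in _import_structure.items():
            if name in attrs:
                module = importlib.import_module(f".{module_name}", __name__)
                return getattr(module, name)
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
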

.\models\audio_spectrogram_transformer\configuration_audio_spectrogram_transformer.py

# coding=utf-8
# Copyright 2022 Google AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Audio Spectrogram Transformer (AST) model configuration
"""

# 从相应的库中导入预训练配置类
from ...configuration_utils import PretrainedConfig
# 导入日志记录工具
from ...utils import logging

# 获取 logger 对象
logger = logging.get_logger(__name__)

# 定义预训练模型配置文件的映射字典,将模型名称映射到配置文件的下载链接
AUDIO_SPECTROGRAM_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "MIT/ast-finetuned-audioset-10-10-0.4593": (
        "https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593/resolve/main/config.json"
    ),
}


class ASTConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ASTModel`]. It is used to instantiate an AST
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the AST
    [MIT/ast-finetuned-audioset-10-10-0.4593](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    # 设置模型类型为音频频谱变换器
    model_type = "audio-spectrogram-transformer"
    # 初始化函数,用于初始化 Transformer 模型的参数
    def __init__(
        self,
        hidden_size=768,  # 设置隐藏层的大小,默认为768
        num_hidden_layers=12,  # Transformer 模型中的隐藏层数,默认为12
        num_attention_heads=12,  # 每个注意力头的数量,默认为12
        intermediate_size=3072,  # Transformer 中间层的大小,默认为3072
        hidden_act="gelu",  # 隐藏层激活函数的选择,默认为 GELU
        hidden_dropout_prob=0.0,  # 隐藏层的 dropout 概率,默认为0.0,即不进行 dropout
        attention_probs_dropout_prob=0.0,  # 注意力层的 dropout 概率,默认为0.0,即不进行 dropout
        initializer_range=0.02,  # 参数初始化范围,默认为0.02
        layer_norm_eps=1e-12,  # Layer normalization 的 epsilon,默认为 1e-12
        patch_size=16,  # 图像块的大小,默认为16
        qkv_bias=True,  # 是否在 QKV 层中使用偏置,默认为 True
        frequency_stride=10,  # 频率维度的步长,默认为10
        time_stride=10,  # 时间维度的步长,默认为10
        max_length=1024,  # 最大序列长度,默认为1024
        num_mel_bins=128,  # Mel 频谱的频道数,默认为128
        **kwargs,
    ):
        # 调用父类的初始化方法
        super().__init__(**kwargs)

        # 将参数赋值给对象的属性
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.patch_size = patch_size
        self.qkv_bias = qkv_bias
        self.frequency_stride = frequency_stride
        self.time_stride = time_stride
        self.max_length = max_length
        self.num_mel_bins = num_mel_bins

.\models\audio_spectrogram_transformer\convert_audio_spectrogram_transformer_original_to_pytorch.py

# 设置文件编码为 UTF-8

# 版权声明,声明代码归 HuggingFace Inc. 团队所有,采用 Apache License 2.0 版本进行许可
# 除非符合许可,否则不得使用该文件
# 可在以下链接获取许可协议内容:http://www.apache.org/licenses/LICENSE-2.0

"""从原始仓库转换音频频谱变换器检查点。URL: https://github.com/YuanGongND/ast"""

# 导入必要的库和模块
import argparse  # 导入命令行参数解析模块
import json  # 导入处理 JSON 格式数据的模块
from pathlib import Path  # 导入处理路径操作的模块

import torch  # 导入 PyTorch 深度学习库
import torchaudio  # 导入处理音频数据的 PyTorch 扩展模块
from datasets import load_dataset  # 导入加载数据集的函数
from huggingface_hub import hf_hub_download  # 导入下载 Hugging Face Hub 模型的函数

from transformers import ASTConfig, ASTFeatureExtractor, ASTForAudioClassification  # 导入音频频谱变换器相关的类
from transformers.utils import logging  # 导入日志记录工具

# 设置日志记录的详细程度为信息级别
logging.set_verbosity_info()

# 获取当前模块的日志记录器对象
logger = logging.get_logger(__name__)


def get_audio_spectrogram_transformer_config(model_name):
    # 创建一个音频频谱变换器配置对象
    config = ASTConfig()

    # 根据模型名称设置不同的配置参数
    if "10-10" in model_name:
        pass  # 如果模型名称包含 "10-10",则不修改配置
    elif "speech-commands" in model_name:
        config.max_length = 128  # 如果模型名称包含 "speech-commands",设置最大长度为 128
    elif "12-12" in model_name:
        config.time_stride = 12  # 如果模型名称包含 "12-12",设置时间步长为 12
        config.frequency_stride = 12  # 设置频率步长为 12
    elif "14-14" in model_name:
        config.time_stride = 14  # 如果模型名称包含 "14-14",设置时间步长为 14
        config.frequency_stride = 14  # 设置频率步长为 14
    elif "16-16" in model_name:
        config.time_stride = 16  # 如果模型名称包含 "16-16",设置时间步长为 16
        config.frequency_stride = 16  # 设置频率步长为 16
    else:
        raise ValueError("Model not supported")  # 如果模型名称不在支持列表中,抛出数值错误异常

    # 设置仓库 ID 用于下载标签文件
    repo_id = "huggingface/label-files"

    # 根据模型名称进一步设置配置对象的属性
    if "speech-commands" in model_name:
        config.num_labels = 35  # 如果模型名称包含 "speech-commands",设置标签数量为 35
        filename = "speech-commands-v2-id2label.json"  # 设置要下载的标签文件名
    else:
        config.num_labels = 527  # 否则,设置标签数量为 527
        filename = "audioset-id2label.json"  # 设置要下载的标签文件名

    # 使用 Hugging Face Hub 下载指定仓库 ID 和文件名的 JSON 文件,并加载为 Python 字典
    id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
    # 将加载的标签字典中的键转换为整数类型,值保持不变
    id2label = {int(k): v for k, v in id2label.items()}
    config.id2label = id2label  # 将转换后的标签字典赋值给配置对象的 id2label 属性
    config.label2id = {v: k for k, v in id2label.items()}  # 创建标签到 ID 的反向映射字典

    return config  # 返回配置对象


def rename_key(name):
    # 根据特定规则重命名输入的键名字符串,并返回重命名后的结果

    if "module.v" in name:
        name = name.replace("module.v", "audio_spectrogram_transformer")
    if "cls_token" in name:
        name = name.replace("cls_token", "embeddings.cls_token")
    if "dist_token" in name:
        name = name.replace("dist_token", "embeddings.distillation_token")
    if "pos_embed" in name:
        name = name.replace("pos_embed", "embeddings.position_embeddings")
    if "patch_embed.proj" in name:
        name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
    # 替换 transformer blocks 相关的键名
    if "blocks" in name:
        name = name.replace("blocks", "encoder.layer")
    if "attn.proj" in name:
        name = name.replace("attn.proj", "attention.output.dense")
    if "attn" in name:
        name = name.replace("attn", "attention.self")

    # 如果变量 name 中包含字符串 "norm1",则将其替换为 "layernorm_before"
    if "norm1" in name:
        name = name.replace("norm1", "layernorm_before")
    # 如果变量 name 中包含字符串 "norm2",则将其替换为 "layernorm_after"
    if "norm2" in name:
        name = name.replace("norm2", "layernorm_after")
    # 如果变量 name 中包含字符串 "mlp.fc1",则将其替换为 "intermediate.dense"
    if "mlp.fc1" in name:
        name = name.replace("mlp.fc1", "intermediate.dense")
    # 如果变量 name 中包含字符串 "mlp.fc2",则将其替换为 "output.dense"
    if "mlp.fc2" in name:
        name = name.replace("mlp.fc2", "output.dense")
    # 如果变量 name 中包含字符串 "audio_spectrogram_transformer.norm",则将其替换为 "audio_spectrogram_transformer.layernorm"
    # 这一步是为了兼容不同命名规范下的模型参数
    if "audio_spectrogram_transformer.norm" in name:
        name = name.replace("audio_spectrogram_transformer.norm", "audio_spectrogram_transformer.layernorm")
    # 如果变量 name 中包含字符串 "module.mlp_head.0",则将其替换为 "classifier.layernorm"
    # 这一步是为了重命名分类器头部的层归一化层
    if "module.mlp_head.0" in name:
        name = name.replace("module.mlp_head.0", "classifier.layernorm")
    # 如果变量 name 中包含字符串 "module.mlp_head.1",则将其替换为 "classifier.dense"
    # 这一步是为了重命名分类器头部的全连接层
    if "module.mlp_head.1" in name:
        name = name.replace("module.mlp_head.1", "classifier.dense")

    # 返回经过处理的最终变量 name
    return name
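
A few hypothetical checkpoint keys run through rename_key, to show how the substitutions chain together:

print(rename_key("module.v.blocks.0.attn.proj.weight"))
# audio_spectrogram_transformer.encoder.layer.0.attention.output.dense.weight
print(rename_key("module.v.blocks.0.mlp.fc1.bias"))
# audio_spectrogram_transformer.encoder.layer.0.intermediate.dense.bias
print(rename_key("module.mlp_head.1.weight"))
# classifier.dense.weight
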
# 将原始状态字典转换为新配置的状态字典
def convert_state_dict(orig_state_dict, config):
    # 遍历原始状态字典的拷贝中的每个键
    for key in orig_state_dict.copy().keys():
        # 弹出当前键对应的值
        val = orig_state_dict.pop(key)

        # 如果键名包含 "qkv"
        if "qkv" in key:
            # 根据 "." 分割键名
            key_split = key.split(".")
            # 获取层号,这里假设层号在第4个位置
            layer_num = int(key_split[3])
            # 获取隐藏层大小
            dim = config.hidden_size
            # 如果键名包含 "weight"
            if "weight" in key:
                # 更新状态字典中的 query、key、value 的权重参数
                orig_state_dict[
                    f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.weight"
                ] = val[:dim, :]
                orig_state_dict[
                    f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.weight"
                ] = val[dim : dim * 2, :]
                orig_state_dict[
                    f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.weight"
                ] = val[-dim:, :]
            else:
                # 更新状态字典中的 query、key、value 的偏置参数
                orig_state_dict[
                    f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.bias"
                ] = val[:dim]
                orig_state_dict[
                    f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.bias"
                ] = val[dim : dim * 2]
                orig_state_dict[
                    f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.bias"
                ] = val[-dim:]
        else:
            # 如果键名不包含 "qkv",则重命名键并保留其对应的值
            orig_state_dict[rename_key(key)] = val

    # 返回更新后的原始状态字典
    return orig_state_dict
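
The qkv handling above slices a fused weight of shape (3 * hidden_size, hidden_size) into three equal blocks for the query, key and value projections; a tiny standalone illustration with made-up numbers:

import torch

dim = 4  # hypothetical hidden size
qkv_weight = torch.arange(3 * dim * dim, dtype=torch.float32).reshape(3 * dim, dim)

query_w = qkv_weight[:dim, :]          # first dim rows  -> query projection
key_w = qkv_weight[dim : dim * 2, :]   # middle dim rows -> key projection
value_w = qkv_weight[-dim:, :]         # last dim rows   -> value projection
print(query_w.shape, key_w.shape, value_w.shape)  # three (4, 4) matrices
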


# 从状态字典中移除指定的键
def remove_keys(state_dict):
    # 需要忽略的键列表
    ignore_keys = [
        "module.v.head.weight",
        "module.v.head.bias",
        "module.v.head_dist.weight",
        "module.v.head_dist.bias",
    ]
    # 遍历忽略键列表,从状态字典中移除对应的键
    for k in ignore_keys:
        state_dict.pop(k, None)


# 在没有梯度更新的情况下执行函数
@torch.no_grad()
def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False):
    """
    将模型的权重复制/粘贴/调整到我们的音频频谱变换器结构中。
    """
    # 获取音频频谱变换器的配置
    config = get_audio_spectrogram_transformer_config(model_name)
    # 模型名称到预训练模型权重文件下载链接的映射字典
    model_name_to_url = {
        "ast-finetuned-audioset-10-10-0.4593": (
            "https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1"
        ),
        "ast-finetuned-audioset-10-10-0.450": (
            "https://www.dropbox.com/s/1tv0hovue1bxupk/audioset_10_10_0.4495.pth?dl=1"
        ),
        "ast-finetuned-audioset-10-10-0.448": (
            "https://www.dropbox.com/s/6u5sikl4b9wo4u5/audioset_10_10_0.4483.pth?dl=1"
        ),
        "ast-finetuned-audioset-10-10-0.448-v2": (
            "https://www.dropbox.com/s/kt6i0v9fvfm1mbq/audioset_10_10_0.4475.pth?dl=1"
        ),
        "ast-finetuned-audioset-12-12-0.447": (
            "https://www.dropbox.com/s/snfhx3tizr4nuc8/audioset_12_12_0.4467.pth?dl=1"
        ),
        "ast-finetuned-audioset-14-14-0.443": (
            "https://www.dropbox.com/s/z18s6pemtnxm4k7/audioset_14_14_0.4431.pth?dl=1"
        ),
        "ast-finetuned-audioset-16-16-0.442": (
            "https://www.dropbox.com/s/mdsa4t1xmcimia6/audioset_16_16_0.4422.pth?dl=1"
        ),
        "ast-finetuned-speech-commands-v2": (
            "https://www.dropbox.com/s/q0tbqpwv44pquwy/speechcommands_10_10_0.9812.pth?dl=1"
        ),
    }

    # 加载原始状态字典
    checkpoint_url = model_name_to_url[model_name]
    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")
    # 移除部分键
    remove_keys(state_dict)
    # 重命名部分键
    new_state_dict = convert_state_dict(state_dict, config)

    # 加载 🤗 模型
    model = ASTForAudioClassification(config)
    model.eval()

    model.load_state_dict(new_state_dict)

    # 在虚拟输入上验证输出
    # 来源:https://github.com/YuanGongND/ast/blob/79e873b8a54d0a3b330dd522584ff2b9926cd581/src/run.py#L62
    mean = -4.2677393 if "speech-commands" not in model_name else -6.845978
    std = 4.5689974 if "speech-commands" not in model_name else 5.5654526
    max_length = 1024 if "speech-commands" not in model_name else 128
    feature_extractor = ASTFeatureExtractor(mean=mean, std=std, max_length=max_length)

    if "speech-commands" in model_name:
        # 加载 "speech-commands" 数据集的验证集
        dataset = load_dataset("speech_commands", "v0.02", split="validation")
        waveform = dataset[0]["audio"]["array"]
    else:
        # 下载样本音频文件
        filepath = hf_hub_download(
            repo_id="nielsr/audio-spectogram-transformer-checkpoint",
            filename="sample_audio.flac",
            repo_type="dataset",
        )

        # 加载音频文件并转换为 NumPy 数组
        waveform, _ = torchaudio.load(filepath)
        waveform = waveform.squeeze().numpy()

    # 使用特征提取器处理波形数据
    inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")

    # 前向传播
    outputs = model(**inputs)
    logits = outputs.logits

    if model_name == "ast-finetuned-audioset-10-10-0.4593":
        expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602])
    elif model_name == "ast-finetuned-audioset-10-10-0.450":
        expected_slice = torch.tensor([-1.1986, -7.0903, -8.2718])
    # 根据模型名称设置预期的输出向量片段
    elif model_name == "ast-finetuned-audioset-10-10-0.448":
        expected_slice = torch.tensor([-2.6128, -8.0080, -9.4344])
    elif model_name == "ast-finetuned-audioset-10-10-0.448-v2":
        expected_slice = torch.tensor([-1.5080, -7.4534, -8.8917])
    elif model_name == "ast-finetuned-audioset-12-12-0.447":
        expected_slice = torch.tensor([-0.5050, -6.5833, -8.0843])
    elif model_name == "ast-finetuned-audioset-14-14-0.443":
        expected_slice = torch.tensor([-0.3826, -7.0336, -8.2413])
    elif model_name == "ast-finetuned-audioset-16-16-0.442":
        expected_slice = torch.tensor([-1.2113, -6.9101, -8.3470])
    elif model_name == "ast-finetuned-speech-commands-v2":
        expected_slice = torch.tensor([6.1589, -8.0566, -8.7984])
    else:
        # 如果模型名称未知,则引发值错误异常
        raise ValueError("Unknown model name")
    
    # 检查模型输出的前三个元素是否与预期的向量片段非常接近,如果不是,则引发值错误异常
    if not torch.allclose(logits[0, :3], expected_slice, atol=1e-4):
        raise ValueError("Logits don't match")
    
    # 打印提示信息,表示检查通过
    print("Looks ok!")

    # 如果指定了 PyTorch 模型保存路径,则执行以下操作
    if pytorch_dump_folder_path is not None:
        # 确保指定路径存在,如果不存在则创建
        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
        # 打印模型保存的信息,包括模型名称和保存路径
        print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
        # 将模型保存到指定路径
        model.save_pretrained(pytorch_dump_folder_path)
        # 打印特征提取器保存的信息,包括保存路径
        print(f"Saving feature extractor to {pytorch_dump_folder_path}")
        # 将特征提取器保存到指定路径
        feature_extractor.save_pretrained(pytorch_dump_folder_path)

    # 如果需要推送到 Hub
    if push_to_hub:
        # 打印推送模型和特征提取器到 Hub 的提示信息
        print("Pushing model and feature extractor to the hub...")
        # 将模型推送到指定 Hub 路径
        model.push_to_hub(f"MIT/{model_name}")
        # 将特征提取器推送到指定 Hub 路径
        feature_extractor.push_to_hub(f"MIT/{model_name}")
if __name__ == "__main__":
    # 如果当前脚本作为主程序运行,则执行以下代码块

    parser = argparse.ArgumentParser()
    # 创建参数解析器对象

    # 必选参数
    parser.add_argument(
        "--model_name",
        default="ast-finetuned-audioset-10-10-0.4593",
        type=str,
        help="Name of the Audio Spectrogram Transformer model you'd like to convert."
    )
    # 添加参数:模型名称,指定默认值和帮助信息

    parser.add_argument(
        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
    )
    # 添加参数:PyTorch 模型输出目录的路径,支持默认值和帮助信息

    parser.add_argument(
        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
    )
    # 添加参数:是否将转换后的模型推送到 🤗 hub,采用布尔型标志

    args = parser.parse_args()
    # 解析命令行参数并存储到 args 对象中

    convert_audio_spectrogram_transformer_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
    # 调用函数 convert_audio_spectrogram_transformer_checkpoint,传递解析得到的参数