.\models\big_bird\__init__.py
# TYPE_CHECKING lets heavy imports be deferred to static type checking only
from typing import TYPE_CHECKING
# Import the dependency-availability helpers and the lazy-module utility
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_sentencepiece_available,
is_tf_available,
is_tokenizers_available,
is_torch_available,
)
# Define the import structure: a mapping from submodule name to the names it exports
_import_structure = {
"configuration_big_bird": ["BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP", "BigBirdConfig", "BigBirdOnnxConfig"],
}
# Check whether sentencepiece is available; raise OptionalDependencyNotAvailable if not
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# sentencepiece is available: register the tokenization_big_bird module
_import_structure["tokenization_big_bird"] = ["BigBirdTokenizer"]
# Check whether the tokenizers library is available; raise if not
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# tokenizers is available: register the fast tokenizer module
_import_structure["tokenization_big_bird_fast"] = ["BigBirdTokenizerFast"]
# Check whether torch is available; raise if not
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# torch is available: register the PyTorch modeling module
_import_structure["modeling_big_bird"] = [
"BIG_BIRD_PRETRAINED_MODEL_ARCHIVE_LIST",
"BigBirdForCausalLM",
"BigBirdForMaskedLM",
"BigBirdForMultipleChoice",
"BigBirdForPreTraining",
"BigBirdForQuestionAnswering",
"BigBirdForSequenceClassification",
"BigBirdForTokenClassification",
"BigBirdLayer",
"BigBirdModel",
"BigBirdPreTrainedModel",
"load_tf_weights_in_big_bird",
]
# Check whether flax is available; raise if not
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# flax is available: register the Flax modeling module
_import_structure["modeling_flax_big_bird"] = [
"FlaxBigBirdForCausalLM",
"FlaxBigBirdForMaskedLM",
"FlaxBigBirdForMultipleChoice",
"FlaxBigBirdForPreTraining",
"FlaxBigBirdForQuestionAnswering",
"FlaxBigBirdForSequenceClassification",
"FlaxBigBirdForTokenClassification",
"FlaxBigBirdModel",
"FlaxBigBirdPreTrainedModel",
]
# Under type checking, import the concrete objects so static analyzers can see them
if TYPE_CHECKING:
# Import the configuration classes and the archive map
from .configuration_big_bird import BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP, BigBirdConfig, BigBirdOnnxConfig
# Re-check sentencepiece availability; skip the tokenizer import if unavailable
try:
if not is_sentencepiece_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# sentencepiece is available: import BigBirdTokenizer
from .tokenization_big_bird import BigBirdTokenizer
# Check whether the tokenizers library is installed; raise OptionalDependencyNotAvailable if not
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
# The tokenizers library is unavailable: skip the fast tokenizer import
except OptionalDependencyNotAvailable:
pass
else:
# tokenizers is available: import the fast tokenizer
from .tokenization_big_bird_fast import BigBirdTokenizerFast
# Check whether torch is installed; raise OptionalDependencyNotAvailable if not
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
# torch is unavailable: skip the PyTorch model imports
except OptionalDependencyNotAvailable:
pass
else:
# torch is available: import the PyTorch BigBird classes listed below
from .modeling_big_bird import (
BIG_BIRD_PRETRAINED_MODEL_ARCHIVE_LIST,
BigBirdForCausalLM,
BigBirdForMaskedLM,
BigBirdForMultipleChoice,
BigBirdForPreTraining,
BigBirdForQuestionAnswering,
BigBirdForSequenceClassification,
BigBirdForTokenClassification,
BigBirdLayer,
BigBirdModel,
BigBirdPreTrainedModel,
load_tf_weights_in_big_bird,
)
# Check whether flax is installed; raise OptionalDependencyNotAvailable if not
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
# flax is unavailable: skip the Flax model imports
except OptionalDependencyNotAvailable:
pass
else:
# flax is available: import the Flax BigBird classes listed below
from .modeling_flax_big_bird import (
FlaxBigBirdForCausalLM,
FlaxBigBirdForMaskedLM,
FlaxBigBirdForMultipleChoice,
FlaxBigBirdForPreTraining,
FlaxBigBirdForQuestionAnswering,
FlaxBigBirdForSequenceClassification,
FlaxBigBirdForTokenClassification,
FlaxBigBirdModel,
FlaxBigBirdPreTrainedModel,
)
# Otherwise (normal runtime), register the module for lazy loading
else:
# Import sys to patch sys.modules
import sys
# Replace this module in sys.modules with a _LazyModule instance
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
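# Illustrative note (not part of the original file): with _LazyModule registered, a
# submodule such as modeling_big_bird is imported only when one of its exported names
# is first accessed, e.g.:
#
#   from transformers.models.big_bird import BigBirdConfig  # light import
#   from transformers.models.big_bird import BigBirdModel   # triggers the torch-backed import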
.\models\biogpt\configuration_biogpt.py
# coding=utf-8
# Copyright 2022 The HuggingFace Team and Microsoft Research AI4Science All rights reserved.
# Import the base configuration class and logging utilities
from ...configuration_utils import PretrainedConfig
from ...utils import logging
# Module-level logger
logger = logging.get_logger(__name__)
# Map from pretrained BioGPT model names to their config file URLs
BIOGPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/biogpt": "https://huggingface.co/microsoft/biogpt/resolve/main/config.json",
# See all BioGPT models at https://huggingface.co/models?filter=biogpt
}
# BioGptConfig: the configuration class, inheriting from PretrainedConfig
class BioGptConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`BioGptModel`]. It is used to instantiate an
BioGPT model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the BioGPT
[microsoft/biogpt](https://huggingface.co/microsoft/biogpt) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
# 定义了 BioGPT 模型的配置类,包含了模型的各种参数设置
class BioGptConfig:
# 构造函数,初始化配置参数
def __init__(
self,
# Vocabulary size (default 42384): the number of distinct tokens the model can represent
vocab_size=42384,
# Hidden size (default 1024): dimensionality of the encoder layers and the pooler layer
hidden_size=1024,
# Number of hidden layers in the Transformer encoder (default 24)
num_hidden_layers=24,
# Number of attention heads per attention layer (default 16)
num_attention_heads=16,
# Dimensionality of the "intermediate" (feed-forward) layer (default 4096)
intermediate_size=4096,
# Non-linear activation function in the encoder and pooler (default "gelu")
hidden_act="gelu",
# Dropout probability for all fully connected layers in the embeddings, encoder, and pooler (default 0.1)
hidden_dropout_prob=0.1,
# Dropout ratio for the attention probabilities (default 0.1)
attention_probs_dropout_prob=0.1,
# Maximum sequence length the model may be used with; usually set large (e.g. 512, 1024 or 2048)
max_position_embeddings=1024,
# Standard deviation of the truncated-normal initializer for all weight matrices (default 0.02)
initializer_range=0.02,
# Epsilon used by the layer normalization layers (default 1e-12)
layer_norm_eps=1e-12,
# Whether to scale embeddings by dividing by sqrt(d_model) (default True)
scale_embedding=True,
# Whether the model should return the last key/value attentions (default True); only relevant when config.is_decoder=True
use_cache=True,
# LayerDrop probability; see https://arxiv.org/abs/1909.11556 for details (default 0.0)
layerdrop=0.0,
# Dropout ratio for activations inside the fully connected layers (default 0.0)
activation_dropout=0.0,
# Id of the padding token (default 1)
pad_token_id=1,
# Id of the beginning-of-stream token (default 0)
bos_token_id=0,
# Id of the end-of-stream token (default 2)
eos_token_id=2,
**kwargs,
):
# Assign each argument to the corresponding instance attribute
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.intermediate_size = intermediate_size
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.scale_embedding = scale_embedding
self.use_cache = use_cache
self.layerdrop = layerdrop
self.activation_dropout = activation_dropout
# The pad/bos/eos token ids are handled by the base class
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
# Example usage (this is the class docstring example; BioGptModel lives in modeling_biogpt):
#   configuration = BioGptConfig()        # default BioGPT configuration
#   model = BioGptModel(configuration)    # model initialized from the configuration
#   configuration = model.config          # access the configuration through the model
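# A minimal sketch of customizing the defaults (any keyword from __init__ above works):
custom_configuration = BioGptConfig(vocab_size=50000, hidden_size=768, num_hidden_layers=12)
assert custom_configuration.hidden_size == 768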
.\models\biogpt\convert_biogpt_original_pytorch_checkpoint_to_pytorch.py
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Standard-library imports
import argparse  # command-line argument parsing
import json  # JSON serialization
import os  # filesystem helpers
import re  # regular expressions
import shutil  # file copying
# Import PyTorch
import torch
# Import the BioGPT configuration and model classes from transformers
from transformers import BioGptConfig, BioGptForCausalLM
# Import the vocabulary file-name constants from the BioGPT tokenizer module
from transformers.models.biogpt.tokenization_biogpt import VOCAB_FILES_NAMES
# Import the TOKENIZER_CONFIG_FILE constant
from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
# Import WEIGHTS_NAME and the logging utilities
from transformers.utils import WEIGHTS_NAME, logging
# Only log warnings and above
logging.set_verbosity_warning()
# Indentation used when writing JSON files
json_indent = 2
# modified from https://github.com/facebookresearch/fairseq/blob/dd74992d0d143155998e9ed4076826bcea80fb06/fairseq/data/dictionary.py#L18
# Adapted from fairseq: a class mapping symbols to consecutive integers
class Dictionary:
"""A mapping from symbols to consecutive integers"""
def __init__(
self,
*,  # begin keyword-only arguments
bos="<s>",  # beginning-of-sentence symbol
pad="<pad>",  # padding symbol
eos="</s>",  # end-of-sentence symbol
unk="<unk>",  # unknown symbol
extra_special_symbols=None,  # optional list of extra special symbols
):
# Store the special symbols
self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos
# Initialize the symbol list, per-symbol counts, and symbol-to-index mapping
self.symbols = []
self.count = []
self.indices = {}
# Add the bos/pad/eos/unk symbols and record their indices
self.bos_index = self.add_symbol(bos)
self.pad_index = self.add_symbol(pad)
self.eos_index = self.add_symbol(eos)
self.unk_index = self.add_symbol(unk)
# Add any extra special symbols
if extra_special_symbols:
for s in extra_special_symbols:
self.add_symbol(s)
# Number of special symbols
self.nspecial = len(self.symbols)
def __eq__(self, other):
# Two dictionaries are equal when their symbol-to-index mappings match
return self.indices == other.indices
def __getitem__(self, idx):
# Return the symbol at idx, or the unknown symbol if idx is out of range
if idx < len(self.symbols):
return self.symbols[idx]
return self.unk_word
def __len__(self):
"""Returns the number of symbols in the dictionary"""
# 返回字典中符号的数量
return len(self.symbols)
def __contains__(self, sym):
# Check whether the symbol is present in the index mapping
return sym in self.indices
@classmethod
def load(cls, f):
"""Loads the dictionary from a text file with the format:
```
<symbol0> <count0>
<symbol1> <count1>
...
```
"""
d = cls()
d.add_from_file(f)
return d
# Add a word to the dictionary; an existing word gets its count increased unless overwrite is set
def add_symbol(self, word, n=1, overwrite=False):
"""Adds a word to the dictionary"""
# Existing word without overwrite: just bump its count
if word in self.indices and not overwrite:
idx = self.indices[word]
self.count[idx] = self.count[idx] + n
return idx
else:
# New word (or overwrite): append it with its initial count
idx = len(self.symbols)
self.indices[word] = idx
self.symbols.append(word)
self.count.append(n)
return idx
# Load metadata; a no-op here (fairseq subclasses override this)
def _load_meta(self, lines):
return 0
def add_from_file(self, f):
"""
Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
"""
# If f is a path string, open the file and recurse with the file handle
if isinstance(f, str):
try:
with open(f, "r", encoding="utf-8") as fd:
self.add_from_file(fd)
except FileNotFoundError as fnfe:
raise fnfe
except UnicodeError:
raise Exception("Incorrect encoding detected in {}, please rebuild the dataset".format(f))
return
# Read all lines from the file handle
lines = f.readlines()
# Determine where the dictionary entries start (after any metadata)
indices_start_line = self._load_meta(lines)
# Parse each entry line
for line in lines[indices_start_line:]:
try:
# Split off the trailing count field
line, field = line.rstrip().rsplit(" ", 1)
# An optional trailing "#fairseq:overwrite" flag allows overwriting earlier entries
if field == "#fairseq:overwrite":
overwrite = True
line, field = line.rsplit(" ", 1)
else:
overwrite = False
count = int(field)
word = line
# Duplicate words without the overwrite flag are an error
if word in self and not overwrite:
raise RuntimeError(
"Duplicate word found when loading Dictionary: '{}'. "
"Duplicate words can overwrite earlier ones by adding the "
"#fairseq:overwrite flag at the end of the corresponding row "
"in the dictionary file. If using the Camembert model, please "
"download an updated copy of the model file.".format(word)
)
# Add the word with its count
self.add_symbol(word, n=count, overwrite=overwrite)
except ValueError:
raise ValueError("Incorrect dictionary format, expected '<token> <cnt> [flags]'")
def convert_biogpt_checkpoint_to_pytorch(biogpt_checkpoint_path, pytorch_dump_folder_path):
# Validate that the source checkpoint directory exists
if not os.path.exists(biogpt_checkpoint_path):
raise ValueError(f"path {biogpt_checkpoint_path} does not exist!")
# Create the output folder if it does not already exist
os.makedirs(pytorch_dump_folder_path, exist_ok=True)
print(f"Writing results to {pytorch_dump_folder_path}")
# Locate the fairseq checkpoint file
checkpoint_file = os.path.join(biogpt_checkpoint_path, "checkpoint.pt")
# Raise if the checkpoint file is missing
if not os.path.isfile(checkpoint_file):
raise ValueError(f"path to the file {checkpoint_file} does not exist!")
# Load the fairseq checkpoint on CPU
chkpt = torch.load(checkpoint_file, map_location="cpu")
# Extract the model args from the checkpoint config
args = chkpt["cfg"]["model"]
# Locate the fairseq dictionary file
dict_file = os.path.join(biogpt_checkpoint_path, "dict.txt")
# Raise if the dictionary file is missing
if not os.path.isfile(dict_file):
raise ValueError(f"path to the file {dict_file} does not exist!")
# Load the dictionary and rewrite its keys into HF vocabulary format
src_dict = Dictionary.load(dict_file)
src_vocab = rewrite_dict_keys(src_dict.indices)
src_vocab_size = len(src_vocab)
# Build the target vocab file path and report it
src_vocab_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["vocab_file"])
print(f"Generating {src_vocab_file} of {src_vocab_size} records")
# Write the rewritten vocabulary as JSON
with open(src_vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent))
# Locate the BPE codes file
bpecodes_file = os.path.join(biogpt_checkpoint_path, "bpecodes")
# Raise if the BPE codes file is missing
if not os.path.isfile(bpecodes_file):
raise ValueError(f"path to the file {bpecodes_file} does not exist!")
# Copy the BPE codes file to the target merges file
merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"])
shutil.copyfile(bpecodes_file, merges_file)
# Build the path of the BioGPT model config file
biogpt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json")
# Model configuration dictionary mapping fairseq args to HF config keys
model_conf = {
"activation_dropout": args["activation_dropout"],  # dropout inside the feed-forward activations
"architectures": ["BioGptForCausalLM"],  # model architecture
"attention_probs_dropout_prob": args["attention_dropout"],  # attention dropout
"bos_token_id": 0,  # beginning-of-stream token id
"eos_token_id": 2,  # end-of-stream token id
"hidden_act": args["activation_fn"],  # hidden-layer activation function
"hidden_dropout_prob": args["dropout"],  # hidden-layer dropout
"hidden_size": args["decoder_embed_dim"],  # hidden size
"initializer_range": 0.02,  # weight initializer range
"intermediate_size": args["decoder_ffn_embed_dim"],  # feed-forward layer size
"layer_norm_eps": 1e-12,  # layer-norm epsilon
"layerdrop": args["decoder_layerdrop"],  # LayerDrop probability
"max_position_embeddings": args["max_target_positions"],  # maximum position embeddings
"model_type": "biogpt",  # model type
"num_attention_heads": args["decoder_attention_heads"],  # number of attention heads
"num_hidden_layers": args["decoder_layers"],  # number of hidden layers
"pad_token_id": 1,  # padding token id
"scale_embedding": not args["no_scale_embedding"],  # whether to scale embeddings
"tie_word_embeddings": args["share_decoder_input_output_embed"],  # tie input/output embeddings
"vocab_size": src_vocab_size,  # vocabulary size
}
# Report that the model config file is being generated
print(f"Generating {biogpt_model_config_file}")
# Write the model config as JSON
with open(biogpt_model_config_file, "w", encoding="utf-8") as f:
f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent))
# Tokenizer configuration
biogpt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE)
tokenizer_conf = {
"bos_token": "<s>",  # beginning-of-sequence token
"eos_token": "</s>",  # end-of-sequence token
"model_max_length": 1024,  # maximum model input length
"pad_token": "<pad>",  # padding token
"special_tokens_map_file": None,  # special tokens map file
"tokenizer_class": "BioGptTokenizer",  # tokenizer class name
"unk_token": "<unk>",  # unknown token
}
# Report that the tokenizer config file is being generated
print(f"Generating {biogpt_tokenizer_config_file}")
# Write the tokenizer config as JSON
with open(biogpt_tokenizer_config_file, "w", encoding="utf-8") as f:
f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent))
# The model state dict from the checkpoint
model_state_dict = chkpt["model"]
# Remove keys that are not needed
ignore_keys = [
"decoder.version",
]
for k in ignore_keys:
model_state_dict.pop(k, None)
# Rename layers from fairseq naming to HF naming
layer_names = list(model_state_dict.keys())
for layer_name in layer_names:
if layer_name.endswith("output_projection.weight"):
# The output projection only loses the "decoder." prefix
model_state_dict[layer_name.replace("decoder.", "")] = model_state_dict.pop(layer_name)
else:
# All other keys have "decoder" replaced by "biogpt"
model_state_dict[layer_name.replace("decoder", "biogpt")] = model_state_dict.pop(layer_name)
# Load the config just written to the output folder
config = BioGptConfig.from_pretrained(pytorch_dump_folder_path)
# Instantiate a fresh BioGptForCausalLM from the config
model_new = BioGptForCausalLM(config)
# Loading the state dict verifies that the converted weights match the architecture
model_new.load_state_dict(model_state_dict)
# Save the converted weights
pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
print(f"Generating {pytorch_weights_dump_path}")
torch.save(model_state_dict, pytorch_weights_dump_path)
# Done
print("Conversion is done!")
if __name__ == "__main__":
# Build the command-line argument parser
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument(
"--biogpt_checkpoint_path",
default=None,
type=str,
required=True,
help=(
"Path to the official PyTorch checkpoint file which is expected to reside in the dump dir with dicts,"
" bpecodes, etc."
),
)
# Required: directory containing the fairseq checkpoint, dict.txt, and bpecodes files
parser.add_argument(
"--pytorch_dump_folder_path",
default=None,
type=str,
required=True,
help="Path to the output PyTorch model."
)
# Required: output folder for the converted PyTorch model
args = parser.parse_args()
# Parse the command-line arguments
convert_biogpt_checkpoint_to_pytorch(args.biogpt_checkpoint_path, args.pytorch_dump_folder_path)
# Run the conversion with the parsed paths
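# Example invocation (paths are hypothetical):
#   python convert_biogpt_original_pytorch_checkpoint_to_pytorch.py \
#       --biogpt_checkpoint_path /path/to/fairseq_biogpt_dump \
#       --pytorch_dump_folder_path /path/to/hf_output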
.\models\biogpt\modeling_biogpt.py
# coding=utf-8
# Copyright 2022 The HuggingFace Team and Microsoft Research AI4Science All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch BioGPT model."""
import math
from typing import Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
SequenceClassifierOutputWithPast,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
)
from .configuration_biogpt import BioGptConfig
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "microsoft/biogpt"
_CONFIG_FOR_DOC = "BioGptConfig"
BIOGPT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/biogpt",
"microsoft/BioGPT-Large",
# See all BioGPT models at https://huggingface.co/models?filter=biogpt
]
# Copied from transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding with OPT->BioGpt
class BioGptLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
# BioGpt is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0):
"""`input_ids_shape` is expected to be [bsz x seqlen]."""
attention_mask = attention_mask.long()
# create positions depending on attention_mask
positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1
# cut positions if `past_key_values_length` is > 0
positions = positions[:, past_key_values_length:]
return super().forward(positions + self.offset)
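# A small worked example of the position computation above (illustrative): padded
# steps stay at -1 while real steps count up from 0; self.offset (2) is added at
# lookup time, so padding always maps to embedding index 1.
#   mask = torch.tensor([[1, 1, 1, 0, 0]])
#   (torch.cumsum(mask, dim=1) * mask) - 1  ->  tensor([[0, 1, 2, -1, -1]])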
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->BioGpt
class BioGptAttention(nn.Module):
"""
Placeholder for the BioGPT Attention module.
This class will define the attention mechanism for BioGPT.
Actual implementation details will be filled in later.
"""
# Placeholder for attention module, actual implementation details pending.
"""Multi-headed attention from 'Attention Is All You Need' paper"""
# Constructor: defines the attention parameters and projection layers
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[BioGptConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim  # embedding dimension
self.num_heads = num_heads  # number of attention heads
self.dropout = dropout  # dropout probability
self.head_dim = embed_dim // num_heads  # per-head dimension
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
# The embedding dimension must be divisible by the number of heads
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5  # scaling factor applied to the attention scores
self.is_decoder = is_decoder
self.is_causal = is_causal
# Four linear projections: key, value, query, and output
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
# Reshape (bsz, seq_len, embed_dim) into (bsz, num_heads, seq_len, head_dim)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
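# Shape walk-through for _shape (illustrative): with bsz=2, seq_len=5, embed_dim=1024
# and num_heads=16 (head_dim=64):
#   (2, 5, 1024) --view--> (2, 5, 16, 64) --transpose(1, 2)--> (2, 16, 5, 64)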
# Forward pass implementing the multi-head attention computation
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
...  # attention body elided in this excerpt
# BioGptDecoderLayer: a single Transformer decoder layer
class BioGptDecoderLayer(nn.Module):
def __init__(self, config: BioGptConfig):
super().__init__()
# Hidden size from the configuration
self.embed_dim = config.hidden_size
# Self-attention module
self.self_attn = BioGptAttention(
embed_dim=self.embed_dim,
num_heads=config.num_attention_heads,
dropout=config.attention_probs_dropout_prob,
is_decoder=True,
)
# Dropout probability for hidden states
self.dropout = config.hidden_dropout_prob
# Activation function selected via the config
self.activation_fn = ACT2FN[config.hidden_act]
# Dropout probability applied after the activation
self.activation_dropout = config.activation_dropout
# LayerNorm applied before self-attention
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
# Feed-forward layers: embed_dim -> intermediate_size -> embed_dim
self.fc1 = nn.Linear(self.embed_dim, config.intermediate_size)
self.fc2 = nn.Linear(config.intermediate_size, self.embed_dim)
# Final LayerNorm applied before the feed-forward block
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
# Forward pass
def forward(
self,
hidden_states: torch.Tensor,  # input hidden states
attention_mask: Optional[torch.Tensor] = None,  # optional attention mask
layer_head_mask: Optional[torch.Tensor] = None,  # optional per-head mask
past_key_value: Optional[Tuple[torch.Tensor]] = None,  # optional cached key/value states
output_attentions: Optional[bool] = False,  # whether to return attention weights
use_cache: Optional[bool] = True,  # whether to return the key/value cache
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
"""
# Keep the input for the residual connection
residual = hidden_states
# Pre-attention layer normalization
hidden_states = self.self_attn_layer_norm(hidden_states)
# Self Attention
# The first two entries of past_key_value cache the self-attention keys/values
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
# Run self-attention; returns updated hidden states, attention weights, and the new key/value cache
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
past_key_value=self_attn_past_key_value,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
# Dropout, then the residual connection
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
# Fully Connected
# Keep the input for the second residual connection
residual = hidden_states
# Pre-feed-forward layer normalization
hidden_states = self.final_layer_norm(hidden_states)
# Feed-forward: linear -> activation -> dropout -> linear -> dropout
hidden_states = self.fc1(hidden_states)
hidden_states = self.activation_fn(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
# Second residual connection
hidden_states = residual + hidden_states
# Assemble the outputs
outputs = (hidden_states,)
# Optionally include the attention weights
if output_attentions:
outputs += (self_attn_weights,)
# Optionally include the key/value cache
if use_cache:
outputs += (present_key_value,)
return outputs
class BioGptPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# The configuration class for this model
config_class = BioGptConfig
# Prefix used for the base model's weights
base_model_prefix = "biogpt"
# Gradient checkpointing is supported
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
# Linear layers: normal-initialized weights, zero biases
# Slightly different from the TF version, which uses truncated_normal for initialization
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
# Embeddings: normal-initialized weights, zeroed padding row
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
# LayerNorm: zero bias, unit weight
module.bias.data.zero_()
module.weight.data.fill_(1.0)
BIOGPT_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`~BioGptConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BIOGPT_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare BioGPT Model transformer outputting raw hidden-states without any specific head on top.",
BIOGPT_START_DOCSTRING,
)
class BioGptModel(BioGptPreTrainedModel):
def __init__(self, config: BioGptConfig):
super().__init__(config)
self.config = config
self.layerdrop = config.layerdrop
self.dropout = config.hidden_dropout_prob
self.embed_dim = config.hidden_size
self.padding_idx = config.pad_token_id
self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
# Token embeddings: vocab_size x embed_dim, with padding_idx
self.embed_tokens = nn.Embedding(config.vocab_size, self.embed_dim, self.padding_idx)
# Learned positional embeddings up to max_position_embeddings
self.embed_positions = BioGptLearnedPositionalEmbedding(config.max_position_embeddings, self.embed_dim)
# Stack of num_hidden_layers decoder layers
self.layers = nn.ModuleList([BioGptDecoderLayer(config) for _ in range(config.num_hidden_layers)])
# Final layer normalization
self.layer_norm = nn.LayerNorm(self.embed_dim)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
@add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPastAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
...  # forward body elided in this excerpt
# BioGPT model with a language-modeling head on top for CLM fine-tuning
@add_start_docstrings(
"""BioGPT Model with a `language modeling` head on top for CLM fine-tuning.""", BIOGPT_START_DOCSTRING
)
# BioGptForCausalLM: adds a language-modeling head on top of BioGptModel
class BioGptForCausalLM(BioGptPreTrainedModel):
# Keys whose weights are tied to the input embeddings
_tied_weights_keys = ["output_projection.weight"]
def __init__(self, config):
super().__init__(config)
# The underlying BioGptModel
self.biogpt = BioGptModel(config)
# Linear head projecting hidden states to vocabulary logits
self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_output_embeddings(self):
return self.output_projection
def set_output_embeddings(self, new_embeddings):
self.output_projection = new_embeddings
@add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""
# Resolve return_dict from the argument or the config default
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the base model
outputs = self.biogpt(
input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# The first element is the sequence of hidden states
sequence_output = outputs[0]
# Project the hidden states to vocabulary logits
prediction_scores = self.output_projection(sequence_output)
lm_loss = None
if labels is not None:
# Compute the language-modeling loss: position i predicts token i + 1,
# so the logits are shifted left relative to the labels
shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
labels = labels[:, 1:].contiguous()
loss_fct = CrossEntropyLoss()
lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
# Tuple output when return_dict is False
output = (prediction_scores,) + outputs[1:]
return ((lm_loss,) + output) if lm_loss is not None else output
# Structured output when return_dict is True
return CausalLMOutputWithCrossAttentions(
loss=lm_loss,
logits=prediction_scores,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
)
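# Worked sketch of the label shift above: for ids [t0, t1, t2, t3], the logits at
# position i predict token i+1, so prediction_scores[:, :-1] are scored against
# labels[:, 1:] = [t1, t2, t3]; positions set to -100 are ignored by CrossEntropyLoss.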
def prepare_inputs_for_generation(
self, input_ids, attention_mask, inputs_embeds=None, past_key_values=None, **kwargs
):
# (The enclosing `def` line was missing from this excerpt; the signature above is the
# method this body belongs to.)
# When a cache is present, keep only the input tokens that have not been processed yet
if past_key_values is not None:
# Cached sequence length = size of dim 2 of the first layer's key tensor
past_length = past_key_values[0][0].shape[2]
# If input_ids is longer than the cache, drop the cached prefix
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Otherwise keep only the final token (default generation behavior)
remove_prefix_length = input_ids.shape[1] - 1
# Keep only the not-yet-processed tokens
input_ids = input_ids[:, remove_prefix_length:]
# Use inputs_embeds only for the first generation step (no cache yet)
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
# Add the attention mask, cache, and use_cache flag
model_inputs.update(
{
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
}
)
return model_inputs
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
# Reorder the cached states to follow the beams selected at this step
reordered_past = ()
for layer_past in past_key_values:
# Reorder each cached tensor along the batch dimension according to beam_idx
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
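# Sketch of the reorder: with beam_idx = torch.tensor([2, 0, 1]), index_select(0, ...)
# permutes the batch*beam dimension of every cached tensor so that row 0 now holds the
# cache of the beam that was at row 2, keeping caches aligned with the surviving beams.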
# BioGPT with a token classification head (a linear layer on the hidden states), e.g. for NER
@add_start_docstrings(
"""
BioGPT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
""",
BIOGPT_START_DOCSTRING,
)
class BioGptForTokenClassification(BioGptPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Number of labels from the config
self.num_labels = config.num_labels
# The underlying BioGptModel
self.biogpt = BioGptModel(config)
# Classifier dropout: use config.classifier_dropout if set, else hidden_dropout_prob
if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None:
classifier_dropout = config.classifier_dropout
else:
classifier_dropout = config.hidden_dropout_prob
self.dropout = nn.Dropout(classifier_dropout)
# Linear head mapping hidden states to num_labels logits
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TokenClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# Resolve return_dict from the argument or the config default
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the base model
transformer_outputs = self.biogpt(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Hidden states from the base model
hidden_states = transformer_outputs[0]
# Apply dropout, then the classification head
hidden_states = self.dropout(hidden_states)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
# Only compute the loss on positions where attention_mask == 1
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)
# Replace labels at padded positions with the loss ignore_index
active_labels = torch.where(
active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
)
loss = loss_fct(active_logits, active_labels)
else:
# No mask: compute the loss over all positions
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
# Tuple output when return_dict is False
output = (logits,) + transformer_outputs[2:]
return ((loss,) + output) if loss is not None else output
# Structured output: loss, logits, hidden states, and attentions
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
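# Sketch of the masking above with a toy batch: attention_mask.view(-1) == 1 marks real
# tokens, and padded positions get loss_fct.ignore_index (-100 for CrossEntropyLoss),
# so they contribute nothing to the token-classification loss.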
# BioGPT with a sequence classification head (linear layer) on top
@add_start_docstrings(
"""
The BioGpt Model transformer with a sequence classification head on top (linear layer).
[`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
(e.g. GPT-2) do.
Since it does classification on the last token, it is required to know the position of the last token. If a
`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
each row of the batch).
""",
BIOGPT_START_DOCSTRING,
)
class BioGptForSequenceClassification(BioGptPreTrainedModel):
def __init__(self, config: BioGptConfig):
super().__init__(config)
# Number of labels from the config
self.num_labels = config.num_labels
# The underlying BioGptModel
self.biogpt = BioGptModel(config)
# Linear head mapping the hidden size to num_labels (no bias)
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=SequenceClassifierOutputWithPast,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutputWithPast]:
...  # forward body elided in this excerpt
# Get the input embeddings
def get_input_embeddings(self):
return self.biogpt.embed_tokens
# Set the input embeddings
def set_input_embeddings(self, value):
self.biogpt.embed_tokens = value
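# A minimal sketch of the last-token selection described in the class docstring
# (names here are illustrative, not the library's exact code): when pad_token_id is
# set and input_ids are given,
#   sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
#   pooled_logits = logits[torch.arange(batch_size), sequence_lengths]
# otherwise the last position of each row is used.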
.\models\biogpt\tokenization_biogpt.py
# coding=utf-8
# Copyright 2022 The HuggingFace Team and Microsoft Research AI4Science. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for BioGPT."""
# 用于 BioGPT 的分词类
import json
# 导入 json 库
import os
# 导入 os 库
from typing import List, Optional, Tuple
# 导入类型提示,List、Optional 和 Tuple
from ...tokenization_utils import PreTrainedTokenizer
# 从 tokenization_utils 模块中导入 PreTrainedTokenizer 类
from ...utils import logging
# 从 utils 模块中导入 logging 模块
logger = logging.get_logger(__name__)
# 获取当前模块的 logger 对象
# File names for the vocabulary and merges files
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
# URLs of the vocab and merges files for the pretrained microsoft/biogpt checkpoint
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/biogpt": "https://huggingface.co/microsoft/biogpt/resolve/main/vocab.json",
},
"merges_file": {"microsoft/biogpt": "https://huggingface.co/microsoft/biogpt/resolve/main/merges.txt"},
}
# Maximum input sizes (positional embeddings) for the pretrained checkpoints
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/biogpt": 1024,
}
def get_pairs(word):
"""
Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
strings)
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
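# Example: get_pairs(("l", "o", "w</w>")) returns {("l", "o"), ("o", "w</w>")}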
class BioGptTokenizer(PreTrainedTokenizer):
"""
Construct a FAIRSEQ Transformer tokenizer. Moses tokenization followed by Byte-Pair Encoding.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
# 构建一个 FAIRSEQ Transformer 分词器,使用 Moses 分词后接字节对编码
def __init__(
self,
vocab_file,
merges_file,
unk_token="<unk>",
eos_token="</s>",
pad_token="<pad>",
**kwargs
):
# 初始化方法,接收词汇文件、合并文件以及其他参数
super().__init__(
unk_token=unk_token,
eos_token=eos_token,
pad_token=pad_token,
**kwargs
)
# 调用父类的初始化方法,设置未知符号、结束符号和填充符号
self.vocab_file = vocab_file
self.merges_file = merges_file
# 设置词汇文件和合并文件属性
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
# 构建包含特殊符号的输入方法,用于处理包含两个序列的输入
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens.
"""
# 从序列或序列对构建模型输入,用于序列分类任务,通过连接和添加特殊符号
if token_ids_1 is None:
return token_ids_0
else:
return token_ids_0 + token_ids_1
# 如果只有一个序列,则直接返回该序列;如果有两个序列,则连接它们后返回
def get_vocab(self):
# 获取词汇表方法
with open(self.vocab_file, "r", encoding="utf-8") as reader:
vocab = json.load(reader)
return vocab
# 打开词汇文件,加载词汇表并返回
def tokenize(self, text):
# 分词方法
return text.split()
# 使用空格分割文本并返回分词结果
def convert_tokens_to_ids(self, tokens):
# 将分词转换为 ID 方法
vocab = self.get_vocab()
return [vocab[token] for token in tokens]
# 获取词汇表,将分词列表中的每个分词转换为对应的 ID 并返回
def convert_ids_to_tokens(self, ids):
# 将 ID 转换为分词方法
vocab = self.get_vocab()
return [list(vocab.keys())[list(vocab.values()).index(id)] for id in ids]
# 获取词汇表,将 ID 列表中的每个 ID 转换为对应的分词并返回
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Merges file.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of sequence.
The token used is the `sep_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
"""
# Names of the vocabulary files
vocab_files_names = VOCAB_FILES_NAMES
# Map of pretrained vocabulary files
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Maximum model input sizes
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Names of the model inputs
model_input_names = ["input_ids", "attention_mask"]
def __init__(
self,
vocab_file,
merges_file,
unk_token="<unk>",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
pad_token="<pad>",
**kwargs,
):
try:
# sacremoses provides the Moses tokenizer/detokenizer
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use BioGptTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
self.lang = "en"
self.sm = sacremoses
# Caches of sm.MosesTokenizer / sm.MosesDetokenizer instances, keyed by language
self.cache_moses_tokenizer = {}
self.cache_moses_detokenizer = {}
""" Initialisation"""
# 用 utf-8 编码打开 vocab_file 文件,并将其内容加载为 JSON 格式,赋值给 self.encoder
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
# 创建 self.decoder 字典,将 self.encoder 的键值对反转
self.decoder = {v: k for k, v in self.encoder.items()}
# 用 utf-8 编码打开 merges_file 文件,读取内容并按行分割,去除最后一个空行,赋值给 merges 列表
with open(merges_file, encoding="utf-8") as merges_handle:
merges = merges_handle.read().split("\n")[:-1]
# 将 merges 列表中的每个元素按空格分割成元组,构成 merges 列表
merges = [tuple(merge.split()[:2]) for merge in merges]
# 创建 self.bpe_ranks 字典,将 merges 列表中的元素与其在列表中的索引号配对
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
# Call the parent initializer with the special tokens
super().__init__(
bos_token=bos_token,
eos_token=eos_token,
sep_token=sep_token,
unk_token=unk_token,
pad_token=pad_token,
**kwargs,
)
@property
def vocab_size(self):
"""Returns vocab size"""
return len(self.encoder)
def get_vocab(self):
# Merge the base vocabulary with any added tokens
return dict(self.encoder, **self.added_tokens_encoder)
def moses_tokenize(self, text, lang):
# Create and cache a MosesTokenizer for this language on first use
if lang not in self.cache_moses_tokenizer:
moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
self.cache_moses_tokenizer[lang] = moses_tokenizer
# Tokenize with the cached Moses tokenizer
return self.cache_moses_tokenizer[lang].tokenize(
text, aggressive_dash_splits=True, return_str=False, escape=True
)
def moses_detokenize(self, tokens, lang):
# Create and cache a MosesDetokenizer for this language on first use
if lang not in self.cache_moses_detokenizer:
moses_detokenizer = self.sm.MosesDetokenizer(lang=lang)
self.cache_moses_detokenizer[lang] = moses_detokenizer
# Detokenize with the cached Moses detokenizer
return self.cache_moses_detokenizer[lang].detokenize(tokens)
def bpe(self, token):
# Apply BPE to a single token: append the end-of-word marker to the last character
word = tuple(token[:-1]) + (token[-1] + "</w>",)
# Return the cached result if this token was seen before
if token in self.cache:
return self.cache[token]
# Collect the set of adjacent symbol pairs
pairs = get_pairs(word)
# No pairs means a single-character token: just append the end-of-word marker
if not pairs:
return token + "</w>"
# Repeatedly merge pairs until no mergeable pair remains
while True:
# Pick the pair with the lowest merge rank (earliest learned, highest priority)
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
# Stop when the best pair is not in the merge table
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
# Merge this occurrence of the (first, second) pair
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
# Stop once the word has been merged into a single symbol
if len(word) == 1:
break
else:
pairs = get_pairs(word)
# Join the symbols into a space-separated string
word = " ".join(word)
# Special-case the end-of-word marker after a newline
if word == "\n </w>":
word = "\n</w>"
# Cache the result for future calls
self.cache[token] = word
return word
def _tokenize(self, text, bypass_tokenizer=False):
"""Returns a tokenized string."""
# Either split on whitespace (bypass) or run Moses tokenization first
if bypass_tokenizer:
text = text.split()
else:
text = self.moses_tokenize(text, self.lang)
split_tokens = []
# Apply BPE to each token and collect the resulting subwords
for token in text:
if token:
split_tokens.extend(list(self.bpe(token).split(" ")))
return split_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
# Look up the token's id, falling back to the unknown token's id
return self.encoder.get(token, self.encoder.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
# Look up the id's token, falling back to the unknown token
return self.decoder.get(index, self.unk_token)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
# Undo BPE: drop internal spaces and turn "</w>" markers back into spaces
tokens = [t.replace(" ", "").replace("</w>", " ") for t in tokens]
tokens = "".join(tokens).split()
# Moses-detokenize the tokens back into a plain string
text = self.moses_detokenize(tokens, self.lang)
return text
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BioGPT sequence has the following format:
- single sequence: `</s> X `
- pair of sequences: `</s> A </s> B `
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
# If only one sequence provided, return with a single separator token added at the beginning
if token_ids_1 is None:
return [self.sep_token_id] + token_ids_0
# If two sequences provided, construct the input with a separator token between and at the ends of each sequence
sep = [self.sep_token_id]
return sep + token_ids_0 + sep + token_ids_1
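# Example (assuming "</s>" maps to id 2, as in the pretrained vocab):
#   build_inputs_with_special_tokens([5, 6])      -> [2, 5, 6]
#   build_inputs_with_special_tokens([5, 6], [7]) -> [2, 5, 6, 2, 7]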
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
# If tokens already have special tokens, delegate to superclass method
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
# For sequences without special tokens, return a mask with 1s for special tokens and 0s for sequence tokens
# no bos used in fairseq
if token_ids_1 is not None:
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
return [1] + ([0] * len(token_ids_0))
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
Transformer sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id]
# Without a second sequence, return only the first portion of the mask (all zeros)
if token_ids_1 is None:
return len(token_ids_0 + sep) * [0]
# Zeros for the first sequence (plus its sep), ones for the second sequence (plus its sep)
return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
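# Example: create_token_type_ids_from_sequences([5, 6], [7]) -> [0, 0, 0, 1, 1]
# (three positions for token_ids_0 plus its sep, two for token_ids_1 plus its sep)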
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and merges files to the specified directory.
Args:
save_directory (str):
Directory path where the vocabulary files will be saved.
filename_prefix (str, optional):
Prefix to be added to the filenames of vocabulary and merges files.
Returns:
Tuple[str]: Tuple containing the paths to the saved vocabulary and merges files.
"""
# If the save directory does not exist, log an error and return
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Build the output paths for the vocabulary and merges files
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
# Write the vocabulary as a JSON file
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
# Write the BPE merges ordered by rank, warning if the ranks are not consecutive
with open(merge_file, "w", encoding="utf-8") as writer:
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
# 返回保存的词汇表文件和合并文件的路径
return vocab_file, merge_file
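# Resulting file names (a sketch; assumes the usual VOCAB_FILES_NAMES of
# {"vocab_file": "vocab.json", "merges_file": "merges.txt"}):
#
#   tok.save_vocabulary("out", filename_prefix="demo")
#   -> ("out/demo-vocab.json", "out/demo-merges.txt")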
def __getstate__(self):
"""
Get the state of the XLMTokenizer object for pickling.
"""
# 复制对象的状态字典,并设置 'sm' 为 None,以便序列化时忽略 'sm'
state = self.__dict__.copy()
state["sm"] = None
return state
def __setstate__(self, d):
"""
Set the state of the XLMTokenizer object from a dictionary.
"""
# 从字典中恢复对象的状态
self.__dict__ = d
# 检查是否安装了 sacremoses 库,如果没有则抛出 ImportError
try:
import sacremoses
except ImportError:
raise ImportError(
"You need to install sacremoses to use XLMTokenizer. "
"See https://pypi.org/project/sacremoses/ for installation."
)
# 将 sacremoses 模块引入 self.sm
self.sm = sacremoses
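# Pickle round trip implied by the two methods above (a sketch; `tok` is an
# initialized tokenizer and sacremoses is installed):
#
#   import pickle
#   restored = pickle.loads(pickle.dumps(tok))  # 'sm' is dropped on dump...
#   assert restored.sm is not None              # ...and re-imported on load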
.\models\biogpt\__init__.py
# Import the modules and helpers needed for dependency checks and lazy loading
from typing import TYPE_CHECKING
# Import the custom exception and lazy-module utilities from the utils package
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
# Define the module's import structure
_import_structure = {
"configuration_biogpt": ["BIOGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BioGptConfig"],
"tokenization_biogpt": ["BioGptTokenizer"],
}
# Check whether torch is available; raise an exception if it is not
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# If torch is available, add the model definitions to the import structure
_import_structure["modeling_biogpt"] = [
"BIOGPT_PRETRAINED_MODEL_ARCHIVE_LIST",
"BioGptForCausalLM",
"BioGptForTokenClassification",
"BioGptForSequenceClassification",
"BioGptModel",
"BioGptPreTrainedModel",
]
# Under type checking, import the type information from the relevant modules
if TYPE_CHECKING:
from .configuration_biogpt import BIOGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, BioGptConfig
from .tokenization_biogpt import BioGptTokenizer
# If torch is available, also import the model type information
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_biogpt import (
BIOGPT_PRETRAINED_MODEL_ARCHIVE_LIST,
BioGptForCausalLM,
BioGptForSequenceClassification,
BioGptForTokenClassification,
BioGptModel,
BioGptPreTrainedModel,
)
# Outside of type checking, set the module up for lazy loading
else:
import sys
# Replace the current module with a _LazyModule so imports happen on first access
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
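# What the _LazyModule indirection buys (a sketch, not part of the file): importing
# the package itself is cheap, and heavy submodules load only on first attribute access.
#
#   import transformers.models.biogpt as biogpt  # fast: nothing heavy imported yet
#   model_cls = biogpt.BioGptForCausalLM         # now modeling_biogpt (and torch) load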
.\models\bit\configuration_bit.py
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BiT model configuration"""
from ...configuration_utils import PretrainedConfig
# Import the pretrained configuration base class
from ...utils import logging
# Import the logging utilities
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
# Import the backbone configuration mixin and the helper that aligns output features with output indices
logger = logging.get_logger(__name__)
# Get the logger for the current module
BIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"google/bit-50": "https://huggingface.co/google/bit-50/resolve/main/config.json",
}
# Map from pretrained BiT model names to the URLs of their configuration files
class BitConfig(BackboneConfigMixin, PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`BitModel`]. It is used to instantiate an BiT
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the BiT
[google/bit-50](https://huggingface.co/google/bit-50) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
# BitConfig stores the configuration of a BitModel
# It inherits from BackboneConfigMixin and PretrainedConfig, which control the model outputs
# The model type is set to "bit"
model_type = "bit"
# Supported layer types: "preactivation" and "bottleneck"
layer_types = ["preactivation", "bottleneck"]
# Supported padding strategies: "SAME" and "VALID"
supported_padding = ["SAME", "VALID"]
# Constructor that sets the model's parameters and attributes
def __init__(
self,
num_channels=3,
embedding_size=64,
hidden_sizes=[256, 512, 1024, 2048],
depths=[3, 4, 6, 3],
layer_type="preactivation",  # layer type, defaults to "preactivation"
hidden_act="relu",  # hidden activation function, defaults to ReLU
global_padding=None,  # global padding strategy, defaults to None
num_groups=32,  # number of groups for group normalization, defaults to 32
drop_path_rate=0.0,  # drop-path (stochastic depth) rate, defaults to 0.0
embedding_dynamic_padding=False,  # whether to use dynamic padding for the embedding, defaults to False
output_stride=32,  # output stride, defaults to 32
width_factor=1,  # width multiplier, defaults to 1
out_features=None,
out_indices=None,
**kwargs,
):
super().__init__(**kwargs)  # call the parent class constructor
# Raise a ValueError if the given layer type is not in the supported list
if layer_type not in self.layer_types:
raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}")
# If a global padding strategy is given, check that it is supported; raise a ValueError otherwise
if global_padding is not None:
if global_padding.upper() in self.supported_padding:
global_padding = global_padding.upper()
else:
raise ValueError(f"Padding strategy {global_padding} not supported")
# Assign all arguments to the corresponding attributes
self.num_channels = num_channels
self.embedding_size = embedding_size
self.hidden_sizes = hidden_sizes
self.depths = depths
self.layer_type = layer_type
self.hidden_act = hidden_act
self.global_padding = global_padding
self.num_groups = num_groups
self.drop_path_rate = drop_path_rate
self.embedding_dynamic_padding = embedding_dynamic_padding
self.output_stride = output_stride
self.width_factor = width_factor
# Build the stage name list: "stem" followed by "stage1" through "stageN"
self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
# Compute the aligned output features and output indices for later use
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
)
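# A quick sketch of the stage/out_features wiring above (values use the defaults):
#
#   from transformers import BitConfig
#   config = BitConfig(out_features=["stage2", "stage4"])
#   config.stage_names   # ['stem', 'stage1', 'stage2', 'stage3', 'stage4']
#   config.out_features  # ['stage2', 'stage4']
#   config.out_indices   # [2, 4]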
.\models\bit\convert_bit_to_pytorch.py
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert BiT checkpoints from the timm library."""
import argparse  # command-line argument parsing
import json  # JSON handling
from pathlib import Path  # filesystem path handling
import requests  # HTTP requests
import torch  # PyTorch
from huggingface_hub import hf_hub_download  # download assets from the Hugging Face Hub
from PIL import Image  # image handling
from timm import create_model  # model factory from the timm library
from timm.data import resolve_data_config  # resolve data configs from timm
from timm.data.transforms_factory import create_transform  # build data transforms from timm
from transformers import BitConfig, BitForImageClassification, BitImageProcessor  # BiT model classes
from transformers.image_utils import PILImageResampling  # image resampling enum
from transformers.utils import logging  # logging utilities
logging.set_verbosity_info()  # set log verbosity to INFO
logger = logging.get_logger(__name__)  # get the logger for the current module
def get_config(model_name):
repo_id = "huggingface/label-files"
filename = "imagenet-1k-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
label2id = {v: k for k, v in id2label.items()}
conv_layer = "std_conv" if "bit" in model_name else False
# note that when using BiT as backbone for ViT-hybrid checkpoints,
# one needs to additionally set config.layer_type = "bottleneck", config.stem_type = "same",
# config.conv_layer = "std_conv_same"
config = BitConfig(
conv_layer=conv_layer,
num_labels=1000,
id2label=id2label,
label2id=label2id,
)
return config
def rename_key(name):
if "stem.conv" in name:
name = name.replace("stem.conv", "bit.embedder.convolution")
if "blocks" in name:
name = name.replace("blocks", "layers")
if "head.fc" in name:
name = name.replace("head.fc", "classifier.1")
if name.startswith("norm"):
name = "bit." + name
if "bit" not in name and "classifier" not in name:
name = "bit.encoder." + name
return name
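# Example renamings produced by rename_key (illustrative inputs):
#
#   "stem.conv.weight"               -> "bit.embedder.convolution.weight"
#   "stages.0.blocks.0.conv1.weight" -> "bit.encoder.stages.0.layers.0.conv1.weight"
#   "head.fc.weight"                 -> "classifier.1.weight"
#   "norm.weight"                    -> "bit.norm.weight"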
# We will verify our results on an image of cute cats
def prepare_img():
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
im = Image.open(requests.get(url, stream=True).raw)
return im
@torch.no_grad()
def convert_bit_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False):
"""
Copy/paste/tweak model's weights to our BiT structure.
"""
# define default BiT configuration
config = get_config(model_name)  # build the configuration for the requested model
# load original model from timm
timm_model = create_model(model_name, pretrained=True)  # load the pretrained model from timm
timm_model.eval()  # switch to evaluation mode (no gradients, no dropout)
# Grab the original model's state dict
state_dict = timm_model.state_dict()
# Iterate over a copy of the keys, rename them, and squeeze tensors whose key contains "head"
for key in state_dict.copy().keys():
val = state_dict.pop(key)
state_dict[rename_key(key)] = val.squeeze() if "head" in key else val
# Instantiate the HuggingFace model
model = BitForImageClassification(config)
# Switch to evaluation mode
model.eval()
# Load the converted weights
model.load_state_dict(state_dict)
# Build the image transform used by the timm model
transform = create_transform(**resolve_data_config({}, model=timm_model))
# Extract the individual transforms
timm_transforms = transform.transforms
# Map Pillow interpolation names to PILImageResampling values
pillow_resamplings = {
"bilinear": PILImageResampling.BILINEAR,
"bicubic": PILImageResampling.BICUBIC,
"nearest": PILImageResampling.NEAREST,
}
# Create the BitImageProcessor
processor = BitImageProcessor(
do_resize=True,  # whether to resize
size={"shortest_edge": timm_transforms[0].size},  # target length of the shortest edge
resample=pillow_resamplings[timm_transforms[0].interpolation.value],  # resampling method
do_center_crop=True,  # whether to center crop
crop_size={"height": timm_transforms[1].size[0], "width": timm_transforms[1].size[1]},  # crop size
do_normalize=True,  # whether to normalize
image_mean=timm_transforms[-1].mean.tolist(),  # image mean
image_std=timm_transforms[-1].std.tolist(),  # image standard deviation
)
# Prepare the test image
image = prepare_img()
# Apply the timm transform and add a batch dimension
timm_pixel_values = transform(image).unsqueeze(0)
# Process the image with the image processor and get the pixel values
pixel_values = processor(image, return_tensors="pt").pixel_values
# Verify that both pipelines produce the same pixel values
assert torch.allclose(timm_pixel_values, pixel_values)
# Verify the output logits
with torch.no_grad():
outputs = model(pixel_values)
logits = outputs.logits
# Print the first three logits
print("Logits:", logits[0, :3])
# Print the predicted class
print("Predicted class:", model.config.id2label[logits.argmax(-1).item()])
# Compute logits with the original timm model
timm_logits = timm_model(pixel_values)
# Check that both logit tensors have the same shape
assert timm_logits.shape == outputs.logits.shape
# Check that the logits agree within tolerance
assert torch.allclose(timm_logits, outputs.logits, atol=1e-3)
# Print a confirmation message
print("Looks ok!")
# If a PyTorch dump folder was given
if pytorch_dump_folder_path is not None:
# Create the folder
Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
# Announce the save
print(f"Saving model {model_name} and processor to {pytorch_dump_folder_path}")
# Save the model and the processor
model.save_pretrained(pytorch_dump_folder_path)
processor.save_pretrained(pytorch_dump_folder_path)
# If requested, push to the Hub
if push_to_hub:
# Announce the push
print(f"Pushing model {model_name} and processor to the hub")
# Push the model to the Hub
model.push_to_hub(f"ybelkada/{model_name}")
# Push the processor to the Hub
processor.push_to_hub(f"ybelkada/{model_name}")
if __name__ == "__main__":
# This block runs only when the script is executed directly, not when it is imported
parser = argparse.ArgumentParser()
# Create the argument parser
# Required parameters
parser.add_argument(
"--model_name",
default="resnetv2_50x1_bitm",
type=str,
help="Name of the BiT timm model you'd like to convert.",
)
# Name of the BiT timm model to convert; defaults to 'resnetv2_50x1_bitm'
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
# Output directory for the converted PyTorch model; defaults to None
parser.add_argument(
"--push_to_hub",
action="store_true",
help="Whether to push the model to the hub.",
)
# Boolean flag controlling whether the model is pushed to the Hub
args = parser.parse_args()
# Parse the command-line arguments into args
convert_bit_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
# Run the conversion with the parsed arguments: model name, output path, push flag
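# Example invocation of this script (paths are illustrative):
#
#   python convert_bit_to_pytorch.py --model_name resnetv2_50x1_bitm \
#       --pytorch_dump_folder_path ./bit-50 --push_to_hub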
.\models\bit\image_processing_bit.py
# Import the required modules and libraries
from typing import Dict, List, Optional, Union  # typing helpers
import numpy as np  # NumPy for array operations
# Import the image-processing utilities and helpers
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
convert_to_rgb,  # convert an image to RGB
get_resize_output_image_size,  # compute the output size after resizing
resize,  # resize an image
to_channel_dimension_format,  # convert an image to a given channel-dimension format
)
from ...image_utils import (
OPENAI_CLIP_MEAN,  # OpenAI CLIP mean
OPENAI_CLIP_STD,  # OpenAI CLIP standard deviation
ChannelDimension,  # channel dimension enum
ImageInput,  # image input type
PILImageResampling,  # PIL image resampling enum
infer_channel_dimension_format,  # infer the channel-dimension format
is_scaled_image,  # check whether an image is already rescaled
make_list_of_images,  # normalize inputs to a list of images
to_numpy_array,  # convert an image to a NumPy array
valid_images,  # validate images
validate_kwargs,  # validate keyword arguments
validate_preprocess_arguments,  # validate preprocessing arguments
)
from ...utils import TensorType, is_vision_available, logging  # tensor types, vision availability flag, logging
# Get the logger for the current module
logger = logging.get_logger(__name__)
# If the vision backend is available, import PIL
if is_vision_available():
import PIL  # PIL for image handling
class BitImageProcessor(BaseImageProcessor):
r"""
Constructs a BiT image processor.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
`do_resize` in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
method.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
`preprocess` method.
crop_size (`Dict[str, int]` *optional*, defaults to 224):
Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
method.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
method.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB.
"""
# Name of the model input; here a single input called "pixel_values"
model_input_names = ["pixel_values"]
# Constructor that sets the defaults for the preprocessing parameters
def __init__(
self,
do_resize: bool = True,  # whether to resize
size: Dict[str, int] = None,  # target size dict
resample: PILImageResampling = PILImageResampling.BICUBIC,  # resampling method
do_center_crop: bool = True,  # whether to center crop
crop_size: Dict[str, int] = None,  # crop size dict
do_rescale: bool = True,  # whether to rescale
rescale_factor: Union[int, float] = 1 / 255,  # rescale factor
do_normalize: bool = True,  # whether to normalize
image_mean: Optional[Union[float, List[float]]] = None,  # image mean
image_std: Optional[Union[float, List[float]]] = None,  # image standard deviation
do_convert_rgb: bool = True,  # whether to convert to RGB
**kwargs,
) -> None:
# Call the parent constructor
super().__init__(**kwargs)
# Fall back to the default size if none was given
size = size if size is not None else {"shortest_edge": 224}
# Normalize the size into a size dict
size = get_size_dict(size, default_to_square=False)
# Fall back to the default crop size if none was given
crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
# Normalize the crop size into a size dict
crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
# Store all parameters on the instance
self.do_resize = do_resize
self.size = size
self.resample = resample
self.do_center_crop = do_center_crop
self.crop_size = crop_size
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
self.do_convert_rgb = do_convert_rgb
self._valid_processor_keys = [
"images",
"do_resize",
"size",
"resample",
"do_center_crop",
"crop_size",
"do_rescale",
"rescale_factor",
"do_normalize",
"image_mean",
"image_std",
"do_convert_rgb",
"return_tensors",
"data_format",
"input_data_format",
]
# Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
def resize(
self,
image: np.ndarray,  # input image array
size: Dict[str, int],  # target size dict
resample: PILImageResampling = PILImageResampling.BICUBIC,  # resampling method
data_format: Optional[Union[str, ChannelDimension]] = None,  # output data format
input_data_format: Optional[Union[str, ChannelDimension]] = None,  # input data format
**kwargs,
) -> np.ndarray:
"""
Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
resized to keep the input aspect ratio.
Args:
image (`np.ndarray`):
Image to resize.
size (`Dict[str, int]`):
Size of the output image.
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resizing the image.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format of the image. If not provided, it will be the same as the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format of the input image. If not provided, it will be inferred.
"""
# Default to a square resize
default_to_square = True
# If the `size` dict contains a 'shortest_edge' key
if "shortest_edge" in size:
# Use size["shortest_edge"], i.e. the length of the shortest edge
size = size["shortest_edge"]
# Disable the square-resize default
default_to_square = False
# If the `size` dict contains both 'height' and 'width' keys
elif "height" in size and "width" in size:
# Use a (height, width) tuple
size = (size["height"], size["width"])
else:
# Otherwise raise: the dict must have 'shortest_edge' or both 'height' and 'width'
raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
# Compute the output image size after resizing
output_size = get_resize_output_image_size(
image,
size=size,
default_to_square=default_to_square,
input_data_format=input_data_format,
)
# Perform the resize and return the resized image
return resize(
image,
size=output_size,
resample=resample,
data_format=data_format,
input_data_format=input_data_format,
**kwargs,
)
def preprocess(
self,
images: ImageInput,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: PILImageResampling = None,
do_center_crop: bool = None,
crop_size: int = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
):
.\models\bit\modeling_bit.py
# coding=utf-8
# Copyright 2022 Google AI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch BiT model. Also supports backbone for ViT hybrid."""
import collections
import math
from typing import Optional, Tuple
import numpy as np
import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ...activations import ACT2FN
from ...modeling_outputs import (
BackboneOutput,
BaseModelOutputWithNoAttention,
BaseModelOutputWithPoolingAndNoAttention,
ImageClassifierOutputWithNoAttention,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ...utils.backbone_utils import BackboneMixin
from .configuration_bit import BitConfig
logger = logging.get_logger(__name__)
# General docstring
_CONFIG_FOR_DOC = "BitConfig"
# Base docstring
_CHECKPOINT_FOR_DOC = "google/bit-50"
_EXPECTED_OUTPUT_SHAPE = [1, 2048, 7, 7]
# Image classification docstring
_IMAGE_CLASS_CHECKPOINT = "google/bit-50"
_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat"
BIT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"google/bit-50",
# See all BiT models at https://huggingface.co/models?filter=bit
]
def get_padding_value(padding=None, kernel_size=7, stride=1, dilation=1) -> Tuple[Tuple, bool]:
r"""
Utility function to get the tuple padding value given the kernel_size and padding.
Args:
padding (Union[`str`, `int`], *optional*):
Padding value, can be either `"same"`, `"valid"`. If a different value is provided the default padding from
PyTorch is used.
kernel_size (`int`, *optional*, defaults to 7):
Kernel size of the convolution layers.
stride (`int`, *optional*, defaults to 1):
Stride value of the convolution layers.
dilation (`int`, *optional*, defaults to 1):
Dilation value of the convolution layers.
"""
# Determine if padding should be dynamically calculated
dynamic = False
# If padding is not provided, calculate it based on convolution parameters
if padding is None:
padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
# Return the calculated padding and a boolean indicating if it's dynamic
return padding, dynamic
# If padding is a string, lower-case it first
if isinstance(padding, str):
padding = padding.lower()
if padding == "same":
# TF-compatible 'SAME' padding; has performance and GPU-memory implications
# Static case: no extra overhead when stride == 1 and (dilation * (kernel_size - 1)) is even
if stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0:
padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
else:
# Dynamic 'SAME' padding: incurs runtime and GPU-memory overhead
padding = 0
dynamic = True
elif padding == "valid":
# 'VALID' padding, equivalent to padding=0
padding = 0
else:
# Default to PyTorch-style symmetric 'same' padding
padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2
# Return the computed padding value and the dynamic flag
return padding, dynamic
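# Illustrative results of get_padding_value (a sketch):
#
#   get_padding_value("same", kernel_size=3, stride=1)  -> (1, False)  # static SAME
#   get_padding_value("same", kernel_size=3, stride=2)  -> (0, True)   # dynamic SAME
#   get_padding_value("valid", kernel_size=3)           -> (0, False)
#   get_padding_value(None, kernel_size=7, stride=2)    -> (3, False)  # PyTorch-style default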
# A nn.Conv2d subclass implementing 2D convolution with Weight Standardization
class WeightStandardizedConv2d(nn.Conv2d):
"""Conv2d with Weight Standardization. Includes TensorFlow compatible SAME padding. Used for ViT Hybrid model.
Paper: [Micro-Batch Training with Batch-Channel Normalization and Weight
Standardization](https://arxiv.org/abs/1903.10520v2)
"""
def __init__(
self,
in_channel,
out_channels,
kernel_size,
stride=1,
padding="SAME",
dilation=1,
groups=1,
bias=False,
eps=1e-6,
):
# Resolve the padding value and whether it must be computed dynamically
padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, dilation=dilation)
# Call the parent constructor with the convolution parameters
super().__init__(
in_channel,
out_channels,
kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias,
)
# Choose the padding module depending on whether dynamic padding is needed
if is_dynamic:
self.pad = DynamicPad2d(kernel_size, stride, dilation)
else:
self.pad = None
self.eps = eps
def forward(self, hidden_state):
# If a dynamic padding module exists, pad the input first
if self.pad is not None:
hidden_state = self.pad(hidden_state)
# Standardize the convolution weights: zero mean, unit variance per output filter
weight = nn.functional.batch_norm(
self.weight.reshape(1, self.out_channels, -1), None, None, training=True, momentum=0.0, eps=self.eps
).reshape_as(self.weight)
# Run the convolution with the standardized weights and return the result
hidden_state = nn.functional.conv2d(
hidden_state, weight, self.bias, self.stride, self.padding, self.dilation, self.groups
)
return hidden_state
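# Standalone check of the standardization trick above: batch_norm with no running
# stats normalizes each output filter to zero mean / unit variance (a sketch):
import torch
w = torch.randn(8, 3, 3, 3)  # (out_channels, in_channels, kH, kW)
std_w = torch.nn.functional.batch_norm(
    w.reshape(1, 8, -1), None, None, training=True, momentum=0.0, eps=1e-6
).reshape_as(w)
flat = w.reshape(8, -1)
manual = (flat - flat.mean(1, keepdim=True)) / torch.sqrt(flat.var(1, unbiased=False, keepdim=True) + 1e-6)
assert torch.allclose(std_w.reshape(8, -1), manual, atol=1e-5)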
class BitGroupNormActivation(nn.GroupNorm):
r"""
A module that combines group normalization with an activation function.
"""
def __init__(self, config, num_channels, eps=1e-5, affine=True, apply_activation=True):
# Call nn.GroupNorm's constructor with the group-normalization parameters
super(BitGroupNormActivation, self).__init__(config.num_groups, num_channels, eps=eps, affine=affine)
# Pick the activation function based on apply_activation
if apply_activation:
self.activation = ACT2FN[config.hidden_act]
else:
self.activation = nn.Identity()
def forward(self, hidden_state):
# Apply group normalization, then the selected activation
hidden_state = nn.functional.group_norm(hidden_state, self.num_groups, self.weight, self.bias, self.eps)
hidden_state = self.activation(hidden_state)
return hidden_state
class DynamicPad2d(nn.Module):
r"""
A module that wraps dynamic padding of any input, given the parameters of the convolutional layer and the input
hidden states.
"""
def __init__(self, kernel_size, stride, dilation, value=0):
super().__init__()
# Safety checks
# If kernel_size is an int, convert it to the tuple (kernel_size, kernel_size)
if isinstance(kernel_size, int):
kernel_size = (kernel_size, kernel_size)
# If stride is an int, convert it to the tuple (stride, stride)
if isinstance(stride, int):
stride = (stride, stride)
# If dilation is an int, convert it to the tuple (dilation, dilation)
if isinstance(dilation, int):
dilation = (dilation, dilation)
# Store the parameters on the instance
self.kernel_size = kernel_size
self.stride = stride
self.dilation = dilation
self.value = value
# Helper that computes the total padding needed along one axis for a 'SAME' output size
def compute_padding(x, kernel_size, stride, dilation):
return max((math.ceil(x / stride) - 1) * stride + (kernel_size - 1) * dilation + 1 - x, 0)
# Store the helper on the instance
self.compute_padding = compute_padding
def __call__(self, input):
# Get the height and width of the input tensor
input_height, input_width = input.size()[-2:]
# Compute the padding along the height axis
padding_height = self.compute_padding(input_height, self.kernel_size[0], self.stride[0], self.dilation[0])
# Compute the padding along the width axis
padding_width = self.compute_padding(input_width, self.kernel_size[1], self.stride[1], self.dilation[1])
# Pad only if needed
if padding_height > 0 or padding_width > 0:
# Pad the input tensor symmetrically with nn.functional.pad
input = nn.functional.pad(
input,
[
padding_width // 2,
padding_width - padding_width // 2,
padding_height // 2,
padding_height - padding_height // 2,
],
value=self.value,
)
# Return the padded input tensor
return input
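# Quick check of the SAME-padding arithmetic used above (a standalone sketch):
import math
def same_pad(x, k, s, d):
    return max((math.ceil(x / s) - 1) * s + (k - 1) * d + 1 - x, 0)
print(same_pad(7, 3, 2, 1))  # 2 -> split as (1, 1); output size becomes ceil(7 / 2) == 4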
# A custom 2D max-pooling layer that mimics TensorFlow's 'SAME' behavior
class BitMaxPool2d(nn.MaxPool2d):
"""Tensorflow like 'SAME' wrapper for 2D max pooling"""
def __init__(
self,
kernel_size: int,
stride=None,
dilation=1,
ceil_mode=False,
padding=(0, 0),
padding_value=0,
use_dynamic_padding=True,
):
# If kernel_size is not iterable, convert it to a tuple
kernel_size = kernel_size if isinstance(kernel_size, collections.abc.Iterable) else (kernel_size, kernel_size)
# If stride is not iterable, convert it to a tuple
stride = stride if isinstance(stride, collections.abc.Iterable) else (stride, stride)
# If dilation is not iterable, convert it to a tuple
dilation = dilation if isinstance(dilation, collections.abc.Iterable) else (dilation, dilation)
# Call the parent constructor with kernel_size, stride, padding, dilation, ceil_mode
super().__init__(kernel_size, stride, padding, dilation, ceil_mode)
# If dynamic padding is requested
if use_dynamic_padding:
# Use a DynamicPad2d module as the padding layer
self.pad = DynamicPad2d(kernel_size, stride, dilation, padding_value)
else:
# Otherwise use nn.Identity() as the padding layer
self.pad = nn.Identity()
def forward(self, hidden_states):
# Pad the input hidden states
hidden_states = self.pad(hidden_states)
# Apply max pooling and return the pooled result
return nn.functional.max_pool2d(
hidden_states, self.kernel_size, self.stride, self.padding, self.dilation, self.ceil_mode
)
class BitEmbeddings(nn.Module):
"""
BiT Embeddings (stem) composed of a single aggressive convolution.
"""
def __init__(self, config: BitConfig):
# Call the parent constructor
super().__init__()
# A weight-standardized 2D convolution as the first layer of the BiT stem
self.convolution = WeightStandardizedConv2d(
config.num_channels,
config.embedding_size,
kernel_size=7,
stride=2,
eps=1e-8,
padding=config.global_padding,
)
# A BitMaxPool2d that pools the convolution's feature maps
self.pooler = BitMaxPool2d(kernel_size=3, stride=2, use_dynamic_padding=config.embedding_dynamic_padding)
# Use nn.Identity() when the global padding strategy is 'SAME'; otherwise use constant padding
if config.global_padding is not None and config.global_padding.upper() == "SAME":
self.pad = nn.Identity()
else:
self.pad = nn.ConstantPad2d(padding=(1, 1, 1, 1), value=0.0)
# Use BitGroupNormActivation for normalization and activation unless the layer type is 'preactivation'
if not config.layer_type == "preactivation":
self.norm = BitGroupNormActivation(config, num_channels=config.embedding_size)
else:
self.norm = nn.Identity()
self.num_channels = config.num_channels
def forward(self, pixel_values: Tensor) -> Tensor:
# Check that the channel dimension of the input matches the configured number of channels
num_channels = pixel_values.shape[1]
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
# Convolve the pixel values to obtain the embedding
embedding = self.convolution(pixel_values)
# Pad the convolution output
embedding = self.pad(embedding)
# Normalize and activate the padded feature maps
embedding = self.norm(embedding)
# Pool the normalized feature maps
embedding = self.pooler(embedding)
# Return the final embedding
return embedding
# Copied from transformers.models.convnext.modeling_convnext.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
# If drop_prob is 0 or we are not training, return the input unchanged
if drop_prob == 0.0 or not training:
return input
# Compute the keep probability
keep_prob = 1 - drop_prob
# Build the random tensor's shape so this works for tensors of any rank, not only 2D conv nets
shape = (input.shape[0],) + (1,) * (input.ndim - 1)
# Draw a random tensor on the same device and with the same dtype as the input
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_()  # binarize the random tensor
# Scale the kept samples by 1 / keep_prob and zero out the dropped ones
output = input.div(keep_prob) * random_tensor
# Return the result
return output
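# Sanity sketch for drop_path: scaling survivors by 1 / keep_prob preserves the
# expected activation (assumes the function above is in scope):
import torch
x = torch.ones(10000, 4)
out = drop_path(x, drop_prob=0.2, training=True)
print(out.mean().item())  # close to 1.0; each row is either all 0 or all 1/0.8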
# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Bit
class BitDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob  # store the drop probability
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Apply stochastic depth (drop path) to the input hidden states
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
# Extra module description, including the drop probability
return "p={}".format(self.drop_prob)
def make_div(value, divisor=8):
# Round value to the nearest multiple of divisor, never going below the divisor itself
min_value = divisor
new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
if new_value < 0.9 * value:
new_value += divisor
return new_value
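# Illustrative values for make_div (divisor defaults to 8):
#
#   make_div(512 * 0.25)  -> 128   # already a multiple of 8
#   make_div(100)         -> 104   # rounds to the nearest multiple of 8
#   make_div(3)           -> 8     # never drops below the divisor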
class BitPreActivationBottleneckLayer(nn.Module):
"""Pre-activation (v2) bottleneck block.
Follows the implementation of "Identity Mappings in Deep Residual Networks":
https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua
Except it puts the stride on 3x3 conv when available.
"""
def __init__(
self,
config,
in_channels,
out_channels=None,
bottle_ratio=0.25,
stride=1,
dilation=1,
first_dilation=None,
groups=1,
drop_path_rate=0.0,
is_first_layer=False,
):
super().__init__()
first_dilation = first_dilation or dilation
out_channels = out_channels or in_channels
mid_channels = make_div(out_channels * bottle_ratio)
if is_first_layer:
# For the first layer, use a BitDownsampleConv as the downsampling shortcut
self.downsample = BitDownsampleConv(
config,
in_channels,
out_channels,
stride=stride,
preact=True,
)
else:
self.downsample = None  # otherwise no downsampling
self.norm1 = BitGroupNormActivation(config, in_channels)  # first normalization + activation
self.conv1 = WeightStandardizedConv2d(in_channels, mid_channels, 1, eps=1e-8, padding=config.global_padding)  # first convolution
self.norm2 = BitGroupNormActivation(config, num_channels=mid_channels)  # second normalization + activation
self.conv2 = WeightStandardizedConv2d(
mid_channels, mid_channels, 3, stride=stride, groups=groups, eps=1e-8, padding=config.global_padding
)  # second convolution, carrying the stride and group settings
self.norm3 = BitGroupNormActivation(config, mid_channels)  # third normalization + activation
self.conv3 = WeightStandardizedConv2d(mid_channels, out_channels, 1, eps=1e-8, padding=config.global_padding)  # third convolution
self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()  # stochastic depth (drop path)
# Forward pass; takes the hidden states as input
def forward(self, hidden_states):
# Pre-activation: normalize the hidden states first
hidden_states_preact = self.norm1(hidden_states)
# Shortcut branch
shortcut = hidden_states
# If a downsampling module is defined, apply it to the pre-activated hidden states
if self.downsample is not None:
shortcut = self.downsample(hidden_states_preact)
# Residual branch
# First convolution on the pre-activated hidden states
hidden_states = self.conv1(hidden_states_preact)
# Second convolution after the second normalization
hidden_states = self.conv2(self.norm2(hidden_states))
# Third convolution after the third normalization
hidden_states = self.conv3(self.norm3(hidden_states))
# Apply the drop path
hidden_states = self.drop_path(hidden_states)
# Add the residual branch to the shortcut and return the result
return hidden_states + shortcut
class BitBottleneckLayer(nn.Module):
"""Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. Used for ViT Hybrid."""
def __init__(
self,
config,
in_channels,
out_channels=None,
bottle_ratio=0.25,
stride=1,
dilation=1,
first_dilation=None,
groups=1,
drop_path_rate=0.0,
is_first_layer=False,
):
super().__init__()
first_dilation = first_dilation or dilation  # fall back to dilation if first_dilation is not given
out_channels = out_channels or in_channels  # fall back to the input channel count
mid_chs = make_div(out_channels * bottle_ratio)  # mid channels, rounded by make_div to a divisible value
if is_first_layer:
# For the first layer, downsample with a BitDownsampleConv
self.downsample = BitDownsampleConv(
config,
in_channels,
out_channels,
stride=stride,
preact=False,  # no pre-activation
)
else:
self.downsample = None  # otherwise no downsampling
# First convolution
self.conv1 = WeightStandardizedConv2d(in_channels, mid_chs, 1, eps=1e-8, padding=config.global_padding)
self.norm1 = BitGroupNormActivation(config, num_channels=mid_chs)  # normalization + activation after conv1
# Second convolution
self.conv2 = WeightStandardizedConv2d(
mid_chs,
mid_chs,
3,
stride=stride,
dilation=first_dilation,
groups=groups,
eps=1e-8,
padding=config.global_padding,
)
self.norm2 = BitGroupNormActivation(config, num_channels=mid_chs)  # normalization + activation after conv2
# Third convolution
self.conv3 = WeightStandardizedConv2d(mid_chs, out_channels, 1, eps=1e-8, padding=config.global_padding)
self.norm3 = BitGroupNormActivation(config, num_channels=out_channels, apply_activation=False)  # output normalization, no activation
self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()  # drop path
self.activation = ACT2FN[config.hidden_act]  # activation function
def forward(self, hidden_states):
# Shortcut branch, i.e. the residual connection
shortcut = hidden_states
if self.downsample is not None:
shortcut = self.downsample(hidden_states)  # apply downsampling to the shortcut if present
# Residual branch
hidden_states = self.conv1(hidden_states)  # first convolution
hidden_states = self.norm1(hidden_states)  # first normalization + activation
hidden_states = self.conv2(hidden_states)  # second convolution
hidden_states = self.norm2(hidden_states)  # second normalization + activation
hidden_states = self.conv3(hidden_states)  # third convolution
hidden_states = self.norm3(hidden_states)  # output normalization, no activation
hidden_states = self.drop_path(hidden_states)  # apply drop path
hidden_states = self.activation(hidden_states + shortcut)  # add the shortcut, then activate
return hidden_states
class BitDownsampleConv(nn.Module):
def __init__(
self,
config,
in_channels,
out_channels,
stride=1,
preact=True,
):
super().__init__()
self.conv = WeightStandardizedConv2d(
in_channels, out_channels, 1, stride=stride, eps=1e-8, padding=config.global_padding
)
self.norm = (
nn.Identity()
if preact
else BitGroupNormActivation(config, num_channels=out_channels, apply_activation=False)
)  # use nn.Identity() if preact is True, otherwise normalize with BitGroupNormActivation
# Forward pass over input x
def forward(self, x):
# First run the input through the convolution
conv_output = self.conv(x)
# Then normalize the convolution output
normalized_output = self.norm(conv_output)
# Return the normalized result
return normalized_output
# A ResNet v2 stage composed of stacked layers
class BitStage(nn.Module):
"""
A ResNet v2 stage composed by stacked layers.
"""
def __init__(
self,
config,
in_channels,
out_channels,
stride,
dilation,
depth,
bottle_ratio=0.25,
layer_dropout=None,
):
super().__init__()
# Set the first layer's dilation based on the dilation parameter
first_dilation = 1 if dilation in (1, 2) else 2
# Choose the layer class based on the configuration: bottleneck or pre-activation bottleneck
if config.layer_type == "bottleneck":
layer_cls = BitBottleneckLayer
else:
layer_cls = BitPreActivationBottleneckLayer
prev_chs = in_channels
self.layers = nn.Sequential()
# Build the stack of layers up to the requested depth
for layer_idx in range(depth):
# Get the hyper-parameters for the current layer
stride, drop_path_rate, is_first_layer = self._get_updated_hyperparameters(
layer_idx, stride, layer_dropout
)
# Append the current layer to the sequence
self.layers.add_module(
str(layer_idx),
layer_cls(
config,
prev_chs,
out_channels,
stride=stride,
dilation=dilation,
bottle_ratio=bottle_ratio,
first_dilation=first_dilation,
drop_path_rate=drop_path_rate,
is_first_layer=is_first_layer,
),
)
prev_chs = out_channels
first_dilation = dilation
# Internal helper that returns the updated hyper-parameters
def _get_updated_hyperparameters(self, layer_idx, stride, layer_dropout):
"""
Get the new hyper-parameters with respect to the previous ones and the index of the current layer.
"""
# If per-layer dropout is configured, pick the current layer's drop-path rate
if layer_dropout:
drop_path_rate = layer_dropout[layer_idx]
else:
drop_path_rate = 0.0
# Only the first layer keeps the stage stride; later layers use stride 1
if layer_idx != 0:
stride = 1
# Whether the current layer is the first one in the stage
is_first_layer = layer_idx == 0
return stride, drop_path_rate, is_first_layer
# Forward pass: run the input through each layer in turn
def forward(self, input: Tensor) -> Tensor:
hidden_state = input
for _, layer in enumerate(self.layers):
hidden_state = layer(hidden_state)
return hidden_state
class BitEncoder(nn.Module):
# Constructor taking a BitConfig configuration object
def __init__(self, config: BitConfig):
# Call the parent constructor
super().__init__()
# Start with an empty module list
self.stages = nn.ModuleList([])
# The initial channel count is the configured embedding size
prev_chs = config.embedding_size
# The stem fixes the current stride at 4 and the dilation at 1
current_stride = 4
dilation = 1
# Compute the per-layer drop-path rates, split per stage
layer_dropouts = [
x.tolist()
for x in torch.Tensor(np.linspace(0, config.drop_path_rate, sum(config.depths))).split(config.depths)
]
# Iterate over the stages: current_depth is the stage depth, current_hidden_size the hidden size,
# and layer_dropout the per-layer drop-path rates for this stage
for stage_idx, (current_depth, current_hidden_size, layer_dropout) in enumerate(
zip(config.depths, config.hidden_sizes, layer_dropouts)
):
# Get the updated hyper-parameters
out_channels, stride, dilation = self._get_updated_hyperparameters(
stage_idx, current_stride, current_hidden_size, dilation, config
)
# Build the BitStage module and add it to self.stages
stage = BitStage(
config,
prev_chs,
out_channels,
stride=stride,
dilation=dilation,
depth=current_depth,
layer_dropout=layer_dropout,
)
prev_chs = out_channels
current_stride *= stride
self.stages.add_module(str(stage_idx), stage)
# Helper that returns the updated hyper-parameters for a stage
def _get_updated_hyperparameters(self, stage_idx, current_stride, current_hidden_size, dilation, config):
# Compute the output channels, rounded to a divisible value
out_channels = make_div(current_hidden_size * config.width_factor)
# The first stage uses stride 1, the others stride 2
stride = 1 if stage_idx == 0 else 2
# If the current stride already reached the configured output stride, trade stride for dilation
if current_stride >= config.output_stride:
dilation *= stride
stride = 1
# Return the updated output channels, stride, and dilation
return out_channels, stride, dilation
# Forward pass
def forward(
self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True
) -> BaseModelOutputWithNoAttention:
# If hidden states should be returned, start with an empty tuple
hidden_states = () if output_hidden_states else None
# Run the hidden state through each BitStage
for stage_module in self.stages:
if output_hidden_states:
hidden_states = hidden_states + (hidden_state,)
hidden_state = stage_module(hidden_state)
# Append the final hidden state if hidden states are requested
if output_hidden_states:
hidden_states = hidden_states + (hidden_state,)
# If a dict is not requested, return the non-None values as a tuple
if not return_dict:
return tuple(v for v in [hidden_state, hidden_states] if v is not None)
# Otherwise return a BaseModelOutputWithNoAttention instance
return BaseModelOutputWithNoAttention(
last_hidden_state=hidden_state,
hidden_states=hidden_states,
)
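# Stride bookkeeping in the encoder above, traced for the default BiT-50 config
# (depths=[3, 4, 6, 3], output_stride=32): the stem contributes stride 4, the four
# stages use strides [1, 2, 2, 2], and dilation stays 1, so the overall network
# stride is 4 * 1 * 2 * 2 * 2 = 32, matching output_stride. With a smaller
# output_stride, later stages would swap their stride for dilation instead.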
@add_start_docstrings(
"The bare BiT model outputting raw features without any specific head on top.",
BIT_START_DOCSTRING,
)
class BitModel(BitPreTrainedModel):
"""
BiT 模型的抽象类,负责权重初始化、预训练模型的下载和加载接口。
"""
def __init__(self, config):
"""
初始化函数,设置模型结构及参数。
Args:
config (BitConfig): 模型的配置类,包含模型的所有参数。
Attributes:
embedder (BitEmbeddings): BiT 模型的嵌入层。
encoder (BitEncoder): BiT 模型的编码器。
norm (nn.Module): 规范化层,根据配置决定是分组规范化还是身份映射。
pooler (nn.Module): 自适应平均池化层,用于汇总特征。
"""
super().__init__(config)
self.config = config
self.embedder = BitEmbeddings(config)
self.encoder = BitEncoder(config)
self.norm = (
BitGroupNormActivation(config, num_channels=config.hidden_sizes[-1])
if config.layer_type == "preactivation"
else nn.Identity()
)
self.pooler = nn.AdaptiveAvgPool2d((1, 1))
# 初始化权重并应用最终处理
self.post_init()
@add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndNoAttention,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
) -> BaseModelOutputWithPoolingAndNoAttention:
"""
Forward pass of the BiT model.
Args:
pixel_values (Tensor): Input pixel values of shape (batch_size, num_channels, height, width).
output_hidden_states (bool, optional): Whether to return the hidden states of all layers.
return_dict (bool, optional): Whether to return a ModelOutput instead of a plain tuple.
"""
# Use the given output_hidden_states if provided; otherwise fall back to the config default
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Use the given return_dict if provided; otherwise fall back to the config default
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Embed the pixel values
embedding_output = self.embedder(pixel_values)
# Encode the embedding; output_hidden_states and return_dict control the output format
encoder_outputs = self.encoder(
embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict
)
# Take the encoder's last hidden state
last_hidden_state = encoder_outputs[0]
# Normalize the last hidden state
last_hidden_state = self.norm(last_hidden_state)
# Pool the normalized last hidden state
pooled_output = self.pooler(last_hidden_state)
# If return_dict is False, return a tuple of the last hidden state, the pooled output,
# and the remaining encoder outputs
if not return_dict:
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
# Otherwise return a BaseModelOutputWithPoolingAndNoAttention instance
return BaseModelOutputWithPoolingAndNoAttention(
last_hidden_state=last_hidden_state,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
)
# Decorator adding a docstring: a BiT model with an image-classification head, e.g. for ImageNet
@add_start_docstrings(
"""
BiT Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
ImageNet.
""",
BIT_START_DOCSTRING,  # reuse the shared BiT docstring template
)
class BitForImageClassification(BitPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels  # number of labels from the configuration
self.bit = BitModel(config)  # the underlying BiT model
# Classification head; the output layer depends on the configuration
self.classifier = nn.Sequential(
nn.Flatten(),  # flatten the pooled features
nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity(),  # linear classifier or identity
)
# Initialize the weights and apply final processing
self.post_init()
# Decorators adding the input/output documentation for the forward pass
@add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,  # shared checkpoint reference
output_type=ImageClassifierOutputWithNoAttention,  # shared output type
config_class=_CONFIG_FOR_DOC,  # shared configuration class
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,  # shared expected output
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,  # input pixel values, optional
labels: Optional[torch.LongTensor] = None,  # ground-truth labels, optional
output_hidden_states: Optional[bool] = None,  # whether to output hidden states, optional
return_dict: Optional[bool] = None,  # whether to return a dict-style output, optional
) -> ImageClassifierOutputWithNoAttention:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# Use the given return_dict if provided; otherwise fall back to self.config.use_return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the BiT backbone on the pixel values
outputs = self.bit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
# Use outputs.pooler_output if return_dict is True, otherwise the second tuple element
pooled_output = outputs.pooler_output if return_dict else outputs[1]
# Feed the pooled output through the classifier to get the logits
logits = self.classifier(pooled_output)
# Initialize the loss to None
loss = None
# If labels are given, compute the loss
if labels is not None:
# If self.config.problem_type is unset, infer it
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# Pick the loss function based on the problem type
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
# Single-target regression: MSE between squeezed logits and labels
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
# Multi-target regression: MSE between logits and labels
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
# Single-label classification: cross-entropy loss
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
# Multi-label classification: binary cross-entropy with logits
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# If return_dict is False, build the output tuple
if not return_dict:
output = (logits,) + outputs[2:]
# Prepend the loss if it was computed
return (loss,) + output if loss is not None else output
# Otherwise return an ImageClassifierOutputWithNoAttention with loss, logits, and hidden states
return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
@add_start_docstrings(
"""
BiT backbone, to be used with frameworks like DETR and MaskFormer.
""",
BIT_START_DOCSTRING,
)
class BitBackbone(BitPreTrainedModel, BackboneMixin):
def __init__(self, config):
# Call the parent constructor with the configuration
super().__init__(config)
# Initialize the backbone machinery from the parent class
super()._init_backbone(config)
# Create the BiT model
self.bit = BitModel(config)
# Feature dimensions: embedding size followed by the hidden sizes
self.num_features = [config.embedding_size] + config.hidden_sizes
# Initialize the weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
) -> BackboneOutput:
"""
Returns:
Examples:
```
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> processor = AutoImageProcessor.from_pretrained("google/resnetnv2-50")
>>> model = AutoBackbone.from_pretrained("google/resnetnv2-50")
>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
```"""
# Use the given return_dict if provided; otherwise fall back to the config default
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Use the given output_hidden_states if provided; otherwise fall back to the config default
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Run the BiT model's forward pass
outputs = self.bit(pixel_values, output_hidden_states=True, return_dict=True)
hidden_states = outputs.hidden_states
feature_maps = ()
for idx, stage in enumerate(self.stage_names):
# Collect the hidden state of every stage listed in out_features
if stage in self.out_features:
feature_maps += (hidden_states[idx],)
# If a dict is not requested, return the outputs as a tuple
if not return_dict:
output = (feature_maps,)
if output_hidden_states:
output += (outputs.hidden_states,)
return output
# Return a BackboneOutput with the feature maps, hidden states (if requested), and attentions (always None)
return BackboneOutput(
feature_maps=feature_maps,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=None,
)
.\models\bit\__init__.py
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
# Import the exception-handling and lazy-loading utilities from the utils package
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
# Define the import structure, starting with the configuration entries
_import_structure = {"configuration_bit": ["BIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BitConfig", "BitOnnxConfig"]}
# Check whether torch is available; raise an exception if it is not
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# If available, add the model entries to the _import_structure dict
_import_structure["modeling_bit"] = [
"BIT_PRETRAINED_MODEL_ARCHIVE_LIST",
"BitForImageClassification",
"BitModel",
"BitPreTrainedModel",
"BitBackbone",
]
# Check whether the vision backend is available; raise an exception if it is not
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# If available, add the image-processing entries to the _import_structure dict
_import_structure["image_processing_bit"] = ["BitImageProcessor"]
# Under type checking, import the classes and constants from their modules
if TYPE_CHECKING:
from .configuration_bit import BIT_PRETRAINED_CONFIG_ARCHIVE_MAP, BitConfig, BitOnnxConfig
# Check whether torch is available; raise an exception if it is not
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# If available, import the model classes
from .modeling_bit import (
BIT_PRETRAINED_MODEL_ARCHIVE_LIST,
BitBackbone,
BitForImageClassification,
BitModel,
BitPreTrainedModel,
)
# Check whether the vision backend is available; raise an exception if it is not
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# If available, import the image-processor class
from .image_processing_bit import BitImageProcessor
# Outside of type checking, set the module up for lazy loading
else:
import sys
# Replace the current module with a lazy module that defers the imports above
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
.\models\blenderbot\configuration_blenderbot.py
# coding=utf-8
# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Blenderbot model configuration
"""
# 导入必要的模块和类
from collections import OrderedDict # 导入有序字典类
from typing import Any, Mapping, Optional # 导入类型提示相关的类和函数
from ... import PreTrainedTokenizer # 导入预训练分词器类
from ...configuration_utils import PretrainedConfig # 导入预训练配置类
from ...file_utils import TensorType, is_torch_available # 导入与文件操作相关的函数和类
from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast # 导入与ONNX相关的配置类
from ...onnx.utils import compute_effective_axis_dimension # 导入计算有效轴维度的函数
from ...utils import logging # 导入日志记录工具
logger = logging.get_logger(__name__) # 获取当前模块的日志记录器
BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/config.json",
# 查看所有Blenderbot模型请访问 https://huggingface.co/models?filter=blenderbot
}
class BlenderbotConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`BlenderbotModel`]. It is used to instantiate an
Blenderbot model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Blenderbot
[facebook/blenderbot-3B](https://huggingface.co/facebook/blenderbot-3B) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import BlenderbotConfig, BlenderbotModel
>>> # Initializing a Blenderbot facebook/blenderbot-3B style configuration
>>> configuration = BlenderbotConfig()
>>> # Initializing a model (with random weights) from the facebook/blenderbot-3B style configuration
>>> model = BlenderbotModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "blenderbot"  # the model type
keys_to_ignore_at_inference = ["past_key_values"]  # keys ignored at inference time
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}  # attribute aliases
# Constructor for a Blenderbot (Transformer) configuration
def __init__(
self,
vocab_size=8008,  # vocabulary size, defaults to 8008
max_position_embeddings=128,  # maximum position-embedding length, defaults to 128
encoder_layers=2,  # number of encoder layers, defaults to 2
encoder_ffn_dim=10240,  # encoder FFN dimension, defaults to 10240
encoder_attention_heads=32,  # number of encoder attention heads, defaults to 32
decoder_layers=24,  # number of decoder layers, defaults to 24
decoder_ffn_dim=10240,  # decoder FFN dimension, defaults to 10240
decoder_attention_heads=32,  # number of decoder attention heads, defaults to 32
encoder_layerdrop=0.0,  # encoder layer-drop rate, defaults to 0.0 (no drop)
decoder_layerdrop=0.0,  # decoder layer-drop rate, defaults to 0.0 (no drop)
use_cache=True,  # whether to use the cache, defaults to True
is_encoder_decoder=True,  # whether this is an encoder-decoder model, defaults to True
activation_function="gelu",  # activation function, defaults to GELU
d_model=2560,  # model dimension, defaults to 2560
dropout=0.1,  # dropout rate for fully connected layers, defaults to 0.1
attention_dropout=0.0,  # attention dropout rate, defaults to 0.0 (no dropout)
activation_dropout=0.0,  # activation dropout rate, defaults to 0.0 (no dropout)
init_std=0.02,  # initialization standard deviation, defaults to 0.02
decoder_start_token_id=1,  # decoder start token id, defaults to 1
scale_embedding=False,  # whether to scale the embeddings, defaults to False
pad_token_id=0,  # padding token id, defaults to 0
bos_token_id=1,  # beginning-of-sequence token id, defaults to 1
eos_token_id=2,  # end-of-sequence token id, defaults to 2
encoder_no_repeat_ngram_size=3,  # size of encoder n-grams that may not repeat, defaults to 3
forced_eos_token_id=2,  # forced end-of-sequence token id, defaults to 2
**kwargs,  # remaining keyword arguments
):
self.vocab_size = vocab_size  # set the vocabulary size
self.max_position_embeddings = max_position_embeddings  # set the maximum position-embedding length
self.d_model = d_model  # set the model dimension
self.encoder_ffn_dim = encoder_ffn_dim  # set the encoder FFN dimension
self.encoder_layers = encoder_layers  # set the number of encoder layers
self.encoder_attention_heads = encoder_attention_heads  # set the number of encoder attention heads
self.decoder_ffn_dim = decoder_ffn_dim  # set the decoder FFN dimension
self.decoder_layers = decoder_layers  # set the number of decoder layers
self.decoder_attention_heads = decoder_attention_heads  # set the number of decoder attention heads
self.dropout = dropout  # set the dropout for fully connected layers
self.attention_dropout = attention_dropout  # set the attention dropout
self.activation_dropout = activation_dropout  # set the activation dropout
self.activation_function = activation_function  # set the activation function
self.init_std = init_std  # set the initialization standard deviation
self.encoder_layerdrop = encoder_layerdrop  # set the encoder layer-drop rate
self.decoder_layerdrop = decoder_layerdrop  # set the decoder layer-drop rate
self.use_cache = use_cache  # set whether the cache is used
self.num_hidden_layers = encoder_layers  # the number of hidden layers equals the encoder layer count
self.scale_embedding = scale_embedding  # if True, the embedding scale factor is sqrt(d_model)
# Call the parent constructor with the shared arguments
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
decoder_start_token_id=decoder_start_token_id,
encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
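# Sketch of the attribute_map aliasing defined above (standard PretrainedConfig behavior):
#
#   from transformers import BlenderbotConfig
#   cfg = BlenderbotConfig()
#   cfg.hidden_size == cfg.d_model                          # True
#   cfg.num_attention_heads == cfg.encoder_attention_heads  # True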
# BlenderbotOnnxConfig inherits from OnnxSeq2SeqConfigWithPast
class BlenderbotOnnxConfig(OnnxSeq2SeqConfigWithPast):
# The `inputs` property returns the mapping from input names to their dynamic axes
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
# Choose the common input format according to the task
if self.task in ["default", "seq2seq-lm"]:
# Default or sequence-to-sequence language-modeling task
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
]
)
# With past key values, the decoder only receives the current token
if self.use_past:
common_inputs["decoder_input_ids"] = {0: "batch"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
else:
common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
# When past key values are used, add them to the common inputs
if self.use_past:
self.fill_with_past_key_values_(common_inputs, direction="inputs")
elif self.task == "causal-lm":
# Causal language-modeling task
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
]
)
# With past key values, declare the past key/value inputs for every decoder layer
if self.use_past:
_, num_decoder_layers = self.num_layers
for i in range(num_decoder_layers):
common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
else:
# Otherwise return the full input format with both encoder and decoder inputs
common_inputs = OrderedDict(
[
("input_ids", {0: "batch", 1: "encoder_sequence"}),
("attention_mask", {0: "batch", 1: "encoder_sequence"}),
("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}),
("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}),
]
)
return common_inputs
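Concretely, for the "default" task with `use_past=False`, the property above evaluates to the following mapping (a sketch derived from the code, not captured output):
```
{
"input_ids": {0: "batch", 1: "encoder_sequence"},
"attention_mask": {0: "batch", 1: "encoder_sequence"},
"decoder_input_ids": {0: "batch", 1: "decoder_sequence"},
"decoder_attention_mask": {0: "batch", 1: "decoder_sequence"},
}
```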
@property
# The `outputs` property returns the mapping from output names to their dynamic axes
# Copied from transformers.models.bart.configuration_bart.BartOnnxConfig.outputs
def outputs(self) -> Mapping[str, Mapping[int, str]]:
# Choose the common output format according to the task
if self.task in ["default", "seq2seq-lm"]:
# Default or seq2seq language modeling: use the parent class outputs
common_outputs = super().outputs
else:
# Otherwise use the outputs of the grandparent class of OnnxConfigWithPast
common_outputs = super(OnnxConfigWithPast, self).outputs
# With past key values, declare the "present" outputs for every encoder layer
if self.use_past:
num_encoder_layers, _ = self.num_layers
for i in range(num_encoder_layers):
common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
return common_outputs
# Generate dummy inputs for the "default" and "seq2seq-lm" tasks
def _generate_dummy_inputs_for_default_and_seq2seq_lm(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
# Generate the encoder inputs
encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, seq_length, is_pair, framework
)
# Generate the decoder inputs; with past key values the decoder sequence length is 1,
# otherwise it matches the encoder sequence length
decoder_seq_length = seq_length if not self.use_past else 1
decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, decoder_seq_length, is_pair, framework
)
# Prefix the decoder input names with "decoder_"
decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
# Merge encoder and decoder inputs into one dict
common_inputs = dict(**encoder_inputs, **decoder_inputs)
if self.use_past:
# PyTorch is required to build the dummy past key values
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
# Batch size and sequence lengths of the dummy inputs
batch, encoder_seq_length = common_inputs["input_ids"].shape
decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
# Numbers of attention heads
num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads
# Shapes of the encoder and decoder past key/value tensors
encoder_shape = (
batch,
num_encoder_attention_heads,
encoder_seq_length,
self._config.hidden_size // num_encoder_attention_heads,
)
decoder_past_length = decoder_seq_length
decoder_shape = (
batch,
num_decoder_attention_heads,
decoder_past_length,
self._config.hidden_size // num_decoder_attention_heads,
)
# Extend the decoder attention mask with ones so that it covers the past key values
common_inputs["decoder_attention_mask"] = torch.cat(
[common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
)
# Build the list of past key values
common_inputs["past_key_values"] = []
# Number of decoder layers
_, num_decoder_layers = self.num_layers
# One (self-attn key, self-attn value, cross-attn key, cross-attn value) tuple per decoder layer
for _ in range(num_decoder_layers):
common_inputs["past_key_values"].append(
(
torch.zeros(decoder_shape),
torch.zeros(decoder_shape),
torch.zeros(encoder_shape),
torch.zeros(encoder_shape),
)
)
return common_inputs
# Generate dummy inputs for the "causal-lm" task
def _generate_dummy_inputs_for_causal_lm(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
# Generate base dummy inputs with the given tokenizer, batch_size, seq_length, is_pair and framework
common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size, seq_length, is_pair, framework
)
if self.use_past:
# PyTorch is required to build the dummy past key values
if not is_torch_available():
raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
else:
import torch
# Batch size and sequence length of the dummy inputs
batch, seqlen = common_inputs["input_ids"].shape
past_key_values_length = seqlen
# Number of decoder layers
_, num_decoder_layers = self.num_layers
# Number of encoder attention heads
num_encoder_attention_heads, _ = self.num_attention_heads
# Shape of the past key/value tensors
past_shape = (
batch,
num_encoder_attention_heads,
past_key_values_length,
self._config.hidden_size // num_encoder_attention_heads,
)
# Extend the attention mask with ones to cover the past key values, keeping its dtype
mask_dtype = common_inputs["attention_mask"].dtype
common_inputs["attention_mask"] = torch.cat(
[common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
)
# One (key, value) placeholder pair per decoder layer
common_inputs["past_key_values"] = [
(torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_decoder_layers)
]
return common_inputs
# Copied from transformers.models.bart.configuration_bart.BartOnnxConfig._generate_dummy_inputs_for_sequence_classification_and_question_answering
def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
# Copied from OnnxConfig.generate_dummy_inputs
# If the batch axis is dynamic (-1), substitute a fixed dimension so that ONNX cannot optimize for it
batch_size = compute_effective_axis_dimension(
batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
)
# Same for the sequence axis, reserving room for the special tokens the tokenizer will add
token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
seq_length = compute_effective_axis_dimension(
seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
)
# Build dummy text from the computed batch and sequence sizes
dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
# Tokenize the dummy text into a dict of framework tensors
common_inputs = dict(tokenizer(dummy_input, return_tensors=framework))
return common_inputs
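To make the dummy shapes concrete: with fully dynamic axes (`batch_size=-1`, `seq_length=-1`) and assuming the usual defaults `OnnxConfig.default_fixed_batch == 2` and `OnnxConfig.default_fixed_sequence == 8`, the computation goes:
```
# compute_effective_axis_dimension(-1, fixed_dimension=2, num_token_to_add=0) -> 2
# compute_effective_axis_dimension(-1, fixed_dimension=8, num_token_to_add=2) -> 6
# (two positions reserved for special tokens), so after tokenization
# the dummy batch has tensors of shape (2, 8).
```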
# Copied from transformers.models.bart.configuration_bart.BartOnnxConfig.generate_dummy_inputs
# Dispatch dummy-input generation according to the task and return a dict
def generate_dummy_inputs(
self,
tokenizer: PreTrainedTokenizer,
batch_size: int = -1,
seq_length: int = -1,
is_pair: bool = False,
framework: Optional[TensorType] = None,
) -> Mapping[str, Any]:
# "default" and "seq2seq-lm" share the same generator
if self.task in ["default", "seq2seq-lm"]:
common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
# "causal-lm" has its own generator
elif self.task == "causal-lm":
common_inputs = self._generate_dummy_inputs_for_causal_lm(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
# Everything else falls back to the sequence-classification / question-answering generator
else:
common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
)
return common_inputs
# Copied from BartOnnxConfig._flatten_past_key_values_
def _flatten_past_key_values_(self, flattened_output, name, idx, t):
# For "default" and "seq2seq-lm", defer to the parent class
if self.task in ["default", "seq2seq-lm"]:
flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t)
# Otherwise use the implementation of the grandparent class of OnnxSeq2SeqConfigWithPast
else:
flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_(
flattened_output, name, idx, t
)
# Fill `inputs_or_outputs` with past key/value entries for the given direction
def fill_with_past_key_values_(self, inputs_or_outputs: Mapping[str, Mapping[int, str]], direction: str):
# The direction must be "inputs" or "outputs"
if direction not in ["inputs", "outputs"]:
raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given')
# Inputs are named "past_key_values", outputs are named "present"
name = "past_key_values" if direction == "inputs" else "present"
# Unpack the layer counts; only the decoder layer count is needed here
_, num_decoder_layers = self.num_layers
# Dynamic-axis names for the encoder and decoder sequences
encoder_sequence = "past_encoder_sequence"
decoder_sequence = "past_decoder_sequence" if direction == "inputs" else "past_decoder_sequence + sequence"
# Declare the four past tensors (decoder/encoder x key/value) for every decoder layer
for i in range(num_decoder_layers):
inputs_or_outputs[f"{name}.{i}.decoder.key"] = {0: "batch", 2: decoder_sequence}
inputs_or_outputs[f"{name}.{i}.decoder.value"] = {0: "batch", 2: decoder_sequence}
inputs_or_outputs[f"{name}.{i}.encoder.key"] = {0: "batch", 2: encoder_sequence}
inputs_or_outputs[f"{name}.{i}.encoder.value"] = {0: "batch", 2: encoder_sequence}
.\models\blenderbot\convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py
# File encoding: UTF-8
# Copyright and license notice
# This script is licensed under the Apache License, Version 2.0; see
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, this software is distributed on an
# "AS IS" basis, without warranties or conditions of any kind, either express or implied.
# See the License for the specific language governing permissions and limitations.
"""Convert Blenderbot checkpoint."""
# Import required libraries and modules
import argparse  # command-line argument parsing
import torch  # PyTorch
# Import BlenderbotConfig and BlenderbotForConditionalGeneration from transformers
from transformers import BlenderbotConfig, BlenderbotForConditionalGeneration
# Import the logging module from transformers.utils
from transformers.utils import logging
# Set the log level to info
logging.set_verbosity_info()
# Get a module-level logger
logger = logging.get_logger(__name__)
# Rename patterns applied to state-dict keys (ParlAI name -> Hugging Face name)
PATTERNS = [
["attention", "attn"],
["encoder_attention", "encoder_attn"],
["q_lin", "q_proj"],
["k_lin", "k_proj"],
["v_lin", "v_proj"],
["out_lin", "out_proj"],
["norm_embeddings", "layernorm_embedding"],
["position_embeddings", "embed_positions"],
["embeddings", "embed_tokens"],
["ffn.lin", "fc"],
]
# Rename a single state-dict key according to the rules above
def rename_state_dict_key(k):
# Special case: "embeddings.weight" becomes "shared.weight"
if k == "embeddings.weight":
return "shared.weight"
# Apply every (parlai_name, hf_name) pattern in order
for parlai_name, hf_name in PATTERNS:
k = k.replace(parlai_name, hf_name)
# Additional renames specific to encoder and decoder sub-modules
if k.startswith("encoder"):
k = k.replace(".attn", ".self_attn")
k = k.replace("norm1", "self_attn_layer_norm")
k = k.replace("norm2", "final_layer_norm")
elif k.startswith("decoder"):
k = k.replace("norm1", "self_attn_layer_norm")
k = k.replace("norm2", "encoder_attn_layer_norm")
k = k.replace("norm3", "final_layer_norm")
return k
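A few illustrative renames, traced through the rules above (the input keys are hypothetical ParlAI-style names):
```
>>> rename_state_dict_key("embeddings.weight")
'shared.weight'
>>> rename_state_dict_key("encoder.attention.q_lin.weight")  # attention->attn, q_lin->q_proj, .attn->.self_attn
'encoder.self_attn.q_proj.weight'
>>> rename_state_dict_key("decoder.norm2.bias")  # norm2 in the decoder is the cross-attention LayerNorm
'decoder.encoder_attn_layer_norm.bias'
```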
# Rename the layernorm_embedding keys to layer_norm
def rename_layernorm_keys(sd):
# The LayerNorm keys that need to be renamed
keys = [
"model.encoder.layernorm_embedding.weight",
"model.encoder.layernorm_embedding.bias",
"model.decoder.layernorm_embedding.weight",
"model.decoder.layernorm_embedding.bias",
]
# Pop each old key, build the new key name, and re-insert the value
for k in keys:
v = sd.pop(k)  # remove the old key and keep its value
new_k = k.replace("layernorm_embedding", "layer_norm")  # build the new key name
assert new_k not in sd  # the new key must not already exist
sd[new_k] = v  # store the value under the new key
# Keys that are skipped during conversion
IGNORE_KEYS = ["START"]
# Convert a ParlAI checkpoint into a Blenderbot-compatible checkpoint
@torch.no_grad()
def convert_parlai_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_json_path):
"""
Copy/paste/tweak model's weights to our BERT structure.
"""
# Load the checkpoint on CPU
model = torch.load(checkpoint_path, map_location="cpu")
sd = model["model"]  # the raw state dict
cfg = BlenderbotConfig.from_json_file(config_json_path)  # load the configuration from a JSON file
m = BlenderbotForConditionalGeneration(cfg)  # build a Blenderbot model from the configuration
valid_keys = m.model.state_dict().keys()  # the set of keys the Blenderbot model expects
failures = []  # keys that could not be mapped
mapping = {}  # successfully mapped key/value pairs
for k, v in sd.items():
if k in IGNORE_KEYS:  # skip ignored keys
continue
new_k = rename_state_dict_key(k)  # rename the key according to the rules above
if new_k not in valid_keys:  # record keys that do not exist in the target model
failures.append([k, new_k])
else:
mapping[new_k] = v  # keep the mapped key/value pair
# cfg.normalize_before being True means this is a Blenderbot-3B checkpoint,
# whose layernorm_embedding keys must be renamed to layer_norm
if cfg.normalize_before:
rename_layernorm_keys(sd)
# Load the mapped state dict, requiring an exact match
m.model.load_state_dict(mapping, strict=True)
# Convert the model to half precision
m.half()
# Save the model to the given dump folder
m.save_pretrained(pytorch_dump_folder_path)
if __name__ == "__main__":
# Run the conversion when the script is executed directly
parser = argparse.ArgumentParser()
# Required parameters
parser.add_argument("--src_path", type=str, help="like blenderbot-model.bin")
parser.add_argument("--save_dir", default="hf_blenderbot", type=str, help="Where to save converted model.")
parser.add_argument(
"--hf_config_json", default="blenderbot-3b-config.json", type=str, help="Path to config to use"
)
# Parse the command-line arguments and run the conversion
args = parser.parse_args()
convert_parlai_checkpoint(args.src_path, args.save_dir, args.hf_config_json)
.\models\blenderbot\modeling_blenderbot.py
# coding=utf-8
# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Blenderbot model."""
# Import required libraries and modules
import copy  # deep copies of configuration objects
import math  # math helpers
import os  # filesystem paths
import warnings  # deprecation warnings
from typing import List, Optional, Tuple, Union  # type hints
import torch  # PyTorch
import torch.utils.checkpoint  # gradient checkpointing support
from torch import nn  # neural-network modules
from torch.nn import CrossEntropyLoss  # cross-entropy loss
# Hugging Face internal modules and helpers
from ...activations import ACT2FN  # activation functions
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask  # attention-mask helpers
from ...modeling_outputs import (  # model output classes
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
)
from ...modeling_utils import PreTrainedModel  # base class for pretrained models
from ...utils import (  # utility functions
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ..blenderbot_small import BlenderbotSmallForConditionalGeneration, BlenderbotSmallModel  # small Blenderbot variants
from .configuration_blenderbot import BlenderbotConfig  # Blenderbot configuration class
# Module-level logger
logger = logging.get_logger(__name__)
# Configuration and checkpoint names used in the documentation
_CONFIG_FOR_DOC = "BlenderbotConfig"
_CHECKPOINT_FOR_DOC = "facebook/blenderbot-400M-distill"
# List of pretrained Blenderbot model archives
BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST = [
"facebook/blenderbot-3B",
# See all Blenderbot models at https://huggingface.co/models?filter=blenderbot
]
# Copied from transformers.models.bart.modeling_bart: shift the input ids one token to the right
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
"""
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
shifted_input_ids[:, 0] = decoder_start_token_id
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
# Replace any -100 values in the labels with pad_token_id
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
return shifted_input_ids
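A worked example of the shift, with `pad_token_id=0` and `decoder_start_token_id=1`; note how a `-100` loss-masking value that survives the shift is replaced by the padding id:
```
>>> import torch
>>> labels = torch.tensor([[5, 6, -100, -100]])
>>> shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=1)
tensor([[1, 5, 6, 0]])
```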
class BlenderbotLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
super().__init__(num_embeddings, embedding_dim)
def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
"""
`input_ids_shape` is expected to be [bsz x seqlen].
"""
# Read the batch size (bsz) and sequence length (seq_len) from input_ids_shape
bsz, seq_len = input_ids_shape[:2]
# Build position indices starting at past_key_values_length
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
)
# Look up the learned embeddings for these positions via the parent's forward
return super().forward(positions)
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Blenderbot
class BlenderbotAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
config: Optional[BlenderbotConfig] = None,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads  # dimension of each attention head
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.scaling = self.head_dim**-0.5  # scaling factor applied to the queries
self.is_decoder = is_decoder  # whether this attention is used in the decoder
self.is_causal = is_causal  # whether causal masking is applied
# Linear projections for keys, values, queries, and the output
self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
):
# The full forward pass (projections, scaled dot-product attention, masking,
# caching, and the output projection) is omitted here.
...
# Maps the attention implementation name in the config to a class; this dict lives
# at module level, after the class definition, in the original source.
BLENDERBOT_ATTENTION_CLASSES = {"eager": BlenderbotAttention}
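Since the forward body is elided above, the following is a minimal sketch of what the eager implementation computes for plain self-attention, ignoring masks, caching, head masking, and dropout; the helper name is hypothetical and not part of the library:
```
import torch
from transformers.models.blenderbot.modeling_blenderbot import BlenderbotAttention

def eager_self_attention_sketch(attn: BlenderbotAttention, hidden_states: torch.Tensor) -> torch.Tensor:
    bsz, tgt_len, _ = hidden_states.size()
    # Project queries (scaled), keys, and values, then split into heads:
    # result shapes are (bsz, num_heads, tgt_len, head_dim)
    query = attn._shape(attn.q_proj(hidden_states) * attn.scaling, tgt_len, bsz)
    key = attn._shape(attn.k_proj(hidden_states), tgt_len, bsz)
    value = attn._shape(attn.v_proj(hidden_states), tgt_len, bsz)
    # Scaled dot-product attention per head
    attn_weights = torch.softmax(query @ key.transpose(-1, -2), dim=-1)
    attn_output = attn_weights @ value
    # Merge the heads back and apply the output projection
    attn_output = attn_output.transpose(1, 2).reshape(bsz, tgt_len, attn.embed_dim)
    return attn.out_proj(attn_output)
```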
# Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->Blenderbot, MBART->BLENDERBOT
class BlenderbotEncoderLayer(nn.Module):
def __init__(self, config: BlenderbotConfig):
super().__init__()
self.embed_dim = config.d_model  # embedding dimension of the encoder layer
# Build the self-attention layer; the implementation class is selected via config._attn_implementation
self.self_attn = BLENDERBOT_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
config=config,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)  # LayerNorm applied before self-attention
self.dropout = config.dropout  # dropout probability
self.activation_fn = ACT2FN[config.activation_function]  # activation function
self.activation_dropout = config.activation_dropout  # dropout probability after the activation
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)  # first feed-forward layer
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)  # second feed-forward layer
self.final_layer_norm = nn.LayerNorm(self.embed_dim)  # LayerNorm applied before the feed-forward block
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
layer_head_mask: torch.Tensor,
output_attentions: bool = False,
) -> torch.Tensor:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
# First residual branch: pre-LayerNorm, self-attention, dropout, residual
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
# Second residual branch: pre-LayerNorm, feed-forward network, dropout, residual
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
# Clamp inf/NaN values that can appear when running in float16
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
# Return the hidden states, plus the attention weights if requested
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
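A quick smoke test of the layer above, assuming a deliberately small custom configuration (and the default eager attention implementation):
```
import torch
from transformers import BlenderbotConfig
from transformers.models.blenderbot.modeling_blenderbot import BlenderbotEncoderLayer

config = BlenderbotConfig(d_model=64, encoder_attention_heads=4, encoder_ffn_dim=128)
layer = BlenderbotEncoderLayer(config).eval()
hidden = torch.randn(2, 10, 64)  # (batch, seq_len, d_model)
with torch.no_grad():
    (output,) = layer(hidden, attention_mask=None, layer_head_mask=None)
print(output.shape)  # torch.Size([2, 10, 64]) -- the residual blocks preserve the shape
```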
# Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer with MBart->Blenderbot, MBART->BLENDERBOT
class BlenderbotDecoderLayer(nn.Module):
def __init__(self, config: BlenderbotConfig):
super().__init__()
self.embed_dim = config.d_model  # embedding dimension of the decoder layer
# Causal self-attention; the implementation class is selected via config._attn_implementation
self.self_attn = BLENDERBOT_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
is_causal=True,
config=config,
)
self.dropout = config.dropout  # dropout probability
self.activation_fn = ACT2FN[config.activation_function]  # activation function
self.activation_dropout = config.activation_dropout  # dropout probability after the activation
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)  # LayerNorm applied before self-attention
# Encoder-decoder (cross-) attention
self.encoder_attn = BLENDERBOT_ATTENTION_CLASSES[config._attn_implementation](
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
config=config,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)  # LayerNorm applied before cross-attention
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)  # first feed-forward layer
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)  # second feed-forward layer
self.final_layer_norm = nn.LayerNorm(self.embed_dim)  # LayerNorm applied before the feed-forward block
# Forward pass of the decoder layer; the body is omitted here
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
):
...
BLENDERBOT_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
heads etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`BlenderbotConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
BLENDERBOT_GENERATION_EXAMPLE = r"""
Conversation example:
```
>>> from transformers import AutoTokenizer, BlenderbotForConditionalGeneration
>>> mname = "facebook/blenderbot-400M-distill"
>>> model = BlenderbotForConditionalGeneration.from_pretrained(mname)
>>> tokenizer = AutoTokenizer.from_pretrained(mname)
>>> UTTERANCE = "My friends are cool but they eat too many carbs."
>>> print("Human: ", UTTERANCE)
Human: My friends are cool but they eat too many carbs.
>>> inputs = tokenizer([UTTERANCE], return_tensors="pt")
>>> reply_ids = model.generate(**inputs)
>>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])
Bot: That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?
>>> REPLY = "I'm not sure"
>>> print("Human: ", REPLY)
Human: I'm not sure
>>> NEXT_UTTERANCE = (
... "My friends are cool but they eat too many carbs.</s> <s>That's unfortunate. "
... "Are they trying to lose weight or are they just trying to be healthier?</s> "
... "<s> I'm not sure."
... )
>>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="pt")
>>> next_reply_ids = model.generate(**inputs)
>>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0])
Bot: I see. Well, it's good that they're trying to change their eating habits.
```
"""
BLENDERBOT_INPUTS_DOCSTRING = r"""
"""
class BlenderbotEncoder(BlenderbotPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`BlenderbotEncoderLayer`].
Args:
config: BlenderbotConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
# Dropout and LayerDrop probabilities
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
# Embedding dimension, padding index, and maximum source length
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
# Use the provided token embeddings if given, otherwise create a new embedding layer
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
# Learned positional embeddings
self.embed_positions = BlenderbotLearnedPositionalEmbedding(
config.max_position_embeddings,
embed_dim,
)
# Stack of config.encoder_layers encoder layers
self.layers = nn.ModuleList([BlenderbotEncoderLayer(config) for _ in range(config.encoder_layers)])
# Final LayerNorm over d_model
self.layer_norm = nn.LayerNorm(config.d_model)
# Gradient checkpointing is disabled by default
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
# Forward pass of the encoder; the body is omitted here
def forward(
self,
input_ids=None,
attention_mask=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
...
class BlenderbotDecoder(BlenderbotPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BlenderbotDecoderLayer`]
Args:
config: BlenderbotConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
# Dropout and LayerDrop probabilities
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
# Padding index and maximum target length
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
# Embedding scale factor
self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
# Use the provided token embeddings if given, otherwise create a new embedding layer
if embed_tokens is not None:
self.embed_tokens = embed_tokens
else:
self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
# Learned positional embeddings
self.embed_positions = BlenderbotLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
)
# Stack of config.decoder_layers decoder layers
self.layers = nn.ModuleList([BlenderbotDecoderLayer(config) for _ in range(config.decoder_layers)])
# Final LayerNorm
self.layer_norm = nn.LayerNorm(config.d_model)
# Gradient checkpointing is disabled by default
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
# Return the input embedding layer
def get_input_embeddings(self):
return self.embed_tokens
# Set the input embedding layer
def set_input_embeddings(self, value):
self.embed_tokens = value
# Forward pass of the decoder; the body is omitted here
def forward(
self,
input_ids=None,
attention_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
head_mask=None,
cross_attn_head_mask=None,
past_key_values=None,
inputs_embeds=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
):
...
class BlenderbotModel(BlenderbotPreTrainedModel):
# Class method: load a model instance from a pretrained checkpoint
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
# The "facebook/blenderbot-90M" checkpoint is deprecated; warn and redirect to the small model
if pretrained_model_name_or_path == "facebook/blenderbot-90M":
warnings.warn(
"The checkpoint `facebook/blenderbot-90M` is deprecated. In the future, please use the identical"
" checkpoint `facebook/small_blenderbot-90M` with"
" `BlenderbotSmallModel.from_pretrained('facebook/small_blenderbot-90M')` instead.",
FutureWarning,
)
# Load the deprecated checkpoint as a small Blenderbot model
return BlenderbotSmallModel.from_pretrained(pretrained_model_name_or_path)
# Otherwise defer to the parent class
return super(BlenderbotModel, cls).from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
# Return the shared input embedding layer
def get_input_embeddings(self):
return self.shared
# Set the shared input embedding layer on the model, the encoder, and the decoder
def set_input_embeddings(self, value):
self.shared = value
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
# Return the encoder
def get_encoder(self):
return self.encoder
# Return the decoder
def get_decoder(self):
return self.decoder
# Forward pass, with the model input docstring and return-type docstring attached
@add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Union[Tuple, BaseModelOutput]] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.Tensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
...
# The Blenderbot model with a language modeling head, usable for summarization
@add_start_docstrings(
"The Blenderbot Model with a language modeling head. Can be used for summarization.",
BLENDERBOT_START_DOCSTRING
)
class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel):
# 指定基础模型前缀为"model"
base_model_prefix = "model"
# 指定在加载过程中忽略的键名列表,缺失时的处理
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
# 指定需要绑定权重的键名列表
_tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"]
# 初始化方法,接受一个BlenderbotConfig类型的配置对象config
def __init__(self, config: BlenderbotConfig):
# 调用父类的初始化方法
super().__init__(config)
# 创建BlenderbotModel实例,存储于self.model属性中
self.model = BlenderbotModel(config)
# 注册一个张量缓冲区"final_logits_bias",值为全零张量,形状为(1, self.model.shared.num_embeddings)
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
# 创建一个线性层self.lm_head,输入特征数为config.d_model,输出特征数为self.model.shared.num_embeddings,无偏置
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
# 执行初始化权重和最终处理
self.post_init()
# Class method: load a model instance from a pretrained checkpoint
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
# The "facebook/blenderbot-90M" checkpoint is deprecated; warn and redirect to the small model
if pretrained_model_name_or_path == "facebook/blenderbot-90M":
warnings.warn(
"The checkpoint `facebook/blenderbot-90M` is deprecated. In the future, please use the identical"
" checkpoint `facebook/small_blenderbot-90M` with"
" `BlenderbotSmallForConditionalGeneration.from_pretrained('facebook/small_blenderbot-90M')` instead.",
FutureWarning,
)
return BlenderbotSmallForConditionalGeneration.from_pretrained(pretrained_model_name_or_path)
# Otherwise defer to the parent class
return super(BlenderbotForConditionalGeneration, cls).from_pretrained(
pretrained_model_name_or_path, *model_args, **kwargs
)
# Return the encoder of the underlying model
def get_encoder(self):
return self.model.get_encoder()
# Return the decoder of the underlying model
def get_decoder(self):
return self.model.get_decoder()
# Resize the token embeddings and keep final_logits_bias in sync with the new vocabulary size
def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
self._resize_final_logits_bias(new_embeddings.weight.shape[0])
return new_embeddings
# Resize the final_logits_bias buffer to the new vocabulary size
def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
old_num_tokens = self.final_logits_bias.shape[-1]
if new_num_tokens <= old_num_tokens:
# Truncate the buffer to new_num_tokens columns
new_bias = self.final_logits_bias[:, :new_num_tokens]
else:
# Extend the buffer with zeros for the additional tokens, on the same device
extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
# Register the resized buffer
self.register_buffer("final_logits_bias", new_bias)
# Return the output embedding layer (the LM head)
def get_output_embeddings(self):
return self.lm_head
# Set the output embedding layer
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
# Forward pass, with input/return docstrings and the generation example attached
@add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(BLENDERBOT_GENERATION_EXAMPLE)
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,  # input token ids
attention_mask: Optional[torch.Tensor] = None,  # attention mask
decoder_input_ids: Optional[torch.LongTensor] = None,  # decoder input token ids
decoder_attention_mask: Optional[torch.LongTensor] = None,  # decoder attention mask
head_mask: Optional[torch.Tensor] = None,  # encoder attention-head mask
decoder_head_mask: Optional[torch.Tensor] = None,  # decoder attention-head mask
cross_attn_head_mask: Optional[torch.Tensor] = None,  # cross-attention head mask
encoder_outputs: Optional[Union[Tuple, BaseModelOutput]] = None,  # precomputed encoder outputs
past_key_values: Optional[List[torch.FloatTensor]] = None,  # cached key/value states
inputs_embeds: Optional[torch.Tensor] = None,  # input embeddings
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,  # decoder input embeddings
labels: Optional[torch.LongTensor] = None,  # labels for the language modeling loss
use_cache: Optional[bool] = None,  # whether to use the key/value cache
output_attentions: Optional[bool] = None,  # whether to return attention weights
output_hidden_states: Optional[bool] = None,  # whether to return hidden states
return_dict: Optional[bool] = None,  # whether to return a ModelOutput instead of a tuple
) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
The model outputs, with the masked language modeling loss prepended when `labels` are provided.
"""
# Determine whether to use the provided return_dict or the default from configuration
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
# Issue a warning if use_cache is True when labels are provided, then set use_cache to False
if use_cache:
logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
use_cache = False
# If decoder_input_ids and decoder_inputs_embeds are not provided, shift labels to the right for decoder input
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(
labels, self.config.pad_token_id, self.config.decoder_start_token_id
)
# Pass the inputs to the model for computation
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
encoder_outputs=encoder_outputs,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Calculate logits for language modeling head and add bias
lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
masked_lm_loss = None
if labels is not None:
# Compute masked language modeling loss if labels are provided
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
# Return output as a tuple if return_dict is False
output = (lm_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
# Return Seq2SeqLMOutput if return_dict is True, containing relevant outputs
return Seq2SeqLMOutput(
loss=masked_lm_loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
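A minimal training-style usage sketch for the forward pass above, assuming the publicly available `facebook/blenderbot-400M-distill` checkpoint; when only `labels` are supplied, `decoder_input_ids` are derived internally via `shift_tokens_right`:
```
from transformers import AutoTokenizer, BlenderbotForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
model = BlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill")
inputs = tokenizer(["My friends are cool but they eat too many carbs."], return_tensors="pt")
labels = tokenizer(["That is unfortunate."], return_tensors="pt").input_ids
outputs = model(**inputs, labels=labels)
print(outputs.loss)  # masked LM loss; outputs.logits has shape (batch, target_len, vocab_size)
```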
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
# When past key values are used, trim decoder_input_ids to the unprocessed suffix
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
# Some generation methods already pass only the last input id
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Default to the old behavior: keep only the final id
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
# Return the prepared inputs for the next generation step
return {
"input_ids": None,  # encoder_outputs is defined; input_ids are not needed
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
# Cached cross-attention states do not need reordering -> they are always the same
reordered_past += (
# Reorder each layer's self-attention states along the batch axis using beam_idx
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],  # the remaining (cross-attention) states stay unchanged
)
return reordered_past
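A toy illustration of the reordering, assuming a single decoder layer whose four cached tensors all carry the beam index on their first axis; the values are chosen so the effect is visible:
```
import torch

past = ((torch.arange(3.0).view(3, 1, 1, 1),) * 4,)  # one layer: (self key, self value, cross key, cross value)
beam_idx = torch.tensor([2, 0, 1])  # beam 2 moves into slot 0, beam 0 into slot 1, ...
reordered = BlenderbotForConditionalGeneration._reorder_cache(past, beam_idx)
print(reordered[0][0].flatten().tolist())  # [2.0, 0.0, 1.0] -- self-attention states follow beam_idx
print(reordered[0][2].flatten().tolist())  # [0.0, 1.0, 2.0] -- cross-attention states are left as-is
```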
# Copied from `transformers.models.bart.modeling_bart.BartDecoderWrapper` with Bart->Blenderbot
class BlenderbotDecoderWrapper(BlenderbotPreTrainedModel):
"""
This wrapper class is a helper class used to correctly load pretrained checkpoints when the causal language
model is combined with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
super().__init__(config)
# The wrapped Blenderbot decoder
self.decoder = BlenderbotDecoder(config)
def forward(self, *args, **kwargs):
# Delegate directly to the decoder
return self.decoder(*args, **kwargs)
# Copied from `transformers.models.bart.modeling_bart.BartForCausalLM` with
# Bart->Blenderbot, facebook/bart-base->facebook/blenderbot-400M-distill
class BlenderbotForCausalLM(BlenderbotPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
# Work on a deep copy of the configuration and mark it as a standalone decoder
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
super().__init__(config)
# The decoder, wrapped for checkpoint compatibility
self.model = BlenderbotDecoderWrapper(config)
# LM head projecting hidden states to vocabulary logits
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# Return the decoder token embeddings
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
# Set the decoder token embeddings
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
# Return the LM head
return self.lm_head
def set_output_embeddings(self, new_embeddings):
# Set the LM head
self.lm_head = new_embeddings
def set_decoder(self, decoder):
# Replace the decoder
self.model.decoder = decoder
def get_decoder(self):
# Return the decoder
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# Forward pass for causal language modeling; the body is omitted here
...
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
):
# Prepare the inputs for the next generation step
# When the model is used as the decoder of an encoder-decoder model, the decoder attention mask is created on the fly
if attention_mask is None:
# Default to attending to every position
attention_mask = input_ids.new_ones(input_ids.shape)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
# Some generation methods already pass only the last input id
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Default to the old behavior: keep only the final id
remove_prefix_length = input_ids.shape[1] - 1
# Trim input_ids to the unprocessed suffix
input_ids = input_ids[:, remove_prefix_length:]
# Return the input ids, attention mask, past key values, and cache flag
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
# Reorder every past state of each layer along the batch axis using beam_idx
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
.\models\blenderbot\modeling_flax_blenderbot.py
# coding=utf-8
# Copyright 2021 The Fairseq Authors, The Google Flax Team Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
""" Flax Blenderbot model."""
import math
import random
from functools import partial
from typing import Callable, Optional, Tuple
import flax.linen as nn
import jax
import jax.numpy as jnp
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
from flax.linen import combine_masks, make_causal_mask
from flax.linen.attention import dot_product_attention_weights
from flax.traverse_util import flatten_dict, unflatten_dict
from jax import lax
from jax.random import PRNGKey
from ...modeling_flax_outputs import (
FlaxBaseModelOutput,
FlaxBaseModelOutputWithPastAndCrossAttentions,
FlaxCausalLMOutputWithCrossAttentions,
FlaxSeq2SeqLMOutput,
FlaxSeq2SeqModelOutput,
)
from ...modeling_flax_utils import (
ACT2FN,
FlaxPreTrainedModel,
append_call_sample_docstring,
append_replace_return_docstrings,
overwrite_call_docstring,
)
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_blenderbot import BlenderbotConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "BlenderbotConfig"
_CHECKPOINT_FOR_DOC = "facebook/blenderbot-400M-distill"
BLENDERBOT_START_DOCSTRING = r"""
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a Flax Linen
[flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
Finally, this model supports inherent JAX features such as:
- [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
- [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
- [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
- [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
"""
Parameters:
config ([`BlenderbotConfig`]): Model configuration class with all the parameters of the model.
使用 BlenderbotConfig 类作为参数,这个类包含了模型的所有参数。
初始化时,仅加载配置文件,并不加载与模型相关的权重。
若要加载模型的权重,请参考 [`~FlaxPreTrainedModel.from_pretrained`] 方法。
"""
BLENDERBOT_INPUTS_DOCSTRING = r"""
"""
BLENDERBOT_ENCODE_INPUTS_DOCSTRING = r"""
Args:
input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
BLENDERBOT_DECODE_INPUTS_DOCSTRING = r"""
"""
# Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right
def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray:
"""
Shift input ids one token to the right.
Args:
input_ids (jnp.ndarray): Array of input token indices.
pad_token_id (int): Index of the padding token in the vocabulary.
decoder_start_token_id (int): Index of the start token for decoder input.
Returns:
jnp.ndarray: Shifted input token indices.
This function shifts the input token indices to the right by one position,
inserting the decoder start token at the beginning and handling padding tokens.
"""
shifted_input_ids = jnp.zeros_like(input_ids) # Initialize an array of zeros with the same shape as input_ids
shifted_input_ids = shifted_input_ids.at[:, 1:].set(input_ids[:, :-1]) # Shift input_ids to the right
shifted_input_ids = shifted_input_ids.at[:, 0].set(decoder_start_token_id) # Set the start token
shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids) # Handle padding tokens
return shifted_input_ids
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention with Bart->Blenderbot
class FlaxBlenderbotAttention(nn.Module):
"""
Implementation of the attention mechanism in the Blenderbot model.
Attributes:
config (BlenderbotConfig): The configuration object for the Blenderbot model.
embed_dim (int): Dimensionality of the token embeddings.
num_heads (int): Number of attention heads.
dropout (float): Dropout probability.
causal (bool): Whether the attention is causal (for decoding).
bias (bool): Whether to include bias in the attention computation.
dtype (jnp.dtype): Data type of the computation (default is jnp.float32).
"""
config: BlenderbotConfig
embed_dim: int
num_heads: int
dropout: float = 0.0
causal: bool = False
bias: bool = True
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
def setup(self) -> None:
# Dimension of each attention head
self.head_dim = self.embed_dim // self.num_heads
# embed_dim must be divisible by num_heads
if self.head_dim * self.num_heads != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {self.num_heads})."
)
# Partial constructor for dense layers that all share the same settings
dense = partial(
nn.Dense,
self.embed_dim,
use_bias=self.bias,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.init_std),
)
# Query, key, value, and output projections
self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
self.out_proj = dense()
# Dropout applied to the attention weights
self.dropout_layer = nn.Dropout(rate=self.dropout)
# For causal attention, precompute a causal mask
if self.causal:
self.causal_mask = make_causal_mask(
jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
)
def _split_heads(self, hidden_states):
# Reshape (batch, seq, embed_dim) into (batch, seq, num_heads, head_dim)
return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
def _merge_heads(self, hidden_states):
# Reshape (batch, seq, num_heads, head_dim) back into (batch, seq, embed_dim)
return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
@nn.compact
def _concatenate_to_cache(self, key, value, query, attention_mask):
"""
This function takes projected key, value states from a single input token and concatenates the states to cached
states from previous steps. This function is slightly adapted from the official Flax repository:
https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
"""
# Detect whether we are initializing by checking for the "cached_key" variable in the "cache" collection
is_initialized = self.has_variable("cache", "cached_key")
# Variables holding the cached keys, cached values, and the current cache index
cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
if is_initialized:
# Shape of the cached keys
*batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
# Update the key and value caches with a 1d spatial slice at the current index
cur_index = cache_index.value
indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
key = lax.dynamic_update_slice(cached_key.value, key, indices)
value = lax.dynamic_update_slice(cached_value.value, value, indices)
cached_key.value = key
cached_value.value = value
# Advance the cache index by the number of query positions just written
num_updated_cache_vectors = query.shape[1]
cache_index.value = cache_index.value + num_updated_cache_vectors
# Causal mask for cached decoder self-attention: a query position should only attend to key
# positions that have already been generated and cached, not to the remaining zero elements.
pad_mask = jnp.broadcast_to(
jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
)
attention_mask = combine_masks(pad_mask, attention_mask)
return key, value, attention_mask
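A standalone sketch of the cache write performed by `lax.dynamic_update_slice` above, with the batch and head dimensions kept at size one for readability:
```
import jax.numpy as jnp
from jax import lax

cache = jnp.zeros((1, 4, 1, 1))    # (batch, max_length, num_heads, head_dim)
new_key = jnp.ones((1, 1, 1, 1))   # projected key of a single new token
cur_index = 2                      # two positions already written
updated = lax.dynamic_update_slice(cache, new_key, (0, cur_index, 0, 0))
print(updated[0, :, 0, 0])         # [0. 0. 1. 0.] -- the new key lands at position 2
```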
def __call__(
self,
hidden_states: jnp.ndarray,
key_value_states: Optional[jnp.ndarray] = None,
attention_mask: Optional[jnp.ndarray] = None,
init_cache: bool = False,
deterministic: bool = True,
):
...
# Copied from transformers.models.mbart.modeling_flax_mbart.FlaxMBartEncoderLayer with MBart->Blenderbot
class FlaxBlenderbotEncoderLayer(nn.Module):
config: BlenderbotConfig
dtype: jnp.dtype = jnp.float32  # the dtype of the computation
def setup(self) -> None:
# The embedding dimension comes from d_model
self.embed_dim = self.config.d_model
# Self-attention layer
self.self_attn = FlaxBlenderbotAttention(
config=self.config,
embed_dim=self.embed_dim,
num_heads=self.config.encoder_attention_heads,
dropout=self.config.attention_dropout,
dtype=self.dtype,
)
# LayerNorm applied before self-attention
self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
# Dropout applied after attention and after the feed-forward block
self.dropout_layer = nn.Dropout(rate=self.config.dropout)
# Activation function and its dropout
self.activation_fn = ACT2FN[self.config.activation_function]
self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
# Feed-forward layers: d_model -> encoder_ffn_dim -> d_model
self.fc1 = nn.Dense(
self.config.encoder_ffn_dim,
dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.init_std),
)
self.fc2 = nn.Dense(
self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std)
)
# LayerNorm applied before the feed-forward block
self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
# Forward pass of the encoder layer
def __call__(
self,
hidden_states: jnp.ndarray,
attention_mask: jnp.ndarray,
output_attentions: bool = True,
deterministic: bool = True,
) -> Tuple[jnp.ndarray]:
# First residual branch: pre-LayerNorm, self-attention, dropout, residual
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, attn_weights = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask)
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
hidden_states = residual + hidden_states
# Second residual branch: pre-LayerNorm, feed-forward network, dropout, residual
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic)
hidden_states = self.fc2(hidden_states)
hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
hidden_states = residual + hidden_states
# Return the hidden states, plus the attention weights if requested
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
# 从transformers.models.bart.modeling_flax_bart.FlaxBartEncoderLayerCollection复制代码,并将Bart->Blenderbot
class FlaxBlenderbotEncoderLayerCollection(nn.Module):
# 使用BlenderbotConfig配置类
config: BlenderbotConfig
# 计算中使用的数据类型为jnp.float32
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
# 设置函数,初始化编码器层集合
def setup(self):
# 创建编码器层列表,包括多个FlaxBlenderbotEncoderLayer实例
self.layers = [
FlaxBlenderbotEncoderLayer(self.config, name=str(i), dtype=self.dtype)
for i in range(self.config.encoder_layers)
]
# 编码器层的层丢弃率,从配置中获取
self.layerdrop = self.config.encoder_layerdrop
    def __call__(
        self,
        hidden_states,
        attention_mask,
        deterministic: bool = True,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ):
        # Accumulators for attentions / hidden states, or None when not requested
        all_attentions = () if output_attentions else None
        all_hidden_states = () if output_hidden_states else None
        # Iterate over the encoder layers
        for encoder_layer in self.layers:
            if output_hidden_states:
                # Record the hidden states entering this layer
                all_hidden_states = all_hidden_states + (hidden_states,)
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            # During training (non-deterministic), skip the layer with probability `layerdrop`
            if not deterministic and (dropout_probability < self.layerdrop):
                layer_outputs = (None, None)
            else:
                # Otherwise run the layer's forward pass
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions,
                    deterministic,
                )
            # The first element of the layer output is the new hidden states
            hidden_states = layer_outputs[0]
            if output_attentions:
                # Record this layer's attention weights
                all_attentions = all_attentions + (layer_outputs[1],)
        # Record the final hidden states as well
        if output_hidden_states:
            all_hidden_states += (hidden_states,)
        # Assemble the outputs: final hidden states, all hidden states, all attentions
        outputs = (hidden_states, all_hidden_states, all_attentions)
        # Without return_dict, return a tuple with the None entries filtered out
        if not return_dict:
            return tuple(v for v in outputs if v is not None)
        # Otherwise wrap the outputs in a FlaxBaseModelOutput
        return FlaxBaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
        )
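# --- Illustrative sketch, not part of the original file: the LayerDrop test used above in
# isolation. With layerdrop = p, each layer is skipped independently with probability p on
# non-deterministic (training) passes and never at inference. `demo_layerdrop_keep_mask`
# is a hypothetical helper; `random` is the stdlib module already used above.
def demo_layerdrop_keep_mask(num_layers: int, layerdrop: float, deterministic: bool):
    # True means "run the layer"; mirrors `not deterministic and dropout_probability < layerdrop`
    return [deterministic or random.uniform(0, 1) >= layerdrop for _ in range(num_layers)]


# e.g. demo_layerdrop_keep_mask(12, 0.1, deterministic=False) keeps ~90% of layers on average,
# while deterministic=True keeps all of them.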
# Copied from transformers.models.mbart.modeling_flax_mbart.FlaxMBartDecoderLayer with MBart->Blenderbot
class FlaxBlenderbotDecoderLayer(nn.Module):
    config: BlenderbotConfig
    dtype: jnp.dtype = jnp.float32

    def setup(self) -> None:
        # Embedding dimension, taken from the config's d_model
        self.embed_dim = self.config.d_model
        # Causal self-attention block
        self.self_attn = FlaxBlenderbotAttention(
            config=self.config,
            embed_dim=self.embed_dim,
            num_heads=self.config.decoder_attention_heads,
            dropout=self.config.attention_dropout,
            causal=True,
            dtype=self.dtype,
        )
        # Dropout layer, using the dropout rate from the config
        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
        # Activation function specified in the config
        self.activation_fn = ACT2FN[self.config.activation_function]
        # Dropout applied inside the feed-forward block, using the activation_dropout rate
        self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
        # Layer normalization for the self-attention block
        self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
        # Cross-attention block attending over the encoder output
        self.encoder_attn = FlaxBlenderbotAttention(
            config=self.config,
            embed_dim=self.embed_dim,
            num_heads=self.config.decoder_attention_heads,
            dropout=self.config.attention_dropout,
            dtype=self.dtype,
        )
        # Layer normalization for the cross-attention block
        self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
        # First fully connected layer, projecting to decoder_ffn_dim
        self.fc1 = nn.Dense(
            self.config.decoder_ffn_dim,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.init_std),
        )
        # Second fully connected layer, projecting back to the embedding dimension
        self.fc2 = nn.Dense(
            self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std)
        )
        # Final layer normalization
        self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
    # Forward pass of the decoder layer
    def __call__(
        self,
        hidden_states: jnp.ndarray,
        attention_mask: jnp.ndarray,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        output_attentions: bool = True,
        deterministic: bool = True,
    ) -> Tuple[jnp.ndarray]:
        # Keep the input for the residual connection
        residual = hidden_states
        # Pre-normalize the hidden states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        # Self-attention block: returns the new hidden states and the attention weights
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states, attention_mask=attention_mask, init_cache=init_cache
        )
        # Apply dropout
        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
        # Residual connection
        hidden_states = residual + hidden_states
        # Cross-attention block
        cross_attn_weights = None
        # Only executed when encoder hidden states are provided
        if encoder_hidden_states is not None:
            # Keep the input for the residual connection
            residual = hidden_states
            # Pre-normalize the hidden states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)
            # Cross-attention over the encoder output; also returns the attention weights
            hidden_states, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
            )
            # Apply dropout
            hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
            # Residual connection
            hidden_states = residual + hidden_states
        # Feed-forward block
        residual = hidden_states
        # Pre-normalize the hidden states
        hidden_states = self.final_layer_norm(hidden_states)
        # First fully connected layer followed by the activation function
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        # Apply activation dropout
        hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic)
        # Second fully connected layer
        hidden_states = self.fc2(hidden_states)
        # Apply dropout
        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
        # Residual connection
        hidden_states = residual + hidden_states
        # The output always contains the hidden states
        outputs = (hidden_states,)
        # Optionally also return the self- and cross-attention weights
        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)
        return outputs
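# --- Illustrative sketch, not part of the original file: the core of the cross-attention
# step above, stripped of projections, heads, masking and dropout. Queries come from the
# decoder, keys/values from the encoder output. `demo_cross_attention` is hypothetical.
def demo_cross_attention(decoder_states, encoder_states):
    # (batch, tgt_len, d) x (batch, src_len, d) -> (batch, tgt_len, src_len)
    scores = jnp.einsum("btd,bsd->bts", decoder_states, encoder_states)
    weights = jax.nn.softmax(scores / math.sqrt(decoder_states.shape[-1]), axis=-1)
    # weighted sum over the source positions -> (batch, tgt_len, d)
    return jnp.einsum("bts,bsd->btd", weights, encoder_states)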
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderLayerCollection with Bart->Blenderbot
class FlaxBlenderbotDecoderLayerCollection(nn.Module):
config: BlenderbotConfig
dtype: jnp.dtype = jnp.float32 # the dtype of the computation
def setup(self):
# Initialize a list of Blenderbot decoder layers based on the provided configuration
self.layers = [
FlaxBlenderbotDecoderLayer(self.config, name=str(i), dtype=self.dtype)
for i in range(self.config.decoder_layers)
]
# Set the layer dropout probability from the configuration
self.layerdrop = self.config.decoder_layerdrop
def __call__(
self,
hidden_states,
attention_mask,
encoder_hidden_states: Optional[jnp.ndarray] = None,
encoder_attention_mask: Optional[jnp.ndarray] = None,
deterministic: bool = True,
init_cache: bool = False,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
):
# Initialize containers for storing outputs
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
# Iterate through each decoder layer
for decoder_layer in self.layers:
if output_hidden_states:
# Store hidden states for potential use in output
all_hidden_states += (hidden_states,)
# Implement LayerDrop regularization during training
# (see https://arxiv.org/abs/1909.11556 for details)
dropout_probability = random.uniform(0, 1)
if not deterministic and (dropout_probability < self.layerdrop):
# Skip computation of the layer outputs based on LayerDrop probability
layer_outputs = (None, None, None)
else:
# Compute outputs of the current decoder layer
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
init_cache=init_cache,
output_attentions=output_attentions,
deterministic=deterministic,
)
# Update hidden states with the outputs of the current layer
hidden_states = layer_outputs[0]
if output_attentions:
# Store self-attention outputs if specified
all_self_attns += (layer_outputs[1],)
if encoder_hidden_states is not None:
# Store cross-attention outputs if specified and encoder_hidden_states is provided
all_cross_attentions += (layer_outputs[2],)
# Store hidden states from the last decoder layer if output_hidden_states is enabled
if output_hidden_states:
all_hidden_states += (hidden_states,)
# Prepare outputs based on return_dict flag
outputs = [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions]
if not return_dict:
# Return outputs as a tuple omitting None values
return tuple(v for v in outputs if v is not None)
# Return outputs as a FlaxBaseModelOutputWithPastAndCrossAttentions named tuple
return FlaxBaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attns,
cross_attentions=all_cross_attentions,
)
class FlaxBlenderbotEncoder(nn.Module):
    config: BlenderbotConfig
    embed_tokens: nn.Embed
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        # Dropout layer, using the dropout rate from the config
        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
        # Embedding dimension, taken from the config's d_model
        embed_dim = self.config.d_model
        # Padding index, taken from the config's pad_token_id
        self.padding_idx = self.config.pad_token_id
        # Maximum source sequence length, taken from max_position_embeddings
        self.max_source_positions = self.config.max_position_embeddings
        # When scale_embedding is set, scale embeddings by sqrt(embed_dim), otherwise by 1.0
        self.embed_scale = math.sqrt(embed_dim) if self.config.scale_embedding else 1.0
        # Learned position embeddings, initialized from a normal distribution with std init_std
        self.embed_positions = nn.Embed(
            self.config.max_position_embeddings,
            embed_dim,
            embedding_init=jax.nn.initializers.normal(self.config.init_std),
        )
        # Stack of encoder layers
        self.layers = FlaxBlenderbotEncoderLayerCollection(self.config, self.dtype)
        # Final layer normalization with epsilon 1e-05
        self.layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ):
        # Flatten the input ids to 2D, keeping the last (sequence) dimension
        input_shape = input_ids.shape
        input_ids = input_ids.reshape(-1, input_shape[-1])
        # Embed the tokens and apply the embedding scale
        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
        # Look up the position embeddings
        embed_pos = self.embed_positions(position_ids)
        # Sum token and position embeddings to obtain the hidden states
        hidden_states = inputs_embeds + embed_pos
        # Apply dropout (disabled when deterministic)
        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
        # Run the encoder layer stack
        outputs = self.layers(
            hidden_states,
            attention_mask,
            deterministic=deterministic,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Apply the final layer norm to the last hidden states
        last_hidden_states = outputs[0]
        last_hidden_states = self.layer_norm(last_hidden_states)
        # The layer stack recorded the hidden states *before* the final layer norm,
        # so replace the last entry with the normalized tensor
        hidden_states = None
        if output_hidden_states:
            hidden_states = outputs[1]
            hidden_states = hidden_states[:-1] + (last_hidden_states,)
        # Without return_dict, return a tuple with the None entries filtered out
        if not return_dict:
            outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
            return tuple(v for v in outputs if v is not None)
        # Otherwise return a FlaxBaseModelOutput with the last hidden state,
        # all hidden states and the attention weights (when requested)
        return FlaxBaseModelOutput(
            last_hidden_state=last_hidden_states,
            hidden_states=hidden_states,
            attentions=outputs.attentions,
        )
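# --- Illustrative sketch, not part of the original file: the input pipeline the encoder
# above applies before its layer stack, written against raw tables. `demo_embed` and both
# table arguments are hypothetical.
def demo_embed(input_ids, token_table, pos_table, scale_embedding=True):
    # token_table: (vocab_size, d_model), pos_table: (max_positions, d_model)
    scale = math.sqrt(token_table.shape[-1]) if scale_embedding else 1.0
    positions = jnp.arange(input_ids.shape[-1])
    # scaled token embeddings plus learned position embeddings; the real encoder
    # then applies dropout and the layer stack
    return token_table[input_ids] * scale + pos_table[positions]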
class FlaxBlenderbotDecoder(nn.Module):
    config: BlenderbotConfig  # model configuration
    embed_tokens: nn.Embed  # shared token embedding layer
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        self.dropout_layer = nn.Dropout(rate=self.config.dropout)  # dropout layer with the configured rate
        embed_dim = self.config.d_model  # embedding dimension from the config
        self.padding_idx = self.config.pad_token_id  # index of the padding token
        self.max_target_positions = self.config.max_position_embeddings  # maximum number of target positions
        self.embed_scale = math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0  # embedding scale factor
        self.embed_positions = nn.Embed(
            self.config.max_position_embeddings,
            embed_dim,
            embedding_init=jax.nn.initializers.normal(self.config.init_std),
        )  # learned position embeddings: number of positions, dimension and initializer
        self.layers = FlaxBlenderbotDecoderLayerCollection(self.config, self.dtype)  # stack of decoder layers
        self.layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)  # final layer normalization

    def __call__(
        self,
        input_ids,
        attention_mask,
        position_ids,
        encoder_hidden_states: Optional[jnp.ndarray] = None,
        encoder_attention_mask: Optional[jnp.ndarray] = None,
        init_cache: bool = False,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ):
        input_shape = input_ids.shape  # shape of the input ids
        input_ids = input_ids.reshape(-1, input_shape[-1])  # flatten the input ids to 2D
        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale  # embed the tokens and apply the scale
        # embed positions
        positions = self.embed_positions(position_ids)
        hidden_states = inputs_embeds + positions  # sum token and position embeddings
        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)  # apply dropout
        outputs = self.layers(
            hidden_states,
            attention_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            deterministic=deterministic,
            init_cache=init_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )  # run the decoder layer stack
        last_hidden_states = outputs[0]  # take the last hidden states
        last_hidden_states = self.layer_norm(last_hidden_states)  # apply the final layer norm
        # replace the last entry of `hidden_states` with the layer-normalized tensor
        hidden_states = None
        if output_hidden_states:
            hidden_states = outputs[1]  # all recorded hidden states
            hidden_states = hidden_states[:-1] + (last_hidden_states,)  # swap in the normalized last entry
        if not return_dict:
            outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
            return tuple(v for v in outputs if v is not None)
        return FlaxBaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=last_hidden_states,
            hidden_states=hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )  # model output: last hidden state, all hidden states, attentions and cross-attentions
# The core Blenderbot seq2seq module
class FlaxBlenderbotModule(nn.Module):
    config: BlenderbotConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation

    def setup(self):
        # Token embedding shared between encoder and decoder; vocab_size and d_model
        # come from the config, initialized from a normal distribution
        self.shared = nn.Embed(
            self.config.vocab_size,
            self.config.d_model,
            embedding_init=jax.nn.initializers.normal(self.config.init_std),
            dtype=self.dtype,
        )
        # Encoder and decoder, both reusing the shared embedding
        self.encoder = FlaxBlenderbotEncoder(self.config, dtype=self.dtype, embed_tokens=self.shared)
        self.decoder = FlaxBlenderbotDecoder(self.config, dtype=self.dtype, embed_tokens=self.shared)

    def _get_encoder_module(self):
        return self.encoder

    def _get_decoder_module(self):
        return self.decoder

    # Forward pass of the full seq2seq model
    def __call__(
        self,
        input_ids,
        attention_mask,
        decoder_input_ids,
        decoder_attention_mask,
        position_ids,
        decoder_position_ids,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ):
        # Run the encoder
        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=deterministic,
        )
        # Run the decoder, feeding it the encoder output for cross-attention
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            position_ids=decoder_position_ids,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=deterministic,
        )
        # Without return_dict, concatenate the decoder and encoder outputs
        if not return_dict:
            return decoder_outputs + encoder_outputs
        # Otherwise combine them into a FlaxSeq2SeqModelOutput
        return FlaxSeq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )
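# --- Illustrative sketch, not part of the original file: building a tiny
# FlaxBlenderbotModule the same way as above, with the shared embedding wired into both
# sides. The toy config values are hypothetical.
def demo_tiny_module():
    config = BlenderbotConfig(
        vocab_size=32,
        d_model=16,
        encoder_layers=1,
        decoder_layers=1,
        encoder_attention_heads=2,
        decoder_attention_heads=2,
        encoder_ffn_dim=32,
        decoder_ffn_dim=32,
        max_position_embeddings=32,
    )
    module = FlaxBlenderbotModule(config=config)
    ids = jnp.ones((1, 4), dtype="i4")
    mask = jnp.ones_like(ids)
    pos = jnp.broadcast_to(jnp.arange(4)[None, :], (1, 4))
    # positional argument order matches __call__ above
    return module.init(jax.random.PRNGKey(0), ids, mask, ids, mask, pos, pos)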
# Base class for pretrained Blenderbot models, inheriting from FlaxPreTrainedModel
class FlaxBlenderbotPreTrainedModel(FlaxPreTrainedModel):
    config_class = BlenderbotConfig
    base_model_prefix: str = "model"
    module_class: nn.Module = None  # set by subclasses

    def __init__(
        self,
        config: BlenderbotConfig,
        input_shape: Tuple[int] = (1, 1),
        seed: int = 0,
        dtype: jnp.dtype = jnp.float32,
        _do_init: bool = True,
        **kwargs,
    ):
        # Instantiate the Flax module with the given config, dtype and extra kwargs
        module = self.module_class(config=config, dtype=dtype, **kwargs)
        # Let the parent class handle parameter initialization and bookkeeping
        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)

    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
        # Build dummy input tensors
        input_ids = jnp.zeros(input_shape, dtype="i4")
        # make sure the initialization pass will work for FlaxBlenderbotForSequenceClassificationModule
        input_ids = input_ids.at[(..., -1)].set(self.config.eos_token_id)
        attention_mask = jnp.ones_like(input_ids)
        decoder_input_ids = input_ids
        decoder_attention_mask = jnp.ones_like(input_ids)
        batch_size, sequence_length = input_ids.shape
        # Position ids: one row of [0, 1, ..., seq_len - 1] broadcast over the batch
        position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
        decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
        # Split the RNG into one stream for the parameters and one for dropout
        params_rng, dropout_rng = jax.random.split(rng)
        rngs = {"params": params_rng, "dropout": dropout_rng}
        # Initialize the module parameters on the dummy inputs
        random_params = self.module.init(
            rngs,
            input_ids,
            attention_mask,
            decoder_input_ids,
            decoder_attention_mask,
            position_ids,
            decoder_position_ids,
        )["params"]
        if params is not None:
            # If parameters were provided, fill in any missing keys from the random init
            random_params = flatten_dict(unfreeze(random_params))
            params = flatten_dict(unfreeze(params))
            for missing_key in self._missing_keys:
                params[missing_key] = random_params[missing_key]
            self._missing_keys = set()
            return freeze(unflatten_dict(params))
        else:
            # Otherwise return the freshly initialized parameters
            return random_params
    def init_cache(self, batch_size, max_length, encoder_outputs):
        r"""
        Args:
            batch_size (`int`):
                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
            max_length (`int`):
                maximum possible length for auto-regressive decoding. Defines the sequence length of the
                initialized cache.
            encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
                `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
                `attentions`). `last_hidden_state` is used in the cross-attention of the decoder.
        """
        # Dummy decoder inputs, used only to trace the decoder and allocate its cache
        decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
        decoder_attention_mask = jnp.ones_like(decoder_input_ids)
        decoder_position_ids = jnp.broadcast_to(
            jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape
        )

        # Run only the decoder submodule so that just its cache variables are created
        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
            decoder_module = module._get_decoder_module()
            return decoder_module(decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs)

        init_variables = self.module.init(
            jax.random.PRNGKey(0),
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            decoder_position_ids=decoder_position_ids,
            encoder_hidden_states=encoder_outputs[0],
            init_cache=True,
            method=_decoder_forward,
        )
        # Return only the freshly initialized cache
        return unfreeze(init_variables["cache"])

    @add_start_docstrings(BLENDERBOT_ENCODE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=BlenderbotConfig)
    def encode(
        self,
        input_ids: jnp.ndarray,
        attention_mask: Optional[jnp.ndarray] = None,
        position_ids: Optional[jnp.ndarray] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
r"""
Returns:
Example:
```
>>> from transformers import AutoTokenizer, FlaxBlenderbotForConditionalGeneration
>>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
>>> text = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer(text, max_length=1024, return_tensors="jax")
>>> encoder_outputs = model.encode(**inputs)
```"""
        # Fall back to the config defaults when the output flags are not given
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        # Default attention mask: attend to every token
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)
        # Default position ids: [0, 1, ..., seq_len - 1] broadcast over the batch
        if position_ids is None:
            batch_size, sequence_length = input_ids.shape
            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
        # Handle any PRNG that needs to be threaded through (dropout)
        rngs = {}
        if dropout_rng is not None:
            rngs["dropout"] = dropout_rng

        # Run only the encoder submodule
        def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs):
            encode_module = module._get_encoder_module()
            return encode_module(input_ids, attention_mask, position_ids, **kwargs)

        return self.module.apply(
            {"params": params or self.params},  # use the provided params or the model's own
            input_ids=jnp.array(input_ids, dtype="i4"),
            attention_mask=jnp.array(attention_mask, dtype="i4"),
            position_ids=jnp.array(position_ids, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,  # deterministic unless training
            rngs=rngs,
            method=_encoder_forward,  # only the encoder forward is applied
        )
    @add_start_docstrings(BLENDERBOT_DECODE_INPUTS_DOCSTRING)
    @replace_return_docstrings(
        output_type=FlaxBaseModelOutputWithPastAndCrossAttentions, config_class=BlenderbotConfig
    )
    def decode(
        self,
        decoder_input_ids,  # decoder input token ids
        encoder_outputs,  # output of `encode`
        encoder_attention_mask: Optional[jnp.ndarray] = None,  # attention mask over the encoder output
        decoder_attention_mask: Optional[jnp.ndarray] = None,  # attention mask over the decoder inputs
        decoder_position_ids: Optional[jnp.ndarray] = None,  # decoder position ids
        past_key_values: dict = None,  # cached key/value states from previous decoding steps
        output_attentions: Optional[bool] = None,  # whether to return attention weights
        output_hidden_states: Optional[bool] = None,  # whether to return hidden states
        return_dict: Optional[bool] = None,  # whether to return a dict-like output
        train: bool = False,  # whether the model is in training mode
        params: dict = None,  # optional parameter dict
        dropout_rng: PRNGKey = None,  # PRNG key for dropout
    ):
        # Fall back to the config defaults when the output flags are not given
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        encoder_hidden_states = encoder_outputs[0]
        if encoder_attention_mask is None:
            batch_size, sequence_length = encoder_hidden_states.shape[:2]
            encoder_attention_mask = jnp.ones((batch_size, sequence_length))
        batch_size, sequence_length = decoder_input_ids.shape
        if decoder_attention_mask is None:
            decoder_attention_mask = jnp.ones((batch_size, sequence_length))
        if decoder_position_ids is None:
            if past_key_values is not None:
                raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
            decoder_position_ids = jnp.broadcast_to(
                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
            )
        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
        inputs = {"params": params or self.params}
        # A passed cache must be declared mutable so `self.module.apply` can update it in place
        if past_key_values:
            inputs["cache"] = past_key_values
            mutable = ["cache"]
        else:
            mutable = False

        # Run only the decoder submodule
        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
            decoder_module = module._get_decoder_module()
            return decoder_module(decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs)

        outputs = self.module.apply(
            inputs,
            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,
            rngs=rngs,
            mutable=mutable,
            method=_decoder_forward,
        )
        # Append the updated cache to the model output
        if past_key_values is not None and return_dict:
            outputs, past = outputs
            outputs["past_key_values"] = unfreeze(past["cache"])
            return outputs
        elif past_key_values is not None and not return_dict:
            outputs, past = outputs
            outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
        return outputs
    @add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING)
    def __call__(
        self,
        input_ids: jnp.ndarray,
        attention_mask: Optional[jnp.ndarray] = None,
        decoder_input_ids: Optional[jnp.ndarray] = None,
        decoder_attention_mask: Optional[jnp.ndarray] = None,
        position_ids: Optional[jnp.ndarray] = None,
        decoder_position_ids: Optional[jnp.ndarray] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        train: bool = False,
        params: dict = None,
        dropout_rng: PRNGKey = None,
    ):
        # Fall back to the config defaults when the output flags are not given
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        # Prepare the encoder inputs
        if attention_mask is None:
            attention_mask = jnp.ones_like(input_ids)
        if position_ids is None:
            batch_size, sequence_length = input_ids.shape
            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
        # Prepare the decoder inputs
        if decoder_input_ids is None:
            # Shift the inputs one position to the right to build the decoder input sequence
            decoder_input_ids = shift_tokens_right(
                input_ids, self.config.pad_token_id, decoder_start_token_id=self.config.decoder_start_token_id
            )
        if decoder_attention_mask is None:
            decoder_attention_mask = jnp.ones_like(decoder_input_ids)
        if decoder_position_ids is None:
            batch_size, sequence_length = decoder_input_ids.shape
            decoder_position_ids = jnp.broadcast_to(
                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
            )
        # Handle any PRNG that needs to be threaded through (dropout)
        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
        # Apply the module's forward pass
        return self.module.apply(
            {"params": params or self.params},
            input_ids=jnp.array(input_ids, dtype="i4"),
            attention_mask=jnp.array(attention_mask, dtype="i4"),
            position_ids=jnp.array(position_ids, dtype="i4"),
            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=not train,  # deterministic inference unless training
            rngs=rngs,
        )
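# --- Illustrative sketch, not part of the original file: how `encode` and `decode` above
# split a seq2seq forward pass for generation. The encoder runs once; the decoder is then
# stepped against the cached encoder output. `demo_encode_then_decode` is hypothetical and
# assumes a loaded model/tokenizer pair.
def demo_encode_then_decode(model, tokenizer, text="Hello!"):
    inputs = tokenizer(text, return_tensors="np")
    encoder_outputs = model.encode(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
    decoder_start = jnp.full((inputs["input_ids"].shape[0], 1), model.config.decoder_start_token_id, dtype="i4")
    # one decoder step; repeated with `past_key_values` this becomes auto-regressive decoding
    return model.decode(decoder_start, encoder_outputs, encoder_attention_mask=inputs["attention_mask"])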
# Add a docstring describing FlaxBlenderbotModel as the bare transformer without a head
@add_start_docstrings(
    "The bare Blenderbot Model transformer outputting raw hidden-states without any specific head on top.",
    BLENDERBOT_START_DOCSTRING,
)
class FlaxBlenderbotModel(FlaxBlenderbotPreTrainedModel):
    config: BlenderbotConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
    module_class = FlaxBlenderbotModule


# Attach an example call docstring to FlaxBlenderbotModel
append_call_sample_docstring(FlaxBlenderbotModel, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC)
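# --- Illustrative usage sketch, not part of the original file. Assumes the Flax weights of
# "facebook/blenderbot-400M-distill" are available for download; wrapped in a function so
# the module stays import-safe.
def demo_run_blenderbot_model():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
    model = FlaxBlenderbotModel.from_pretrained("facebook/blenderbot-400M-distill")
    inputs = tokenizer("Hello there!", return_tensors="np")
    outputs = model(**inputs)  # decoder inputs default to the input ids shifted right
    return outputs.last_hidden_state.shape  # (batch_size, target_length, d_model)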
# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartForConditionalGenerationModule with Bart->Blenderbot
class FlaxBlenderbotForConditionalGenerationModule(nn.Module):
    config: BlenderbotConfig
    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
    # Initializer for the final logits bias
    bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros

    def setup(self):
        # The underlying seq2seq model
        self.model = FlaxBlenderbotModule(config=self.config, dtype=self.dtype)
        # Language-modeling head: a dense projection onto the shared vocabulary
        self.lm_head = nn.Dense(
            self.model.shared.num_embeddings,
            use_bias=False,
            dtype=self.dtype,
            kernel_init=jax.nn.initializers.normal(self.config.init_std),
        )
        # Bias added to the final prediction logits, initialized to zeros
        self.final_logits_bias = self.param("final_logits_bias", self.bias_init, (1, self.model.shared.num_embeddings))

    def _get_encoder_module(self):
        return self.model.encoder

    def _get_decoder_module(self):
        return self.model.decoder
    # Forward pass: run the seq2seq model, then project to vocabulary logits
    def __call__(
        self,
        input_ids,
        attention_mask,
        decoder_input_ids,
        decoder_attention_mask,
        position_ids,
        decoder_position_ids,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
        deterministic: bool = True,
    ):
        # Run the underlying seq2seq model
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            position_ids=position_ids,
            decoder_position_ids=decoder_position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            deterministic=deterministic,
        )
        # Decoder hidden states
        hidden_states = outputs[0]
        # With tied word embeddings, reuse the shared embedding table as the LM head kernel
        if self.config.tie_word_embeddings:
            shared_embedding = self.model.variables["params"]["shared"]["embedding"]
            lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
        else:
            # Otherwise use the LM head's own kernel
            lm_logits = self.lm_head(hidden_states)
        # Add the final logits bias (kept out of the gradient computation)
        lm_logits += jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype))
        # Without return_dict, return a plain tuple
        if not return_dict:
            output = (lm_logits,) + outputs[1:]
            return output
        # Otherwise return a FlaxSeq2SeqLMOutput with the logits, hidden states and attentions
        return FlaxSeq2SeqLMOutput(
            logits=lm_logits,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )
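# --- Illustrative sketch, not part of the original file: the weight-tying trick used in
# `__call__` above. A Dense kernel has shape (in_features, out_features) while an embedding
# table has shape (vocab_size, d_model), so the transposed table can be substituted as the
# kernel of the LM head. `demo_tied_logits` is hypothetical.
def demo_tied_logits(hidden_states, embedding_table):
    lm_head = nn.Dense(embedding_table.shape[0], use_bias=False)
    # inject the transposed embedding table as the dense kernel, as done above
    return lm_head.apply({"params": {"kernel": embedding_table.T}}, hidden_states)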
@add_start_docstrings(
"The Blenderbot Model with a language modeling head. Can be used for summarization.", BLENDERBOT_START_DOCSTRING
)
class FlaxBlenderbotForConditionalGeneration(FlaxBlenderbotPreTrainedModel):
module_class = FlaxBlenderbotForConditionalGenerationModule
dtype: jnp.dtype = jnp.float32
@add_start_docstrings(BLENDERBOT_DECODE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=BlenderbotConfig)
def decode(
self,
decoder_input_ids,
encoder_outputs,
encoder_attention_mask: Optional[jnp.ndarray] = None,
decoder_attention_mask: Optional[jnp.ndarray] = None,
decoder_position_ids: Optional[jnp.ndarray] = None,
past_key_values: dict = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
train: bool = False,
params: dict = None,
dropout_rng: PRNGKey = None,
):
"""
Decode function for generation tasks.
Args:
- decoder_input_ids: Input tensor of decoder input token IDs.
- encoder_outputs: Output from the encoder.
- encoder_attention_mask: Optional tensor specifying which tokens in the encoder output should not be attended to.
- decoder_attention_mask: Optional tensor specifying which tokens in the decoder input should not be attended to.
- decoder_position_ids: Optional tensor specifying the position IDs for the decoder input tokens.
- past_key_values: Dictionary containing cached key-value states from previous decoding steps.
- output_attentions: Whether to output attention weights.
- output_hidden_states: Whether to output hidden states.
- return_dict: Whether to return a dictionary of outputs.
- train: Whether the model is in training mode.
- params: Additional parameters for decoding.
- dropout_rng: Dropout random number generator key.
Returns:
- FlaxCausalLMOutputWithCrossAttentions: Output object containing generated logits and optional cross-attention weights.
"""
# initializing the cache
batch_size, seq_length = decoder_input_ids.shape
past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
# Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
# But since the decoder uses a causal mask, those positions are masked anyways.
# Thus we can create a single static attention_mask here, which is more efficient for compilation
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
if decoder_attention_mask is not None:
position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
else:
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
return {
"past_key_values": past_key_values,
"encoder_outputs": encoder_outputs,
"encoder_attention_mask": attention_mask,
"decoder_attention_mask": extended_attention_mask,
"decoder_position_ids": position_ids,
}
def prepare_inputs_for_generation(
self,
decoder_input_ids,
max_length,
attention_mask: Optional[jax.Array] = None,
decoder_attention_mask: Optional[jax.Array] = None,
encoder_outputs=None,
**kwargs,
):
"""
Prepares inputs for generation by setting up attention masks and position IDs.
Args:
- decoder_input_ids: Tensor of input token IDs for the decoder.
- max_length: Maximum length of the generated sequence.
- attention_mask: Optional tensor specifying which tokens in the encoder output should not be attended to.
- decoder_attention_mask: Optional tensor specifying which tokens in the decoder input should not be attended to.
- encoder_outputs: Output from the encoder.
- **kwargs: Additional keyword arguments.
Returns:
- dict: Dictionary containing prepared inputs for the generation process.
"""
# initializing the cache
batch_size, seq_length = decoder_input_ids.shape
past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
# Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
# But since the decoder uses a causal mask, those positions are masked anyways.
# Thus we can create a single static attention_mask here, which is more efficient for compilation
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
if decoder_attention_mask is not None:
position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
else:
position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
return {
"past_key_values": past_key_values,
"encoder_outputs": encoder_outputs,
"encoder_attention_mask": attention_mask,
"decoder_attention_mask": extended_attention_mask,
"decoder_position_ids": position_ids,
}
def update_inputs_for_generation(self, model_outputs, model_kwargs):
"""
Updates inputs for the generation process by adjusting position IDs and past key values.
Args:
- model_outputs: Output from the model.
- model_kwargs: Keyword arguments for the model.
Returns:
- dict: Updated model keyword arguments for generation.
"""
model_kwargs["past_key_values"] = model_outputs.past_key_values
model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1
return model_kwargs
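# --- Illustrative sketch, not part of the original file: what the cumsum trick in
# `prepare_inputs_for_generation` above computes. Position ids stop advancing once the
# attention mask hits padding. Toy values; `demo_position_ids` is hypothetical.
def demo_position_ids(decoder_attention_mask):
    return decoder_attention_mask.cumsum(axis=-1) - 1


# demo_position_ids(jnp.array([[1, 1, 1, 0, 0]])) -> [[0, 1, 2, 2, 2]]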
FLAX_BLENDERBOT_CONDITIONAL_GENERATION_DOCSTRING = r"""
    Returns:

    Conversation example::

    ```
    >>> from transformers import AutoTokenizer, FlaxBlenderbotForConditionalGeneration

    >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill")
    >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")

    >>> UTTERANCE = "My friends are cool but they eat too many carbs."
    >>> inputs = tokenizer([UTTERANCE], max_length=1024, return_tensors="np")

    >>> # Generate Reply
    >>> reply_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5, early_stopping=True).sequences
    >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in reply_ids])
    ```
"""
# Overwrite the docstring of `FlaxBlenderbotForConditionalGeneration.__call__`, combining
# BLENDERBOT_INPUTS_DOCSTRING with FLAX_BLENDERBOT_CONDITIONAL_GENERATION_DOCSTRING
overwrite_call_docstring(
    FlaxBlenderbotForConditionalGeneration,
    BLENDERBOT_INPUTS_DOCSTRING + FLAX_BLENDERBOT_CONDITIONAL_GENERATION_DOCSTRING,
)
# Append/replace the return-value docstring of `FlaxBlenderbotForConditionalGeneration`,
# with output type `FlaxSeq2SeqLMOutput` and config class `_CONFIG_FOR_DOC`
append_replace_return_docstrings(
    FlaxBlenderbotForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC
)