Transformers Source Code Walkthrough (Part 3)

.\convert_slow_tokenizers_checkpoints_to_fast.py

# 设置脚本的编码格式为 UTF-8
# 版权声明,此代码归 HuggingFace Inc. 团队所有,使用 Apache 许可证 2.0 版本
#
# 根据许可证,除非符合许可证的要求,否则不能使用此文件
# 可以在以下网址获取许可证的副本:http://www.apache.org/licenses/LICENSE-2.0
#
# 如果适用法律要求或书面同意,本软件根据“现状”分发,不提供任何明示或暗示的保证或条件
# 请查阅许可证了解具体的使用条款和限制
""" 转换慢速分词器检查点为快速分词器的序列化格式(tokenizers 库的序列化格式)"""

# 导入必要的库和模块
import argparse  # 导入命令行参数解析模块
import os  # 导入操作系统功能模块

import transformers  # 导入 transformers 库

from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS  # 从当前目录导入慢速分词器转换器
from .utils import logging  # 从当前目录导入日志记录工具

# 设置日志输出为信息级别
logging.set_verbosity_info()

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 创建一个字典,将慢速分词器名称映射到其对应的快速分词器类
TOKENIZER_CLASSES = {name: getattr(transformers, name + "Fast") for name in SLOW_TO_FAST_CONVERTERS}


def convert_slow_checkpoint_to_fast(tokenizer_name, checkpoint_name, dump_path, force_download):
    # 如果指定的分词器名称不在 TOKENIZER_CLASSES 中,则引发 ValueError
    if tokenizer_name is not None and tokenizer_name not in TOKENIZER_CLASSES:
        raise ValueError(f"Unrecognized tokenizer name, should be one of {list(TOKENIZER_CLASSES.keys())}.")

    # 如果未指定分词器名称,则使用 TOKENIZER_CLASSES 中的所有分词器
    if tokenizer_name is None:
        tokenizer_names = TOKENIZER_CLASSES
    else:
        tokenizer_names = {tokenizer_name: getattr(transformers, tokenizer_name + "Fast")}

    # 记录日志,显示正在加载的分词器类信息
    logger.info(f"Loading tokenizer classes: {tokenizer_names}")
    # 遍历每个给定的分词器名称
    for tokenizer_name in tokenizer_names:
        # 获取与分词器名称对应的分词器类
        tokenizer_class = TOKENIZER_CLASSES[tokenizer_name]

        # 根据是否提供了检查点名称决定加载哪些检查点
        add_prefix = True
        if checkpoint_name is None:
            # 如果未提供检查点名称,则加载所有可用的检查点名称列表
            checkpoint_names = list(tokenizer_class.max_model_input_sizes.keys())
        else:
            # 否则,只加载指定的检查点名称
            checkpoint_names = [checkpoint_name]

        # 记录日志,显示正在加载哪个分词器类的哪些检查点
        logger.info(f"For tokenizer {tokenizer_class.__class__.__name__} loading checkpoints: {checkpoint_names}")

        # 遍历每个指定的检查点名称
        for checkpoint in checkpoint_names:
            # 记录日志,显示正在加载哪个分词器类的哪个具体检查点
            logger.info(f"Loading {tokenizer_class.__class__.__name__} {checkpoint}")

            # 加载分词器对象
            tokenizer = tokenizer_class.from_pretrained(checkpoint, force_download=force_download)

            # 记录日志,显示正在将快速分词器保存到指定路径,并指定前缀和是否添加前缀
            logger.info(f"Save fast tokenizer to {dump_path} with prefix {checkpoint} add_prefix {add_prefix}")

            # 根据检查点名称是否包含斜杠来决定文件保存路径
            if "/" in checkpoint:
                checkpoint_directory, checkpoint_prefix_name = checkpoint.split("/")
                dump_path_full = os.path.join(dump_path, checkpoint_directory)
            elif add_prefix:
                checkpoint_prefix_name = checkpoint
                dump_path_full = dump_path
            else:
                checkpoint_prefix_name = None
                dump_path_full = dump_path

            # 记录日志,显示保存路径和前缀信息
            logger.info(f"=> {dump_path_full} with prefix {checkpoint_prefix_name}, add_prefix {add_prefix}")

            # 检查是否需要添加额外路径,以适应特定的文件保存结构
            if checkpoint in list(tokenizer.pretrained_vocab_files_map.values())[0]:
                file_path = list(tokenizer.pretrained_vocab_files_map.values())[0][checkpoint]
                next_char = file_path.split(checkpoint)[-1][0]
                if next_char == "/":
                    dump_path_full = os.path.join(dump_path_full, checkpoint_prefix_name)
                    checkpoint_prefix_name = None

                # 记录日志,显示最终的保存路径和前缀信息
                logger.info(f"=> {dump_path_full} with prefix {checkpoint_prefix_name}, add_prefix {add_prefix}")

            # 保存预训练模型文件,并返回保存的文件名列表
            file_names = tokenizer.save_pretrained(
                dump_path_full, legacy_format=False, filename_prefix=checkpoint_prefix_name
            )
            # 记录日志,显示保存的文件名列表
            logger.info(f"=> File names {file_names}")

            # 遍历保存的文件列表,删除非tokenizer.json结尾的文件
            for file_name in file_names:
                if not file_name.endswith("tokenizer.json"):
                    os.remove(file_name)
                    logger.info(f"=> removing {file_name}")
if __name__ == "__main__":
    # 如果脚本作为主程序运行,则执行以下代码块

    parser = argparse.ArgumentParser()
    # 创建参数解析器对象

    # 必选参数
    parser.add_argument(
        "--dump_path", default=None, type=str, required=True, help="Path to output generated fast tokenizer files."
    )
    # 添加名为 --dump_path 的参数,类型为字符串,必选,用于指定生成的快速标记器文件的输出路径

    parser.add_argument(
        "--tokenizer_name",
        default=None,
        type=str,
        help=(
            f"Optional tokenizer type selected in the list of {list(TOKENIZER_CLASSES.keys())}. If not given, will "
            "download and convert all the checkpoints from AWS."
        ),
    )
    # 添加名为 --tokenizer_name 的参数,类型为字符串,可选,用于选择标记器类型。如果未提供,则将从 AWS 下载并转换所有检查点。

    parser.add_argument(
        "--checkpoint_name",
        default=None,
        type=str,
        help="Optional checkpoint name. If not given, will download and convert the canonical checkpoints from AWS.",
    )
    # 添加名为 --checkpoint_name 的参数,类型为字符串,可选,用于指定检查点的名称。如果未提供,则将从 AWS 下载并转换规范的检查点。

    parser.add_argument(
        "--force_download",
        action="store_true",
        help="Re-download checkpoints.",
    )
    # 添加名为 --force_download 的参数,动作为存储真值,用于强制重新下载检查点。

    args = parser.parse_args()
    # 解析命令行参数并存储到 args 对象中

    convert_slow_checkpoint_to_fast(args.tokenizer_name, args.checkpoint_name, args.dump_path, args.force_download)
    # 调用函数 convert_slow_checkpoint_to_fast,传递参数:标记器名称、检查点名称、输出路径和是否强制重新下载的标志
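
Taken together, the script above just instantiates each *Fast tokenizer class from a slow checkpoint and re-serializes it as a single `tokenizer.json`. Below is a minimal sketch of the same idea for one checkpoint, done by hand; the checkpoint name "bert-base-uncased" and the output directory "./fast_bert" are illustrative, not values used by the script.

# Minimal sketch of what convert_slow_checkpoint_to_fast does for a single checkpoint.
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")  # slow files are converted on load
file_names = tokenizer.save_pretrained("./fast_bert", legacy_format=False)  # non-legacy: tokenizer.json serialization
print(file_names)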

.\convert_tf_hub_seq_to_seq_bert_to_pytorch.py

# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Seq2Seq TF Hub checkpoint."""


import argparse  # 导入解析命令行参数的模块

from . import (  # 从当前包中导入以下模块
    BertConfig,  # 导入BertConfig类
    BertGenerationConfig,  # 导入BertGenerationConfig类
    BertGenerationDecoder,  # 导入BertGenerationDecoder类
    BertGenerationEncoder,  # 导入BertGenerationEncoder类
    load_tf_weights_in_bert_generation,  # 导入加载TF权重函数
    logging,  # 导入日志模块
)


logging.set_verbosity_info()  # 设置日志输出级别为info


def convert_tf_checkpoint_to_pytorch(tf_hub_path, pytorch_dump_path, is_encoder_named_decoder, vocab_size, is_encoder):
    # Initialise PyTorch model
    bert_config = BertConfig.from_pretrained(  # 从预训练配置中创建BertConfig对象
        "google-bert/bert-large-cased",  # 预训练模型名称
        vocab_size=vocab_size,  # 词汇表大小
        max_position_embeddings=512,  # 最大位置嵌入长度
        is_decoder=True,  # 设置为解码器模式
        add_cross_attention=True,  # 添加交叉注意力机制
    )
    bert_config_dict = bert_config.to_dict()  # 将BertConfig对象转换为字典形式
    del bert_config_dict["type_vocab_size"]  # 删除字典中的"type_vocab_size"键
    config = BertGenerationConfig(**bert_config_dict)  # 使用BertGenerationConfig类和字典初始化config对象
    if is_encoder:
        model = BertGenerationEncoder(config)  # 如果是编码器,创建BertGenerationEncoder模型
    else:
        model = BertGenerationDecoder(config)  # 如果不是编码器,创建BertGenerationDecoder模型
    print(f"Building PyTorch model from configuration: {config}")  # 打印构建PyTorch模型的配置信息

    # Load weights from tf checkpoint
    load_tf_weights_in_bert_generation(  # 载入TF检查点中的权重到模型中
        model,
        tf_hub_path,  # TensorFlow Hub检查点路径
        model_class="bert",  # 模型类别为BERT
        is_encoder_named_decoder=is_encoder_named_decoder,  # 是否将解码器命名为编码器
        is_encoder=is_encoder,  # 是否是编码器模型
    )

    # Save pytorch-model
    print(f"Save PyTorch model and config to {pytorch_dump_path}")  # 打印保存PyTorch模型和配置的路径
    model.save_pretrained(pytorch_dump_path)  # 保存预训练模型到指定路径


if __name__ == "__main__":
    parser = argparse.ArgumentParser()  # 创建参数解析器

    # Required parameters
    parser.add_argument(
        "--tf_hub_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
    )  # 添加tf_hub_path参数,必需,指定TensorFlow检查点路径
    parser.add_argument(
        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
    )  # 添加pytorch_dump_path参数,必需,指定输出PyTorch模型路径
    parser.add_argument(
        "--is_encoder_named_decoder",
        action="store_true",
        help="If decoder has to be renamed to encoder in PyTorch model.",
    )  # 添加is_encoder_named_decoder参数,如果需要在PyTorch模型中将解码器命名为编码器
    parser.add_argument("--is_encoder", action="store_true", help="If model is an encoder.")  # 添加is_encoder参数,如果模型是编码器
    parser.add_argument("--vocab_size", default=50358, type=int, help="Vocab size of model")  # 添加vocab_size参数,默认为50358,模型的词汇表大小
    args = parser.parse_args()  # 解析命令行参数
    convert_tf_checkpoint_to_pytorch(  # 调用转换函数,将TF检查点转换为PyTorch模型
        args.tf_hub_path,
        args.pytorch_dump_path,
        args.is_encoder_named_decoder,
        args.vocab_size,
        is_encoder=args.is_encoder,
    )
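
The interesting part of `convert_tf_checkpoint_to_pytorch` is the config handling: a regular `BertConfig` is turned into a `BertGenerationConfig` by dropping `type_vocab_size`, since BertGeneration models have no token-type embeddings. The sketch below reproduces that step in isolation, using default hyperparameters instead of the real `google-bert/bert-large-cased` ones and no TF checkpoint at all.

# Sketch of the config conversion only; the weights would normally come from the TF Hub checkpoint.
from transformers import BertConfig, BertGenerationConfig, BertGenerationDecoder

bert_config = BertConfig(
    vocab_size=50358,               # matches the script's default --vocab_size
    max_position_embeddings=512,
    is_decoder=True,
    add_cross_attention=True,
)
config_dict = bert_config.to_dict()
del config_dict["type_vocab_size"]  # BertGeneration has no token-type embeddings
config = BertGenerationConfig(**config_dict)

model = BertGenerationDecoder(config)  # randomly initialized stand-in for the converted model
print(type(model).__name__, config.is_decoder, config.add_cross_attention)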

.\data\datasets\glue.py

# 引入必要的模块和库
import os
import time
import warnings
from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional, Union

import torch
from filelock import FileLock
from torch.utils.data import Dataset

# Import the tokenizer base class (needed for the type annotation of `tokenizer` in GlueDataset.__init__)
from ...tokenization_utils_base import PreTrainedTokenizerBase
# 引入自定义的日志记录模块
from ...utils import logging
# 引入处理 GLUE 任务数据的相关方法和类
from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors
from ..processors.utils import InputFeatures

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)


@dataclass
class GlueDataTrainingArguments:
    """
    用于指定模型训练和评估所需数据的参数。

    使用 `HfArgumentParser` 可以将此类转换为 argparse 参数,以便能够在命令行上指定它们。
    """

    # GLUE 任务的名称,应与预处理器的键匹配
    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
    # 包含任务数据文件(如 .tsv 文件)的目录
    data_dir: str = field(
        metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
    )
    # 在标记化后的最大总输入序列长度,超过此长度的序列将被截断,长度不足的序列将被填充
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    # 是否覆盖缓存的训练和评估数据集
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )

    def __post_init__(self):
        self.task_name = self.task_name.lower()


class Split(Enum):
    train = "train"
    dev = "dev"
    test = "test"


class GlueDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    """

    # GLUE 数据集的参数
    args: GlueDataTrainingArguments
    # 输出模式
    output_mode: str
    # 输入特征列表
    features: List[InputFeatures]

    def __init__(
        self,
        args: GlueDataTrainingArguments,
        tokenizer: PreTrainedTokenizerBase,
        limit_length: Optional[int] = None,
        mode: Union[str, Split] = Split.train,
        cache_dir: Optional[str] = None,
    ):
        """
        初始化 GLUE 数据集。

        Args:
            args: GlueDataTrainingArguments 类的实例,包含数据集相关参数。
            tokenizer: 预训练的分词器。
            limit_length: 可选参数,限制数据长度。
            mode: 数据集模式,可以是字符串或 Split 枚举类型。
            cache_dir: 可选参数,缓存目录。
        """

        # 设置数据集参数
        self.args = args
        # 设置输出模式
        self.output_mode = glue_output_modes[args.task_name]
        # 从 GLUE 处理器中获取输入特征列表
        self.features = glue_convert_examples_to_features(
            tokenizer=tokenizer,
            examples=None,  # 此处通常传入数据示例
            max_length=args.max_seq_length,
            task=args.task_name,
            label_list=None,  # 此处通常传入标签列表
            output_mode=self.output_mode,
        )

    def __len__(self):
        """
        Return the number of examples in the dataset.
        """
        return len(self.features)

    def __getitem__(self, i) -> InputFeatures:
        """
        Return the input features at the given index.

        Args:
            i: the index.

        Returns:
            An InputFeatures instance.
        """
        return self.features[i]

    def get_labels(self):
        """
        Return the list of labels for the dataset.
        """
        return self.label_list
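
A typical way to exercise the class is shown below, assuming the MRPC .tsv files have already been downloaded to a local `./glue_data/MRPC` directory; the paths and model name are placeholders.

# Sketch: building a GlueDataset and inspecting one InputFeatures entry.
from transformers import AutoTokenizer, GlueDataset, GlueDataTrainingArguments

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
data_args = GlueDataTrainingArguments(task_name="mrpc", data_dir="./glue_data/MRPC", max_seq_length=128)

train_dataset = GlueDataset(data_args, tokenizer=tokenizer)
print(len(train_dataset), train_dataset.get_labels())
print(train_dataset[0].input_ids[:10])  # each item is an InputFeatures instance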

.\data\datasets\language_modeling.py

# 导入必要的模块和库
import json
import os
import pickle
import random
import time
import warnings
from typing import Dict, List, Optional

import torch
from filelock import FileLock
from torch.utils.data import Dataset

# 导入相对路径的模块和库
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 弃用警告信息
DEPRECATION_WARNING = (
    "This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
    "library. You can have a look at this example script for pointers: {0}"
)

class TextDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        file_path: str,
        block_size: int,
        overwrite_cache=False,
        cache_dir: Optional[str] = None,
    ):
        warnings.warn(
            DEPRECATION_WARNING.format(
                "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
            ),
            FutureWarning,
        )
        # 检查输入的文件路径是否存在,如果不存在则抛出异常
        if os.path.isfile(file_path) is False:
            raise ValueError(f"Input file path {file_path} not found")

        # 根据tokenizer的特殊token数目,调整block_size的大小
        block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)

        # 将文件路径拆分为目录和文件名
        directory, filename = os.path.split(file_path)
        # 设置缓存文件路径,包含模型名称、block_size和文件名等信息
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else directory,
            f"cached_lm_{tokenizer.__class__.__name__}_{block_size}_{filename}",
        )

        # 确保只有分布式训练中的第一个进程处理数据集,其他进程使用缓存
        lock_path = cached_features_file + ".lock"
        # 使用文件锁定确保并发安全性
        with FileLock(lock_path):
            # 如果缓存文件已存在且不需要覆盖,则加载缓存中的特征
            if os.path.exists(cached_features_file) and not overwrite_cache:
                start = time.time()
                with open(cached_features_file, "rb") as handle:
                    self.examples = pickle.load(handle)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )

            else:
                logger.info(f"Creating features from dataset file at {directory}")

                # 初始化self.examples为空列表
                self.examples = []
                # 打开文件并读取文本内容
                with open(file_path, encoding="utf-8") as f:
                    text = f.read()

                # 使用tokenizer将文本分词并转换为对应的token IDs
                tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

                # 根据block_size将tokenized_text分割成片段,并构建特征列表self.examples
                for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
                    self.examples.append(
                        tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
                    )
                # 注意,这里为简化起见,最后一个被截断的示例被丢弃了(没有进行填充)
                # 如果你的数据集很小,首先应该寻找更大的数据集,并且你可以通过添加(特定于模型的)填充来更改此行为。

                start = time.time()
                # 将self.examples保存到缓存文件中
                with open(cached_features_file, "wb") as handle:
                    pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
                logger.info(
                    f"Saving features into cached file {cached_features_file} [took {time.time() - start:.3f} s]"
                )

    # 返回self.examples的长度作为数据集的长度
    def __len__(self):
        return len(self.examples)

    # 根据索引返回对应的torch.Tensor对象,包含在self.examples中的数据
    def __getitem__(self, i) -> torch.Tensor:
        return torch.tensor(self.examples[i], dtype=torch.long)
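
The slicing loop in `__init__` is the whole trick: walk the token-id list in strides of `block_size` and drop whatever does not fill a complete block. A toy illustration with plain integers standing in for token ids:

# Toy illustration of TextDataset's block slicing (no tokenizer involved).
tokenized_text = list(range(10))  # pretend token ids
block_size = 4

blocks = [
    tokenized_text[i : i + block_size]
    for i in range(0, len(tokenized_text) - block_size + 1, block_size)
]
print(blocks)  # [[0, 1, 2, 3], [4, 5, 6, 7]] -- the trailing [8, 9] is discarded
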
class LineByLineTextDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int):
        # 发出警告,指出此方法即将被不依赖框架的方法取代
        warnings.warn(
            DEPRECATION_WARNING.format(
                "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
            ),
            FutureWarning,
        )
        # 检查文件路径是否存在,如果不存在则引发值错误异常
        if os.path.isfile(file_path) is False:
            raise ValueError(f"Input file path {file_path} not found")
        # 记录消息到日志,指示正在从文件路径创建数据集特征
        logger.info(f"Creating features from dataset file at {file_path}")

        # 使用 utf-8 编码打开文件,读取所有非空行并去除首尾空格
        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        # 使用给定的分词器对行进行编码,添加特殊标记并截断到指定长度
        batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
        # 将编码后的输入 IDs 存储在示例中
        self.examples = batch_encoding["input_ids"]
        # 将每个示例封装为包含输入 IDs 的字典,并使用长整型张量进行存储
        self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]

    def __len__(self):
        # 返回示例列表的长度,即数据集中示例的数量
        return len(self.examples)

    def __getitem__(self, i) -> Dict[str, torch.tensor]:
        # 返回索引为 i 的示例,该示例是包含输入 IDs 的字典
        return self.examples[i]


class LineByLineWithRefDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    """
    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, ref_path: str):
        # 发出警告,指示代码的某些功能将来会被弃用,并提供了更多信息的链接
        warnings.warn(
            DEPRECATION_WARNING.format(
                "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm_wwm.py"
            ),
            FutureWarning,
        )
        # 检查输入文件是否存在,如果不存在则引发 ValueError 异常
        if os.path.isfile(file_path) is False:
            raise ValueError(f"Input file path {file_path} not found")
        # 检查参考文件是否存在,如果不存在则引发 ValueError 异常
        if os.path.isfile(ref_path) is False:
            raise ValueError(f"Ref file path {file_path} not found")
        
        # 不缓存特征,假设很快将在所有地方使用来自 `tokenizers` 仓库的快速多线程分词器
        logger.info(f"Creating features from dataset file at {file_path}")
        logger.info(f"Use ref segment results at {ref_path}")
        
        # 使用 UTF-8 编码打开数据文件,并读取所有行到变量 data 中
        with open(file_path, encoding="utf-8") as f:
            data = f.readlines()  # 使用这种方法避免使用分隔符 '\u2029' 来分割行
        
        # 去除每行两端的空白字符,并排除空行,生成最终的数据列表
        data = [line.strip() for line in data if len(line) > 0 and not line.isspace()]
        
        # 使用 UTF-8 编码打开参考文件,并按行解析每行为 JSON 对象,生成 ref 列表
        with open(ref_path, encoding="utf-8") as f:
            ref = [json.loads(line) for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
        
        # 检查数据列表和参考列表的长度是否一致,如果不一致则引发 ValueError 异常
        if len(data) != len(ref):
            raise ValueError(
                f"Length of Input file should be equal to Ref file. But the length of {file_path} is {len(data)} "
                f"while length of {ref_path} is {len(ref)}"
            )

        # 使用 tokenizer 对数据进行编码处理,添加特殊标记并截断到指定的 block_size
        batch_encoding = tokenizer(data, add_special_tokens=True, truncation=True, max_length=block_size)
        
        # 将每个编码后的示例的 "input_ids" 存储为列表的形式,存储在 self.examples 中
        self.examples = batch_encoding["input_ids"]
        
        # 将每个 "input_ids" 转换为包含 torch.tensor 的字典形式,存储在 self.examples 中
        self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]

        # 为每个示例添加 "chinese_ref" 字段,值为参考数据的 torch.tensor 形式
        n = len(self.examples)
        for i in range(n):
            self.examples[i]["chinese_ref"] = torch.tensor(ref[i], dtype=torch.long)

    def __len__(self):
        # 返回示例列表的长度,用于确定数据集的大小
        return len(self.examples)

    def __getitem__(self, i) -> Dict[str, torch.tensor]:
        # 根据索引 i 返回对应的示例,为字典形式,包含 "input_ids" 和 "chinese_ref"
        return self.examples[i]
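
Each line of `ref_path` is a JSON list aligned with the corresponding text line; in the whole-word-masking example scripts it lists the token positions that continue a word, and the collator later reads it back as `chinese_ref`. Below is a small, self-contained sketch; the ref values are made up for illustration (real refs come from a helper such as the `run_chinese_ref.py` example), and `bert-base-chinese` is just an example checkpoint.

# Sketch: minimal text/ref files for LineByLineWithRefDataset.
import json
import os
import tempfile

from transformers import BertTokenizerFast
from transformers.data.datasets import LineByLineWithRefDataset

tmp_dir = tempfile.mkdtemp()
text_path = os.path.join(tmp_dir, "train.txt")
ref_path = os.path.join(tmp_dir, "train.ref")

with open(text_path, "w", encoding="utf-8") as f:
    f.write("我爱自然语言处理\n")
with open(ref_path, "w", encoding="utf-8") as f:
    f.write(json.dumps([3, 5, 7, 8]) + "\n")  # illustrative sub-word positions only

tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")
dataset = LineByLineWithRefDataset(tokenizer, file_path=text_path, block_size=32, ref_path=ref_path)
print(dataset[0]["input_ids"])
print(dataset[0]["chinese_ref"])
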
class LineByLineWithSOPTextDataset(Dataset):
    """
    Dataset for sentence order prediction task, prepare sentence pairs for SOP task
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, file_dir: str, block_size: int):
        # 发出警告,提醒此功能即将被弃用,并提供相关链接
        warnings.warn(
            DEPRECATION_WARNING.format(
                "https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py"
            ),
            FutureWarning,
        )
        # 如果提供的文件目录不是一个目录,则引发值错误异常
        if os.path.isdir(file_dir) is False:
            raise ValueError(f"{file_dir} is not a directory")
        # 记录信息,指出正在从指定文件夹创建数据集特征
        logger.info(f"Creating features from dataset file folder at {file_dir}")
        # 初始化空的示例列表
        self.examples = []
        # 遍历文件目录下的每个文件名
        for file_name in os.listdir(file_dir):
            file_path = os.path.join(file_dir, file_name)
            # 如果文件路径不是一个文件,则引发值错误异常
            if os.path.isfile(file_path) is False:
                raise ValueError(f"{file_path} is not a file")
            # 初始化文章打开标志为假
            article_open = False
            # 打开文件,使用UTF-8编码
            with open(file_path, encoding="utf-8") as f:
                # 读取原始行
                original_lines = f.readlines()
                # 初始化文章行列表
                article_lines = []
                # 遍历原始行
                for line in original_lines:
                    # 如果当前行包含"<doc id=",表示文章开始
                    if "<doc id=" in line:
                        article_open = True
                    # 如果当前行包含"</doc>",表示文章结束
                    elif "</doc>" in line:
                        article_open = False
                        # 将文章行列表中第二行开始(排除第一行标题)的每一行转换为token IDs
                        document = [
                            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(line))
                            for line in article_lines[1:]
                            if (len(line) > 0 and not line.isspace())
                        ]
                        # 根据文档创建示例,将其扩展到self.examples列表中
                        examples = self.create_examples_from_document(document, block_size, tokenizer)
                        self.examples.extend(examples)
                        # 清空文章行列表
                        article_lines = []
                    else:
                        # 如果文章正在打开,则将当前行添加到文章行列表中
                        if article_open:
                            article_lines.append(line)

        # 记录信息,指出数据集解析完成
        logger.info("Dataset parse finished.")

    def __len__(self):
        # 返回示例列表的长度
        return len(self.examples)

    def __getitem__(self, i) -> Dict[str, torch.tensor]:
        # 返回指定索引处的示例
        return self.examples[i]


class TextDatasetForNextSentencePrediction(Dataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        file_path: str,
        block_size: int,
        overwrite_cache=False,
        short_seq_probability=0.1,
        nsp_probability=0.5,
    ):
        # Abridged in this excerpt: the full implementation tokenizes the file into documents
        # (with on-disk caching) and builds the NSP sentence pairs via create_examples_from_document;
        # only the example container is shown here.
        self.examples = []

    def __len__(self):
        # 返回示例列表的长度
        return len(self.examples)

    def __getitem__(self, i):
        # 返回指定索引处的示例
        return self.examples[i]

.\data\datasets\squad.py

# 引入标准库和第三方库
import os
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional, Union

# 引入PyTorch相关库
import torch
from filelock import FileLock
from torch.utils.data import Dataset

# 引入HuggingFace Transformers相关模块
from ...models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from ..processors.squad import SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features

# 获取日志记录器
logger = logging.get_logger(__name__)

# 获取支持的模型配置类别列表
MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())

# 获取支持的模型类型元组
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

# 数据类,定义用于SQuAD数据训练的参数
@dataclass
class SquadDataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    # 模型类型,默认为None,可以选择在支持列表中的任意一个
    model_type: str = field(
        default=None, metadata={"help": "Model type selected in the list: " + ", ".join(MODEL_TYPES)}
    )
    # 数据目录,包含SQuAD任务的.json文件
    data_dir: str = field(
        default=None, metadata={"help": "The input data dir. Should contain the .json files for the SQuAD task."}
    )
    # 最大输入序列长度,经过标记后的最大总输入序列长度,超出此长度将被截断,较短的将被填充
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    # 文档步幅,将长文档拆分为块时,块之间的步幅
    doc_stride: int = field(
        default=128,
        metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
    )
    # 最大问题长度,问题的最大标记数,超出此长度将被截断
    max_query_length: int = field(
        default=64,
        metadata={
            "help": (
                "The maximum number of tokens for the question. Questions longer than this will "
                "be truncated to this length."
            )
        },
    )
    # 最大答案长度,可以生成的答案的最大长度,由于开始和结束预测不受彼此条件的影响
    max_answer_length: int = field(
        default=30,
        metadata={
            "help": (
                "The maximum length of an answer that can be generated. This is needed because the start "
                "and end predictions are not conditioned on one another."
            )
        },
    )
    # 覆盖缓存,是否覆盖缓存的训练和评估集
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    version_2_with_negative: bool = field(
        default=False, metadata={"help": "If true, the SQuAD examples contain some that do not have an answer."}
    )
    # 是否启用 v2 版本的模型,支持 SQuAD 数据集中一些没有答案的情况
    null_score_diff_threshold: float = field(
        default=0.0, metadata={"help": "If null_score - best_non_null is greater than the threshold predict null."}
    )
    # 如果 null_score - best_non_null 大于该阈值,则预测为空值
    n_best_size: int = field(
        default=20, metadata={"help": "If null_score - best_non_null is greater than the threshold predict null."}
    )
    # 生成最佳预测结果的数量上限
    lang_id: int = field(
        default=0,
        metadata={
            "help": (
                "language id of input for language-specific xlm models (see"
                " tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)"
            )
        },
    )
    # 语言 ID,用于特定语言的 XLM 模型输入(参见 tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)
    threads: int = field(default=1, metadata={"help": "multiple threads for converting example to features"})
    # 转换示例为特征时使用的线程数
class Split(Enum):
    # 定义枚举类 Split,包含 train 和 dev 两个成员
    train = "train"
    dev = "dev"


class SquadDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    """

    args: SquadDataTrainingArguments  # 类型注解,指定 args 为 SquadDataTrainingArguments 类型
    features: List[SquadFeatures]    # 类型注解,指定 features 为 SquadFeatures 类型的列表
    mode: Split                      # 类型注解,指定 mode 为 Split 枚举类型
    is_language_sensitive: bool      # 类型注解,指定 is_language_sensitive 为布尔类型

    def __init__(
        self,
        args: SquadDataTrainingArguments,        # 参数 args,类型为 SquadDataTrainingArguments
        tokenizer: PreTrainedTokenizer,          # 参数 tokenizer,类型为 PreTrainedTokenizer
        limit_length: Optional[int] = None,      # 可选参数 limit_length,类型为整数或 None
        mode: Union[str, Split] = Split.train,   # 参数 mode,类型为字符串或 Split 枚举,默认为 Split.train
        is_language_sensitive: Optional[bool] = False,  # 可选参数 is_language_sensitive,默认为 False
        cache_dir: Optional[str] = None,         # 可选参数 cache_dir,默认为 None
        dataset_format: Optional[str] = "pt",    # 可选参数 dataset_format,默认为 "pt"
    ):
        # Abridged in this excerpt: the full implementation loads the SQuAD examples with the
        # SquadV1Processor/SquadV2Processor, converts them to features (with on-disk caching),
        # and stores self.features, self.mode and self.is_language_sensitive.
        ...

    def __len__(self):
        # The dataset length is the number of features
        return len(self.features)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        # 获取第 i 个样本的特征,并将其转换为包含张量的字典格式返回

        # 从 features 中取出第 i 个特征
        feature = self.features[i]

        # 将特征的各个部分转换为张量
        input_ids = torch.tensor(feature.input_ids, dtype=torch.long)
        attention_mask = torch.tensor(feature.attention_mask, dtype=torch.long)
        token_type_ids = torch.tensor(feature.token_type_ids, dtype=torch.long)
        cls_index = torch.tensor(feature.cls_index, dtype=torch.long)
        p_mask = torch.tensor(feature.p_mask, dtype=torch.float)
        is_impossible = torch.tensor(feature.is_impossible, dtype=torch.float)

        # 构建输入字典
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }

        # 根据模型类型调整输入字典的内容
        if self.args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
            del inputs["token_type_ids"]

        if self.args.model_type in ["xlnet", "xlm"]:
            inputs.update({"cls_index": cls_index, "p_mask": p_mask})
            if self.args.version_2_with_negative:
                inputs.update({"is_impossible": is_impossible})
            if self.is_language_sensitive:
                inputs.update({"langs": (torch.ones(input_ids.shape, dtype=torch.int64) * self.args.lang_id)})

        # 如果模式是训练模式,添加起始位置和结束位置到输入字典中
        if self.mode == Split.train:
            start_positions = torch.tensor(feature.start_position, dtype=torch.long)
            end_positions = torch.tensor(feature.end_position, dtype=torch.long)
            inputs.update({"start_positions": start_positions, "end_positions": end_positions})

        # 返回构建好的输入字典
        return inputs

.\data\datasets\__init__.py

# 引入自定义模块中的不同数据集类和训练参数类

from .glue import GlueDataset, GlueDataTrainingArguments
from .language_modeling import (
    LineByLineTextDataset,
    LineByLineWithRefDataset,
    LineByLineWithSOPTextDataset,
    TextDataset,
    TextDatasetForNextSentencePrediction,
)
from .squad import SquadDataset, SquadDataTrainingArguments

.\data\data_collator.py

# 导入必要的模块和类
import random  # 导入随机数模块
import warnings  # 导入警告模块
from collections.abc import Mapping  # 从collections.abc模块导入Mapping类
from dataclasses import dataclass  # 导入dataclass装饰器
from random import randint  # 从random模块导入randint函数
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union  # 导入多种类型声明

import numpy as np  # 导入NumPy模块

from ..models.bert import BertTokenizer, BertTokenizerFast  # 从上级目录的models.bert模块导入BertTokenizer和BertTokenizerFast类
from ..tokenization_utils_base import PreTrainedTokenizerBase  # 从上级目录的tokenization_utils_base模块导入PreTrainedTokenizerBase类
from ..utils import PaddingStrategy  # 从上级目录的utils模块导入PaddingStrategy类

InputDataClass = NewType("InputDataClass", Any)  # 定义新类型InputDataClass

"""
A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
of PyTorch/TensorFlow tensors or NumPy arrays.
"""
DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, Any]])  # 定义新类型DataCollator


class DataCollatorMixin:
    def __call__(self, features, return_tensors=None):
        # 确定返回的张量类型,默认与实例的return_tensors属性相同
        if return_tensors is None:
            return_tensors = self.return_tensors
        # 如果返回类型为"tf",调用tf_call方法处理features
        if return_tensors == "tf":
            return self.tf_call(features)
        # 如果返回类型为"pt",调用torch_call方法处理features
        elif return_tensors == "pt":
            return self.torch_call(features)
        # 如果返回类型为"np",调用numpy_call方法处理features
        elif return_tensors == "np":
            return self.numpy_call(features)
        else:
            # 如果返回类型不是预期的类型,抛出值错误异常
            raise ValueError(f"Framework '{return_tensors}' not recognized!")


def pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs):
    """
    Pads without triggering the warning about how using the pad function is sub-optimal when using a fast tokenizer.
    """
    
    # 避免使用快速分词器时触发的填充警告
    # 如果tokenizer没有deprecation_warnings属性,直接调用pad方法进行填充
    if not hasattr(tokenizer, "deprecation_warnings"):
        return tokenizer.pad(*pad_args, **pad_kwargs)

    # 保存警告状态,并且禁用相关警告
    warning_state = tokenizer.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False)
    tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True

    try:
        # 调用tokenizer的pad方法进行填充
        padded = tokenizer.pad(*pad_args, **pad_kwargs)
    finally:
        # 恢复警告状态
        tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = warning_state

    return padded


def default_data_collator(features: List[InputDataClass], return_tensors="pt") -> Dict[str, Any]:
    """
    Very simple data collator that simply collates batches of dict-like objects and performs special handling for
    """
    """
    potential keys named:

        - `label`: handles a single value (int or float) per object
        - `label_ids`: handles a list of values per object

    Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
    to the model. See glue and ner for example of how it's useful.
    """
    
    # In this function we'll make the assumption that all `features` in the batch
    # have the same attributes.
    # So we will look at the first element as a proxy for what attributes exist
    # on the whole batch.
    
    # 根据 `return_tensors` 参数的值选择合适的数据收集器函数并返回结果

    if return_tensors == "pt":
        # 如果 `return_tensors` 是 "pt",则使用 PyTorch 默认的数据收集器
        return torch_default_data_collator(features)
    elif return_tensors == "tf":
        # 如果 `return_tensors` 是 "tf",则使用 TensorFlow 默认的数据收集器
        return tf_default_data_collator(features)
    elif return_tensors == "np":
        # 如果 `return_tensors` 是 "np",则使用 NumPy 默认的数据收集器
        return numpy_default_data_collator(features)
@dataclass
class DefaultDataCollator(DataCollatorMixin):
    """
    Very simple data collator that simply collates batches of dict-like objects and performs special handling for
    potential keys named:

        - `label`: handles a single value (int or float) per object
        - `label_ids`: handles a list of values per object

    Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
    to the model. See glue and ner for example of how it's useful.

    This is an object (like other data collators) rather than a pure function like default_data_collator. This can be
    helpful if you need to set a return_tensors value at initialization.

    Args:
        return_tensors (`str`, *optional*, defaults to `"pt"`):
            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
    """

    return_tensors: str = "pt"

    def __call__(self, features: List[Dict[str, Any]], return_tensors=None) -> Dict[str, Any]:
        # If return_tensors is not provided, default to the value set during initialization
        if return_tensors is None:
            return_tensors = self.return_tensors
        # Call the default_data_collator function with the specified return_tensors value
        return default_data_collator(features, return_tensors)


def torch_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
    import torch

    # If features list contains objects that are not mappings, convert them to dictionaries
    if not isinstance(features[0], Mapping):
        features = [vars(f) for f in features]
    # Retrieve the first feature dictionary
    first = features[0]
    # Initialize an empty batch dictionary
    batch = {}

    # Special handling for labels.
    # Ensure that tensor is created with the correct type
    # (it should be automatically the case, but let's make sure of it.)
    if "label" in first and first["label"] is not None:
        # Extract the label value and determine its dtype (long or float)
        label = first["label"].item() if isinstance(first["label"], torch.Tensor) else first["label"]
        dtype = torch.long if isinstance(label, int) else torch.float
        # Create a tensor batch["labels"] containing labels from all features
        batch["labels"] = torch.tensor([f["label"] for f in features], dtype=dtype)
    elif "label_ids" in first and first["label_ids"] is not None:
        # Handle case where label_ids are present
        if isinstance(first["label_ids"], torch.Tensor):
            batch["labels"] = torch.stack([f["label_ids"] for f in features])
        else:
            dtype = torch.long if isinstance(first["label_ids"][0], int) else torch.float
            batch["labels"] = torch.tensor([f["label_ids"] for f in features], dtype=dtype)

    # Handling of all other possible keys.
    # Again, we will use the first element to figure out which key/values are not None for this model.
    for k, v in first.items():
        # Process each key-value pair in the first feature dictionary
        if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
            if isinstance(v, torch.Tensor):
                batch[k] = torch.stack([f[k] for f in features])
            elif isinstance(v, np.ndarray):
                batch[k] = torch.tensor(np.stack([f[k] for f in features]))
            else:
                batch[k] = torch.tensor([f[k] for f in features])

    return batch
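
A toy call makes the label handling concrete: an integer `label` becomes a long `labels` tensor, and every other non-string key is simply stacked into a batch tensor.

# Toy batch through the PyTorch default collator.
from transformers.data.data_collator import torch_default_data_collator

features = [
    {"input_ids": [101, 7592, 102], "attention_mask": [1, 1, 1], "label": 0},
    {"input_ids": [101, 2088, 102], "attention_mask": [1, 1, 1], "label": 1},
]
batch = torch_default_data_collator(features)
print(batch["labels"])           # tensor([0, 1]), dtype torch.long
print(batch["input_ids"].shape)  # torch.Size([2, 3])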


def tf_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
    import tensorflow as tf

    # 检查 features 列表中第一个元素是否是 Mapping 类型(字典类型)
    if not isinstance(features[0], Mapping):
        # 如果不是 Mapping 类型,则将 features 中的每个元素转换为字典类型
        features = [vars(f) for f in features]
    
    # 获取 features 中的第一个元素
    first = features[0]
    
    # 初始化空字典 batch
    batch = {}

    # 处理标签数据的特殊情况。
    # 确保使用正确的数据类型创建张量
    # (通常应该自动处理,但我们需要确保这一点。)
    if "label" in first and first["label"] is not None:
        label_col_name = "label"
    elif "label_ids" in first and first["label_ids"] is not None:
        label_col_name = "label_ids"
    elif "labels" in first and first["labels"] is not None:
        label_col_name = "labels"
    else:
        label_col_name = None
    
    # 如果存在标签列名
    if label_col_name is not None:
        # 根据第一个元素的标签数据类型,确定 dtype
        if isinstance(first[label_col_name], tf.Tensor):
            dtype = tf.int64 if first[label_col_name].dtype.is_integer else tf.float32
        elif isinstance(first[label_col_name], np.ndarray) or isinstance(first[label_col_name], np.generic):
            dtype = tf.int64 if np.issubdtype(first[label_col_name].dtype, np.integer) else tf.float32
        elif isinstance(first[label_col_name], (tuple, list)):
            dtype = tf.int64 if isinstance(first[label_col_name][0], int) else tf.float32
        else:
            dtype = tf.int64 if isinstance(first[label_col_name], int) else tf.float32
        
        # 将 features 中的标签数据转换为张量,存储在 batch 中的 "labels" 键下
        batch["labels"] = tf.convert_to_tensor([f[label_col_name] for f in features], dtype=dtype)
    
    # 处理除标签以外的所有可能键。
    # 再次使用第一个元素来确定哪些键/值对在此模型中不为 None。
    for k, v in first.items():
        if k not in ("label", "label_ids", "labels") and v is not None and not isinstance(v, str):
            # 如果值是张量或者 numpy 数组,则将 features 中的相应值堆叠为张量
            if isinstance(v, (tf.Tensor, np.ndarray)):
                batch[k] = tf.stack([f[k] for f in features])
            else:
                # 否则,将 features 中的相应值转换为张量
                batch[k] = tf.convert_to_tensor([f[k] for f in features])

    # 返回构建好的 batch 字典
    return batch
# 根据输入特征列表创建批处理数据字典,适用于 NumPy 默认数据格式
def numpy_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]:
    # 如果第一个特征不是映射类型,则将每个特征对象转换为其变量字典表示
    if not isinstance(features[0], Mapping):
        features = [vars(f) for f in features]
    # 获取第一个特征对象
    first = features[0]
    # 初始化批处理字典
    batch = {}

    # 处理标签的特殊情况
    # 确保使用正确类型创建张量
    # (虽然通常应该自动转换,但我们还是确保类型正确)
    if "label" in first and first["label"] is not None:
        # 如果标签是 NumPy 数组,则将其转换为标量
        label = first["label"].item() if isinstance(first["label"], np.ndarray) else first["label"]
        # 确定标签数据类型
        dtype = np.int64 if isinstance(label, int) else np.float32
        batch["labels"] = np.array([f["label"] for f in features], dtype=dtype)
    elif "label_ids" in first and first["label_ids"] is not None:
        # 如果标签 IDs 是 NumPy 数组,则堆叠它们
        if isinstance(first["label_ids"], np.ndarray):
            batch["labels"] = np.stack([f["label_ids"] for f in features])
        else:
            # 否则,确定标签 IDs 的数据类型
            dtype = np.int64 if isinstance(first["label_ids"][0], int) else np.float32
            batch["labels"] = np.array([f["label_ids"] for f in features], dtype=dtype)

    # 处理所有其他可能的键
    # 再次使用第一个元素确定该模型中哪些键/值不为 None
    for k, v in first.items():
        if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
            if isinstance(v, np.ndarray):
                # 如果值是 NumPy 数组,则堆叠它们
                batch[k] = np.stack([f[k] for f in features])
            else:
                # 否则,将值转换为数组
                batch[k] = np.array([f[k] for f in features])

    # 返回批处理数据字典
    return batch


@dataclass
class DataCollatorWithPadding:
    """
    Data collator that will dynamically pad the inputs received.

    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            The tokenizer used for encoding the data.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:

            - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        return_tensors (`str`, *optional*, defaults to `"pt"`):
            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        # 调用 pad_without_fast_tokenizer_warning 函数进行批量填充
        batch = pad_without_fast_tokenizer_warning(
            self.tokenizer,
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )
        # 如果 batch 中有 "label" 键,将其重命名为 "labels",并删除 "label"
        if "label" in batch:
            batch["labels"] = batch["label"]
            del batch["label"]
        # 如果 batch 中有 "label_ids" 键,将其重命名为 "labels",并删除 "label_ids"
        if "label_ids" in batch:
            batch["labels"] = batch["label_ids"]
            del batch["label_ids"]
        # 返回处理后的 batch 字典
        return batch
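
In training code this collator is usually passed as the `collate_fn` of a DataLoader (or to Trainer) so each batch is padded to its own longest sequence. A short sketch with two sequences of different lengths; the model name is just an example.

# Sketch: dynamic padding to the longest sequence in the batch (rounded up to a multiple of 8).
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

features = [
    {**tokenizer("short sentence"), "label": 0},
    {**tokenizer("a noticeably longer sentence to force some padding"), "label": 1},
]
batch = collator(features)
print(batch["input_ids"].shape)  # both rows padded to the same multiple-of-8 length
print(batch["labels"])           # the "label" key was renamed to "labels"
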
@dataclass
class DataCollatorForTokenClassification(DataCollatorMixin):
    """
    Data collator that will dynamically pad the inputs received, as well as the labels.

    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            The tokenizer used for encoding the data.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:

            - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        label_pad_token_id (`int`, *optional*, defaults to -100):
            The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions).
        return_tensors (`str`, *optional*, defaults to `"pt"`):
            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
    """

    tokenizer: PreTrainedTokenizerBase  # Tokenizer对象,用于数据编码
    padding: Union[bool, str, PaddingStrategy] = True  # 序列填充策略:可以是布尔值、字符串或填充策略对象,默认为True
    max_length: Optional[int] = None  # 返回列表的最大长度及填充长度(可选)
    pad_to_multiple_of: Optional[int] = None  # 如果设置,将序列填充到提供的值的倍数(可选)
    label_pad_token_id: int = -100  # 标签填充时使用的ID,默认为-100,PyTorch损失函数会自动忽略这些ID
    return_tensors: str = "pt"  # 返回的Tensor类型,默认为"pt",可选值有"np"、"pt"和"tf"
    # 定义一个使用 Torch 处理特征的方法
    def torch_call(self, features):
        # 导入 PyTorch 库
        import torch

        # 确定标签名称是 "label" 还是 "labels"
        label_name = "label" if "label" in features[0].keys() else "labels"
        # 如果特征中包含标签,提取所有标签值到列表;否则设置标签为 None
        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None

        # 剔除特征中的标签,生成没有标签的特征字典列表
        no_labels_features = [{k: v for k, v in feature.items() if k != label_name} for feature in features]

        # 使用自定义函数进行填充(此处函数的具体实现未显示),生成批处理数据
        batch = pad_without_fast_tokenizer_warning(
            self.tokenizer,
            no_labels_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",  # 返回 PyTorch 张量
        )

        # 如果没有标签,直接返回批处理数据
        if labels is None:
            return batch

        # 获取输入序列的长度
        sequence_length = batch["input_ids"].shape[1]
        # 获取填充的位置(左或右)
        padding_side = self.tokenizer.padding_side

        # 定义一个函数,将张量或可迭代对象转换为列表
        def to_list(tensor_or_iterable):
            if isinstance(tensor_or_iterable, torch.Tensor):
                return tensor_or_iterable.tolist()
            return list(tensor_or_iterable)

        # 根据填充的位置对标签进行填充
        if padding_side == "right":
            batch[label_name] = [
                to_list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
            ]
        else:
            batch[label_name] = [
                [self.label_pad_token_id] * (sequence_length - len(label)) + to_list(label) for label in labels
            ]

        # 将填充后的标签转换为 PyTorch 的 int64 类型张量
        batch[label_name] = torch.tensor(batch[label_name], dtype=torch.int64)
        return batch

    # 定义一个使用 TensorFlow 处理特征的方法
    def tf_call(self, features):
        # 导入 TensorFlow 库
        import tensorflow as tf

        # 确定标签名称是 "label" 还是 "labels"
        label_name = "label" if "label" in features[0].keys() else "labels"
        # 如果特征中包含标签,提取所有标签值到列表;否则设置标签为 None
        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None

        # 使用自定义函数进行填充(此处函数的具体实现未显示),生成批处理数据
        batch = pad_without_fast_tokenizer_warning(
            self.tokenizer,
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            # 如果标签为 None,则返回 TensorFlow 张量;否则不进行转换
            return_tensors="tf" if labels is None else None,
        )

        # 如果没有标签,直接返回批处理数据
        if labels is None:
            return batch

        # 获取输入序列的长度
        sequence_length = tf.convert_to_tensor(batch["input_ids"]).shape[1]
        # 获取填充的位置(左或右)
        padding_side = self.tokenizer.padding_side

        # 根据填充的位置对标签进行填充
        if padding_side == "right":
            batch["labels"] = [
                list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
            ]
        else:
            batch["labels"] = [
                [self.label_pad_token_id] * (sequence_length - len(label)) + list(label) for label in labels
            ]

        # 将填充后的标签转换为 TensorFlow 的 int64 类型张量,并将批处理字典中的所有值转换为 TensorFlow 张量
        batch = {k: tf.convert_to_tensor(v, dtype=tf.int64) for k, v in batch.items()}
        return batch
    # 定义一个方法,用于处理特征数据并返回一个批次的 numpy 数组
    def numpy_call(self, features):
        # 确定标签名称是 "label" 还是 "labels"
        label_name = "label" if "label" in features[0].keys() else "labels"
        # 如果特征中包含标签信息,则从每个特征中提取标签,否则设为 None
        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
        # 使用自定义的 pad_without_fast_tokenizer_warning 函数对特征进行填充,转换成批次
        batch = pad_without_fast_tokenizer_warning(
            self.tokenizer,
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            # 如果没有标签信息,则返回的批次为 numpy 数组
            return_tensors="np" if labels is None else None,
        )

        # 如果特征中没有标签信息,则直接返回批次
        if labels is None:
            return batch

        # 计算输入序列长度
        sequence_length = np.array(batch["input_ids"]).shape[1]
        # 获取填充位置(左侧或右侧)
        padding_side = self.tokenizer.padding_side
        # 根据填充位置,为每个标签添加填充标记,使它们长度相同
        if padding_side == "right":
            batch["labels"] = [
                list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
            ]
        else:
            batch["labels"] = [
                [self.label_pad_token_id] * (sequence_length - len(label)) + list(label) for label in labels
            ]

        # 将批次中的每个键值对的值转换为 numpy 数组,类型为 int64
        batch = {k: np.array(v, dtype=np.int64) for k, v in batch.items()}
        # 返回处理后的批次
        return batch
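
The effect of the label padding is easiest to see on a tiny NER-style batch: the per-token labels are extended with `label_pad_token_id` (-100) up to the padded sequence length, so the loss ignores padded positions. A sketch on the PyTorch path; the model name and token ids are illustrative.

# Sketch: word-label padding with label_pad_token_id (-100).
from transformers import AutoTokenizer, DataCollatorForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorForTokenClassification(tokenizer)

features = [
    {"input_ids": [101, 1996, 4248, 102], "labels": [-100, 1, 2, -100]},
    {"input_ids": [101, 4937, 102], "labels": [-100, 3, -100]},
]
batch = collator(features)
print(batch["labels"])
# tensor([[-100,    1,    2, -100],
#         [-100,    3, -100, -100]])  -- the second row was right-padded with -100
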
def _torch_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
    import torch

    # Tensorize if necessary.
    if isinstance(examples[0], (list, tuple, np.ndarray)):
        examples = [torch.tensor(e, dtype=torch.long) for e in examples]

    length_of_first = examples[0].size(0)

    # Check if padding is necessary.
    are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
    if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
        return torch.stack(examples, dim=0)

    # If yes, check if we have a `pad_token`.
    if tokenizer._pad_token is None:
        raise ValueError(
            "You are attempting to pad samples but the tokenizer you are using"
            f" ({tokenizer.__class__.__name__}) does not have a pad token."
        )

    # Creating the full tensor and filling it with our data.
    max_length = max(x.size(0) for x in examples)
    if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
    result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id)
    for i, example in enumerate(examples):
        if tokenizer.padding_side == "right":
            result[i, : example.shape[0]] = example
        else:
            result[i, -example.shape[0] :] = example
    return result
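
The helper's behavior can be checked directly: equal-length examples are just stacked, while ragged ones are padded with `pad_token_id` on the tokenizer's padding side, optionally rounded up to a multiple. A small sketch (model name illustrative):

# Sketch: _torch_collate_batch stacks equal-length examples and pads ragged ones.
import torch

from transformers import AutoTokenizer
from transformers.data.data_collator import _torch_collate_batch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # pad_token_id == 0
examples = [
    torch.tensor([101, 7592, 102]),
    torch.tensor([101, 2088, 2003, 2307, 102]),
]

padded = _torch_collate_batch(examples, tokenizer, pad_to_multiple_of=8)
print(padded.shape)  # torch.Size([2, 8]) -- max length 5 rounded up to the next multiple of 8
print(padded[0])     # tensor([ 101, 7592,  102,    0,    0,    0,    0,    0])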


def _tf_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None):
    import tensorflow as tf

    """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary."""
    # Tensorize if necessary.
    if isinstance(examples[0], (list, tuple)):
        examples = [tf.convert_to_tensor(e, dtype=tf.int64) for e in examples]

    # Check if padding is necessary.
    length_of_first = len(examples[0])
    are_tensors_same_length = all(len(x) == length_of_first for x in examples)
    if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0):
        return tf.stack(examples, axis=0)

    # If yes, check if we have a `pad_token`.
    if tokenizer._pad_token is None:
        raise ValueError(
            "You are attempting to pad samples but the tokenizer you are using"
            f" ({tokenizer.__class__.__name__}) does not have a pad token."
        )

    # Creating the full tensor and filling it with our data.
    max_length = max(len(x) for x in examples)
    if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
    
    # Prepare paddings based on tensor rank.
    result = []
    rank = tf.rank(examples[0])
    paddings = np.zeros((rank, 2), dtype=np.int32)
    # 遍历给定的示例列表
    for example in examples:
        # 检查分词器的填充位置是否在右侧
        if tokenizer.padding_side == "right":
            # 如果填充在右侧,计算需要填充的长度并更新填充数组的第一行第二列
            paddings[0, 1] = max_length - len(example)
        else:
            # 如果填充在左侧,计算需要填充的长度并更新填充数组的第一行第一列
            paddings[0, 0] = max_length - len(example)
        # 使用 TensorFlow 的填充函数对示例进行填充,使用填充数组和分词器的填充标记值
        result.append(tf.pad(example, paddings, constant_values=tokenizer.pad_token_id))
    # 将填充后的示例堆叠成一个张量,沿着第一个维度(批次维度)
    return tf.stack(result, axis=0)
# 定义一个数据收集器,用于序列到序列模型的数据处理
@dataclass
class DataCollatorForSeq2Seq:
    """
    Data collator that will dynamically pad the inputs received, as well as the labels.
    """
    # 定义函数参数和类型注解,说明函数的输入参数和可选参数
    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            用于对数据进行编码的分词器。
        model ([`PreTrainedModel`], *optional*):
            正在训练的模型。如果设置并且具有 *prepare_decoder_input_ids_from_labels* 方法,
            则使用它来准备 *decoder_input_ids*。
            
            当使用 *label_smoothing* 时,这很有用,可以避免重复计算损失。
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            选择一种策略来填充返回的序列(根据模型的填充方向和填充索引),可选值包括:
    
            - `True` 或 `'longest'`(默认):填充到批次中最长的序列(如果只提供单个序列,则不填充)。
            - `'max_length'`:填充到指定的最大长度(通过参数 `max_length` 提供),或者如果未提供该参数,则填充到模型的最大可接受输入长度。
            - `False` 或 `'do_not_pad'`:不填充(即可以输出具有不同长度的序列批次)。
        max_length (`int`, *optional*):
            返回列表的最大长度,也可以作为填充长度(参见上文)。
        pad_to_multiple_of (`int`, *optional*):
            如果设置,将序列填充到提供的值的倍数。
    
            这对于在具有计算能力 >= 7.5(Volta)的 NVIDIA 硬件上启用张量核心特别有用。
        label_pad_token_id (`int`, *optional*, defaults to -100):
            用于填充标签时的标识符(-100 将被 PyTorch 损失函数自动忽略)。
        return_tensors (`str`, *optional*, defaults to `"pt"`):
            要返回的张量类型。允许的值有 "np"、"pt" 和 "tf"。
    def __call__(self, features, return_tensors=None):
        # 如果没有指定返回张量类型,则使用预设的 return_tensors
        if return_tensors is None:
            return_tensors = self.return_tensors
        # 从 features 中提取标签列表,如果 features 的第一个元素包含 "labels" 键
        labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None
        # 在调用 `tokenizer.pad` 之前,需要对标签进行填充,因为该方法不会进行填充,并且需要所有标签长度相同以返回张量
        if labels is not None:
            # 计算最长的标签长度
            max_label_length = max(len(l) for l in labels)
            # 如果指定了 pad_to_multiple_of,调整最大标签长度使其成为该值的倍数
            if self.pad_to_multiple_of is not None:
                max_label_length = (
                    (max_label_length + self.pad_to_multiple_of - 1)
                    // self.pad_to_multiple_of
                    * self.pad_to_multiple_of
                )

            # 获取填充的位置(左侧或右侧)
            padding_side = self.tokenizer.padding_side
            # 对每个 feature 进行标签填充
            for feature in features:
                # 计算需要填充的空位,用 label_pad_token_id 填充
                remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"]))
                # 如果标签是列表
                if isinstance(feature["labels"], list):
                    feature["labels"] = (
                        feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"]
                    )
                # 如果填充在右侧
                elif padding_side == "right":
                    feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64)
                # 如果填充在左侧
                else:
                    feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64)

        # 使用 pad_without_fast_tokenizer_warning 函数对 features 进行填充,避免使用快速分词器警告
        features = pad_without_fast_tokenizer_warning(
            self.tokenizer,
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=return_tensors,
        )

        # 准备 decoder_input_ids
        if (
            labels is not None
            and self.model is not None
            and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
        ):
            # 根据标签准备 decoder_input_ids
            decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(labels=features["labels"])
            features["decoder_input_ids"] = decoder_input_ids

        # 返回处理后的 features
        return features
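
下面是 DataCollatorForSeq2Seq 的一个简要使用示例(非原始源码;检查点名称 "t5-small" 仅为演示假设):

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, label_pad_token_id=-100)

features = [
    {"input_ids": tokenizer("translate English to German: Hello").input_ids,
     "labels": tokenizer("Hallo").input_ids},
    {"input_ids": tokenizer("translate English to German: Thank you very much").input_ids,
     "labels": tokenizer("Vielen Dank").input_ids},
]
batch = collator(features)
# batch 中包含按批内最长序列填充后的 input_ids、attention_mask、用 -100 填充的 labels,
# 以及由 model.prepare_decoder_input_ids_from_labels 生成的 decoder_input_ids
print({k: v.shape for k, v in batch.items()})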
    """
    Language modeling数据收集器。如果输入的长度不同,输入会被动态填充到一个batch的最大长度。

    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            用于编码数据的分词器。
        mlm (`bool`, *optional*, defaults to `True`):
            是否使用masked language modeling。如果设置为`False`,则标签与输入相同,忽略填充的标记(通过将它们设置为-100)。否则,非masked的标记为-100,要预测的masked标记为其他值。
        mlm_probability (`float`, *optional*, defaults to 0.15):
            当`mlm`设置为`True`时,随机mask输入中的token的概率。
        pad_to_multiple_of (`int`, *optional*):
            如果设置,则将序列填充到提供的值的倍数。
        return_tensors (`str`):
            要返回的Tensor类型。允许的值为"np"、"pt"和"tf"。

    <Tip>

    为了最佳性能,此数据收集器应与具有项为字典或BatchEncoding的数据集一起使用,这些数据集具有"special_tokens_mask"键,该键由[`PreTrainedTokenizer`]或[`PreTrainedTokenizerFast`]返回,参数`return_special_tokens_mask=True`。

    </Tip>"""

    tokenizer: PreTrainedTokenizerBase
    mlm: bool = True
    mlm_probability: float = 0.15
    pad_to_multiple_of: Optional[int] = None
    tf_experimental_compile: bool = False
    return_tensors: str = "pt"

    def __post_init__(self):
        if self.mlm and self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. "
                "You should pass `mlm=False` to train on causal language modeling instead."
            )
        if self.tf_experimental_compile:
            import tensorflow as tf

            self.tf_mask_tokens = tf.function(self.tf_mask_tokens, jit_compile=True)

    @staticmethod
    def tf_bernoulli(shape, probability):
        import tensorflow as tf

        prob_matrix = tf.fill(shape, probability)
        return tf.cast(prob_matrix - tf.random.uniform(shape, 0, 1) >= 0, tf.bool)

    def tf_mask_tokens(
        self, inputs: Any, vocab_size, mask_token_id, special_tokens_mask: Optional[Any] = None
    ) -> Tuple[Any, Any]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.

        Args:
            inputs (Any): 输入的数据(token ID 张量)。
            vocab_size (int): 词汇表的大小。
            mask_token_id (int): 用作掩码标记的 token ID。
            special_tokens_mask (Optional[Any]): 特殊 token 的掩码,这些位置不会被选中进行掩码。
        """
        import tensorflow as tf

        # 将 mask_token_id 转换为与 inputs 相同的数据类型
        mask_token_id = tf.cast(mask_token_id, inputs.dtype)

        # 获取输入张量的形状
        input_shape = tf.shape(inputs)

        # 为了进行 MLM 训练,以概率 self.mlm_probability 对每个序列中的部分 token 进行掩码操作
        masked_indices = self.tf_bernoulli(input_shape, self.mlm_probability) & ~special_tokens_mask

        # 用 -100 替换 labels 中未被掩码的位置,因为损失只计算掩码 token 的损失
        labels = tf.where(masked_indices, inputs, -100)

        # 80% 的概率用 mask_token_id 替换掩码的输入 token
        indices_replaced = self.tf_bernoulli(input_shape, 0.8) & masked_indices
        inputs = tf.where(indices_replaced, mask_token_id, inputs)

        # 10% 的概率用随机单词替换掩码的输入 token
        indices_random = self.tf_bernoulli(input_shape, 0.1) & masked_indices & ~indices_replaced
        random_words = tf.random.uniform(input_shape, maxval=vocab_size, dtype=inputs.dtype)
        inputs = tf.where(indices_random, random_words, inputs)

        # 剩余 10% 的概率保持掩码的输入 token 不变
        return inputs, labels
    # 定义 TensorFlow 版本的调用方法,处理输入的样本列表并返回包含 input_ids 与 labels 的批次字典
    def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        import tensorflow as tf

        # 根据输入样本类型的不同进行处理:如果是字典,则使用自定义函数进行填充和转换为张量
        if isinstance(examples[0], Mapping):
            batch = pad_without_fast_tokenizer_warning(
                self.tokenizer, examples, return_tensors="tf", pad_to_multiple_of=self.pad_to_multiple_of
            )
        else:
            # 否则,将样本列表转换为包含 "input_ids" 键的字典,使用内置函数进行填充
            batch = {
                "input_ids": _tf_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
            }

        # 如果预处理了特殊 token 掩码,则从字典中移除该项
        special_tokens_mask = batch.pop("special_tokens_mask", None)
        
        # 如果采用 MLM(Masked Language Modeling),则进行相应处理
        if self.mlm:
            if special_tokens_mask is None:
                # 如果没有预处理特殊 token 掩码,根据输入的 input_ids 创建掩码
                special_tokens_mask = [
                    self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
                    for val in batch["input_ids"].numpy().tolist()
                ]
                # 将掩码转换为 TensorFlow 中的布尔类型张量
                special_tokens_mask = tf.cast(tf.convert_to_tensor(special_tokens_mask, dtype=tf.int64), tf.bool)
            else:
                # 否则,直接将已有的特殊 token 掩码转换为 TensorFlow 的布尔类型张量
                special_tokens_mask = tf.cast(special_tokens_mask, tf.bool)
            
            # 使用 TensorFlow 函数 tf_mask_tokens 处理 input_ids 和 labels,并更新 batch 字典
            batch["input_ids"], batch["labels"] = self.tf_mask_tokens(
                tf.cast(batch["input_ids"], tf.int64),
                special_tokens_mask=special_tokens_mask,
                mask_token_id=self.tokenizer.mask_token_id,
                vocab_size=len(self.tokenizer),
            )
        else:
            # 如果不是 MLM 模式,则直接将 input_ids 作为 labels,同时处理 padding 的情况
            labels = batch["input_ids"]
            if self.tokenizer.pad_token_id is not None:
                # 将 padding 的位置替换为 -100
                labels = tf.where(labels == self.tokenizer.pad_token_id, -100, labels)
            else:
                # 如果没有定义 pad_token_id,创建 labels 的深拷贝以防万一
                labels = tf.identity(labels)
            batch["labels"] = labels
        
        # 返回处理后的 batch 字典,其中包含处理过的 input_ids 和相应的 labels
        return batch
    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        # 处理输入数据 examples,可以是字典或列表,根据不同类型进行填充和转换为张量。
        if isinstance(examples[0], Mapping):
            # 对字典类型的 examples 进行填充,使用自定义的 pad_without_fast_tokenizer_warning 函数进行填充,并返回 PyTorch 张量。
            batch = pad_without_fast_tokenizer_warning(
                self.tokenizer, examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of
            )
        else:
            # 对列表类型的 examples 进行处理,仅填充 "input_ids" 键,调用 _torch_collate_batch 函数进行填充。
            batch = {
                "input_ids": _torch_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
            }

        # 如果特殊标记掩码已经预处理,则从字典中移除。
        special_tokens_mask = batch.pop("special_tokens_mask", None)
        if self.mlm:
            # 如果是 MLM(Masked Language Modeling)任务,调用 torch_mask_tokens 函数处理 input_ids 和 labels。
            batch["input_ids"], batch["labels"] = self.torch_mask_tokens(
                batch["input_ids"], special_tokens_mask=special_tokens_mask
            )
        else:
            # 如果不是 MLM 任务,将 input_ids 复制到 labels,并根据 tokenizer 的 pad_token_id 设置 labels 中相应位置的值为 -100。
            labels = batch["input_ids"].clone()
            if self.tokenizer.pad_token_id is not None:
                labels[labels == self.tokenizer.pad_token_id] = -100
            batch["labels"] = labels
        return batch

    def torch_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
        """
        准备用于掩码语言建模的输入/标签:80% MASK,10% 随机词,10% 原始词。
        """
        import torch

        labels = inputs.clone()
        # 对每个序列进行 MLM 训练时,以概率 self.mlm_probability 对输入进行掩码。
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        if special_tokens_mask is None:
            # 如果特殊标记掩码为空,则使用 tokenizer 获取每个序列的特殊标记掩码。
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()

        # 根据特殊标记掩码,将概率矩阵中的特定位置置为 0.0。
        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        # 使用伯努利分布生成掩码索引。
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # 只计算掩码位置的损失

        # 80% 的时间,用 tokenizer.mask_token ([MASK]) 替换掩码位置的输入标记。
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% 的时间,用随机词替换掩码位置的输入标记。
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # 剩余 10% 的时间,保持掩码位置的输入标记不变。
        return inputs, labels
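
    # 补充说明(非原始源码注释):这里的 0.5 与"10% 随机替换"并不矛盾——被选中掩码的 token 中,
    # 80% 先被替换为 [MASK];剩余 20% 再以 0.5 的概率替换为随机词,即 20% * 0.5 = 10%;
    # 最后 10% 保持原样但仍参与损失计算。按每个 token 计算,约有 0.15 * 0.8 = 12% 的概率变为 [MASK],
    # 0.15 * 0.1 = 1.5% 的概率变为随机词,1.5% 的概率保持不变。
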
    # 定义一个方法,用于处理包含各种数据结构的示例列表,并返回处理后的结果字典
    def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        # 如果第一个示例是字典类型,则使用适当的填充方式和转换为张量
        if isinstance(examples[0], Mapping):
            # 使用适当的填充方法(避免快速分词器警告),将示例列表转换为 NumPy 张量
            batch = pad_without_fast_tokenizer_warning(
                self.tokenizer, examples, return_tensors="np", pad_to_multiple_of=self.pad_to_multiple_of
            )
        else:
            # 如果示例不是字典类型,则只包含输入 ID 的批次
            batch = {
                "input_ids": _numpy_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
            }

        # 如果已预处理特殊标记掩码,则从字典中弹出
        special_tokens_mask = batch.pop("special_tokens_mask", None)
        # 如果是 MLM 任务,调用方法对输入 ID 进行掩码处理,并将结果存入 batch 中
        if self.mlm:
            batch["input_ids"], batch["labels"] = self.numpy_mask_tokens(
                batch["input_ids"], special_tokens_mask=special_tokens_mask
            )
        else:
            # 如果不是 MLM 任务,则创建 labels 副本,并将填充标记转换为 -100
            labels = np.copy(batch["input_ids"])
            if self.tokenizer.pad_token_id is not None:
                labels[labels == self.tokenizer.pad_token_id] = -100
            batch["labels"] = labels
        # 返回处理后的批次数据字典
        return batch
    def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        """
        # 创建输入的副本作为标签
        labels = np.copy(inputs)

        # 创建一个与输入形状相同的概率矩阵,每个位置的值为 self.mlm_probability
        probability_matrix = np.full(labels.shape, self.mlm_probability)

        # 如果特殊标记掩码为空,则根据每个序列的值获取特殊标记掩码
        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
            special_tokens_mask = np.array(special_tokens_mask, dtype=bool)
        else:
            special_tokens_mask = special_tokens_mask.astype(bool)

        # 将特殊标记掩码位置的概率设为 0,这些位置不会被选为被屏蔽的位置
        probability_matrix[special_tokens_mask] = 0

        # 使用二项分布随机生成屏蔽的索引
        masked_indices = np.random.binomial(1, probability_matrix, size=probability_matrix.shape).astype(bool)

        # 将未屏蔽的标签设为 -100,用于损失计算
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% 的概率,将屏蔽的输入标记替换为 tokenizer.mask_token_id
        indices_replaced = np.random.binomial(1, 0.8, size=labels.shape).astype(bool) & masked_indices
        inputs[indices_replaced] = self.tokenizer.mask_token_id

        # 10% 的概率,将屏蔽的输入标记替换为随机词
        indices_random = (
            np.random.binomial(1, 0.5, size=labels.shape).astype(bool) & masked_indices & ~indices_replaced
        )
        random_words = np.random.randint(
            low=0, high=len(self.tokenizer), size=np.count_nonzero(indices_random), dtype=np.int64
        )
        inputs[indices_random] = random_words

        # 剩余的 10% 的概率,保持屏蔽的输入标记不变

        # 返回处理后的输入和标签
        return inputs, labels
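
DataCollatorForLanguageModeling 的一个简要使用示例(非原始源码;检查点名称 "bert-base-uncased" 仅为演示假设):

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15, return_tensors="pt"
)

# 按上文 Tip 的建议,编码时带上 special_tokens_mask,可避免收集时再逐条计算
examples = [
    tokenizer("hello world", return_special_tokens_mask=True),
    tokenizer("data collators pad and mask batches", return_special_tokens_mask=True),
]
batch = collator(examples)
# batch["input_ids"] 中约 15% 的 token 按 80/10/10 规则处理,
# batch["labels"] 在未被掩码的位置为 -100
print(batch["input_ids"].shape, batch["labels"].shape)
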
@dataclass
class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
    """
    Data collator used for language modeling that masks entire words.

    - collates batches of tensors, honoring their tokenizer's pad_token
    - preprocesses batches for masked language modeling

    <Tip>

    This collator relies on details of the implementation of subword tokenization by [`BertTokenizer`], specifically
    that subword tokens are prefixed with *##*. For tokenizers that do not adhere to this scheme, this collator will
    produce an output that is roughly equivalent to [`.DataCollatorForLanguageModeling`].

    </Tip>
    """

    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        # Determine if examples are provided as a list of mappings or as a list of input_ids
        if isinstance(examples[0], Mapping):
            input_ids = [e["input_ids"] for e in examples]  # Extract input_ids from each example mapping
        else:
            input_ids = examples
            examples = [{"input_ids": e} for e in examples]  # Wrap raw input_ids in mappings so the loop below can read e["input_ids"]

        # Collate input_ids into a batch tensor respecting tokenizer's padding rules
        batch_input = _torch_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)

        mask_labels = []
        for e in examples:
            ref_tokens = []
            for id in tolist(e["input_ids"]):  # Convert input_ids to tokens using tokenizer
                token = self.tokenizer._convert_id_to_token(id)
                ref_tokens.append(token)

            # For Chinese tokens, mark sub-words with "##", e.g., [喜,欢]->[喜,##欢]
            if "chinese_ref" in e:
                ref_pos = tolist(e["chinese_ref"])  # Positions in input_ids that are sub-words
                len_seq = len(e["input_ids"])  # Length of the input sequence
                for i in range(len_seq):
                    if i in ref_pos:
                        ref_tokens[i] = "##" + ref_tokens[i]  # Prefix sub-word tokens with "##"

            mask_labels.append(self._whole_word_mask(ref_tokens))  # Apply whole word masking to tokens

        # Collate mask_labels into a batch tensor respecting tokenizer's padding rules
        batch_mask = _torch_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)

        # Mask input_ids and create labels for masked language modeling
        inputs, labels = self.torch_mask_tokens(batch_input, batch_mask)

        return {"input_ids": inputs, "labels": labels}
    # 定义 TensorFlow 版本的调用方法,接受一个例子列表,并返回处理后的输入和标签字典
    def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        # 导入 TensorFlow 库
        import tensorflow as tf

        # 检查第一个例子的类型,若为映射类型(字典),则提取其中的 "input_ids" 列表
        if isinstance(examples[0], Mapping):
            input_ids = [e["input_ids"] for e in examples]
        else:
            # 否则,假设每个例子本身就是一个 input_ids 列表,将其赋值给 input_ids,并用例子包装成带 "input_ids" 键的字典列表
            input_ids = examples
            examples = [{"input_ids": e} for e in examples]

        # 调用内部函数 _tf_collate_batch,将 input_ids 列表和 tokenizer 进行批处理
        batch_input = _tf_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)

        # 初始化一个空列表,用于存储每个例子的掩码标签
        mask_labels = []
        for e in examples:
            ref_tokens = []
            # 遍历每个例子中的 input_ids,将每个 id 转换为对应的 token
            for id in tolist(e["input_ids"]):
                token = self.tokenizer._convert_id_to_token(id)
                ref_tokens.append(token)

            # 对于中文 token,如果指定了 "chinese_ref" 键,需添加额外的标记 "##" 标识子词,例如 [喜,欢]-> [喜,##欢]
            if "chinese_ref" in e:
                ref_pos = tolist(e["chinese_ref"])
                len_seq = len(e["input_ids"])
                for i in range(len_seq):
                    if i in ref_pos:
                        ref_tokens[i] = "##" + ref_tokens[i]
            # 将处理后的 token 列表传入 _whole_word_mask 方法,得到该例子的掩码标签,添加到 mask_labels 列表中
            mask_labels.append(self._whole_word_mask(ref_tokens))

        # 再次调用 _tf_collate_batch,将 mask_labels 列表和 tokenizer 进行批处理,得到批量化的掩码标签
        batch_mask = _tf_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
        # 调用对象自身的 tf_mask_tokens 方法,传入批量化的输入和掩码标签,得到 inputs 和 labels,返回作为字典的 "input_ids" 和 "labels"
        inputs, labels = self.tf_mask_tokens(tf.cast(batch_input, tf.int64), batch_mask)
        return {"input_ids": inputs, "labels": labels}

    # 定义 NumPy 版本的调用方法,接受一个例子列表,并返回处理后的输入和标签字典
    def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        # 检查第一个例子的类型,若为映射类型(字典),则提取其中的 "input_ids" 列表
        if isinstance(examples[0], Mapping):
            input_ids = [e["input_ids"] for e in examples]
        else:
            # 否则,假设每个例子本身就是一个 input_ids 列表,将其赋值给 input_ids,并用例子包装成带 "input_ids" 键的字典列表
            input_ids = examples
            examples = [{"input_ids": e} for e in examples]

        # 调用内部函数 _numpy_collate_batch,将 input_ids 列表和 tokenizer 进行批处理
        batch_input = _numpy_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)

        # 初始化一个空列表,用于存储每个例子的掩码标签
        mask_labels = []
        for e in examples:
            ref_tokens = []
            # 遍历每个例子中的 input_ids,将每个 id 转换为对应的 token
            for id in tolist(e["input_ids"]):
                token = self.tokenizer._convert_id_to_token(id)
                ref_tokens.append(token)

            # 对于中文 token,如果指定了 "chinese_ref" 键,需添加额外的标记 "##" 标识子词,例如 [喜,欢]-> [喜,##欢]
            if "chinese_ref" in e:
                ref_pos = tolist(e["chinese_ref"])
                len_seq = len(e["input_ids"])
                for i in range(len_seq):
                    if i in ref_pos:
                        ref_tokens[i] = "##" + ref_tokens[i]
            # 将处理后的 token 列表传入 _whole_word_mask 方法,得到该例子的掩码标签,添加到 mask_labels 列表中
            mask_labels.append(self._whole_word_mask(ref_tokens))

        # 再次调用 _numpy_collate_batch,将 mask_labels 列表和 tokenizer 进行批处理,得到批量化的掩码标签
        batch_mask = _numpy_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of)
        # 调用对象自身的 numpy_mask_tokens 方法,传入批量化的输入和掩码标签,得到 inputs 和 labels,返回作为字典的 "input_ids" 和 "labels"
        inputs, labels = self.numpy_mask_tokens(batch_input, batch_mask)
        return {"input_ids": inputs, "labels": labels}
    def _whole_word_mask(self, input_tokens: List[str], max_predictions=512):
        """
        Get 0/1 labels for masked tokens with whole word mask proxy
        """
        # 如果当前的分词器不是BertTokenizer或BertTokenizerFast,则发出警告
        if not isinstance(self.tokenizer, (BertTokenizer, BertTokenizerFast)):
            warnings.warn(
                "DataCollatorForWholeWordMask is only suitable for BertTokenizer-like tokenizers. "
                "Please refer to the documentation for more information."
            )

        # 初始化候选索引列表
        cand_indexes = []
        # 遍历输入的token列表
        for i, token in enumerate(input_tokens):
            # 跳过特殊token,如"[CLS]"和"[SEP]"
            if token == "[CLS]" or token == "[SEP]":
                continue

            # 如果当前候选索引列表不为空且当前token是一个以"##"开头的部分token,则将当前token加入最后一个候选索引的列表中
            if len(cand_indexes) >= 1 and token.startswith("##"):
                cand_indexes[-1].append(i)
            else:
                # 否则,创建一个新的候选索引列表并加入当前token的索引
                cand_indexes.append([i])

        # 随机打乱候选索引列表
        random.shuffle(cand_indexes)
        # 计算应该预测的masked token数量,取最小值为max_predictions和输入token数量乘以mlm_probability的整数部分
        num_to_predict = min(max_predictions, max(1, int(round(len(input_tokens) * self.mlm_probability))))
        # 初始化masked tokens列表
        masked_lms = []
        # 初始化已覆盖索引的集合
        covered_indexes = set()
        # 遍历候选索引列表
        for index_set in cand_indexes:
            # 如果已经预测的masked token数量达到了num_to_predict,则退出循环
            if len(masked_lms) >= num_to_predict:
                break
            # 如果当前候选索引集合加上已预测的masked token数量超过了num_to_predict,则跳过该候选集合
            if len(masked_lms) + len(index_set) > num_to_predict:
                continue
            # 检查当前候选索引集合中是否有已覆盖的索引
            is_any_index_covered = False
            for index in index_set:
                if index in covered_indexes:
                    is_any_index_covered = True
                    break
            # 如果有任何已覆盖的索引,则跳过该候选索引集合
            if is_any_index_covered:
                continue
            # 否则,将候选索引集合中的每个索引加入已覆盖索引集合,并将其加入masked tokens列表
            for index in index_set:
                covered_indexes.add(index)
                masked_lms.append(index)

        # 如果已覆盖索引的数量不等于masked tokens列表的长度,则抛出异常
        if len(covered_indexes) != len(masked_lms):
            raise ValueError("Length of covered_indexes is not equal to length of masked_lms.")
        # 根据已覆盖的索引集合生成mask标签列表,即标记哪些token是masked的
        mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))]
        # 返回mask标签列表作为结果
        return mask_labels
    def torch_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
        'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
        """
        import torch

        # 检查当前的分词器是否有掩码标记,这是进行掩码语言建模所必需的
        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
                " --mlm flag if you want to use this tokenizer."
            )
        # 复制输入以保留原始标签
        labels = inputs.clone()

        # 我们在每个序列中随机抽样几个标记,用于掩码语言建模训练(概率默认为0.15,适用于Bert/RoBERTa)
        probability_matrix = mask_labels

        # 获取特殊标记的掩码,用于排除掉特殊标记的影响
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)

        # 如果存在填充标记,将其添加到掩码中
        if self.tokenizer._pad_token is not None:
            padding_mask = labels.eq(self.tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)

        # 确定要掩码的索引
        masked_indices = probability_matrix.bool()
        labels[~masked_indices] = -100  # 只计算掩码标记上的损失

        # 80%的时间,将掩码输入标记替换为tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10%的时间,将掩码输入标记替换为随机单词
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # 剩余的时间(10%),保持掩码输入标记不变
        return inputs, labels
    def tf_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
        'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
        """
        import tensorflow as tf  # 导入 TensorFlow 库

        input_shape = tf.shape(inputs)  # 获取输入张量的形状
        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
                " --mlm flag if you want to use this tokenizer."
            )
        labels = tf.identity(inputs)  # 创建输入张量的副本作为标签

        # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)

        masked_indices = tf.cast(mask_labels, tf.bool)  # 将掩码标签转换为布尔类型张量

        # Exclude special tokens from masking
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels
        ]  # 获取特殊标记的掩码,排除已有的特殊标记
        masked_indices = masked_indices & ~tf.cast(special_tokens_mask, dtype=tf.bool)  # 更新掩码索引,排除特殊标记

        if self.tokenizer._pad_token is not None:
            padding_mask = inputs == self.tokenizer.pad_token_id  # 获取填充标记的掩码
            masked_indices = masked_indices & ~padding_mask  # 更新掩码索引,排除填充标记

        # Replace unmasked indices with -100 in the labels since we only compute loss on masked tokens
        labels = tf.where(masked_indices, inputs, -100)  # 根据掩码索引,将未掩码的位置在标签中替换为-100,仅计算掩码位置的损失

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = self.tf_bernoulli(input_shape, 0.8) & masked_indices  # 使用伯努利采样确定掩码位置,80%的时间用[MASK]标记替换掩码输入
        inputs = tf.where(indices_replaced, self.tokenizer.mask_token_id, inputs)

        # 10% of the time, we replace masked input tokens with random word
        indices_random = self.tf_bernoulli(input_shape, 0.5) & masked_indices & ~indices_replaced  # 使用伯努利采样确定掩码位置,10%的时间用随机词替换掩码输入
        random_words = tf.random.uniform(input_shape, maxval=len(self.tokenizer), dtype=tf.int64)  # 生成随机词
        inputs = tf.where(indices_random, random_words, inputs)

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels  # 返回处理后的输入和标签
    def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set
        'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref.
        """
        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
                " --mlm flag if you want to use this tokenizer."
            )
        labels = np.copy(inputs)
        # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)

        # Convert mask_labels to boolean array indicating which tokens to mask
        masked_indices = mask_labels.astype(bool)

        # Mask special tokens so they are not selected for masking
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        masked_indices[np.array(special_tokens_mask, dtype=bool)] = 0
        
        # If there is a padding token, mask it so it's not selected for masking
        if self.tokenizer._pad_token is not None:
            padding_mask = labels == self.tokenizer.pad_token_id
            masked_indices[padding_mask] = 0

        # Set labels of unmasked tokens to -100 to compute loss only on masked tokens
        labels[~masked_indices] = -100

        # 80% of the time, replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = np.random.binomial(1, 0.8, size=labels.shape).astype(bool) & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, replace masked input tokens with random words
        indices_random = (
            np.random.binomial(1, 0.5, size=labels.shape).astype(bool) & masked_indices & ~indices_replaced
        )
        random_words = np.random.randint(low=0, high=len(self.tokenizer), size=labels.shape, dtype=np.int64)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time), keep the masked input tokens unchanged
        return inputs, labels
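
DataCollatorForWholeWordMask 的一个简要使用示例(非原始源码;检查点名称仅为演示假设)。注意如上文 torch_call 所示,它只返回 input_ids 和 labels 两个键:

from transformers import BertTokenizerFast, DataCollatorForWholeWordMask

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="pt")

examples = [tokenizer("an unbelievable story"), tokenizer("whole word masking example")]
batch = collator(examples)
# 以 "##" 开头的子词会与其前面的词作为同一个整词一起被掩码或保留
print(batch.keys())  # dict_keys(['input_ids', 'labels'])
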
@dataclass
class DataCollatorForSOP(DataCollatorForLanguageModeling):
    """
    Data collator used for sentence order prediction task.

    - collates batches of tensors, honoring their tokenizer's pad_token
    - preprocesses batches for both masked language modeling and sentence order prediction
    """

    def __init__(self, *args, **kwargs):
        # 发出警告信息,提示该类即将被弃用,并建议使用DataCollatorForLanguageModeling代替
        warnings.warn(
            "DataCollatorForSOP is deprecated and will be removed in a future version, you can now use "
            "DataCollatorForLanguageModeling instead.",
            FutureWarning,
        )

    def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, Any]:
        import torch
        from torch.nn.utils.rnn import pad_sequence

        # 从每个示例中提取input_ids列表
        input_ids = [example["input_ids"] for example in examples]
        # 调用内部方法进行批量处理和填充
        input_ids = _torch_collate_batch(input_ids, self.tokenizer)
        # 对input_ids进行遮蔽处理,生成labels和attention_mask
        input_ids, labels, attention_mask = self.mask_tokens(input_ids)

        # 从每个示例中提取token_type_ids列表
        token_type_ids = [example["token_type_ids"] for example in examples]
        # 使用pad_sequence函数对token_type_ids进行填充,保证每个批次的长度一致
        token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id)

        # 从每个示例中提取sentence_order_label列表,并转换成tensor
        sop_label_list = [example["sentence_order_label"] for example in examples]
        sentence_order_label = torch.stack(sop_label_list)

        # 返回包含处理后数据的字典
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
            "sentence_order_label": sentence_order_label,
        }
    def mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any]:
        """
        Prepare masked tokens inputs/labels/attention_mask for masked language modeling: 80% MASK, 10% random, 10%
        original. N-gram not applied yet.
        """
        import torch  # 导入PyTorch库,用于张量操作

        if self.tokenizer.mask_token is None:
            raise ValueError(
                "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the"
                " --mlm flag if you want to use this tokenizer."
            )

        labels = inputs.clone()  # 复制输入作为标签

        # 构建一个概率矩阵,决定哪些位置进行掩码处理,默认使用的概率为self.mlm_probability(通常为0.15)
        probability_matrix = torch.full(labels.shape, self.mlm_probability)

        # 获取输入序列中的特殊标记(如起始标记、结束标记等),并在概率矩阵中将这些位置的概率设为0
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)

        # 如果存在填充标记,则在概率矩阵中将填充标记位置的概率设为0
        if self.tokenizer._pad_token is not None:
            padding_mask = labels.eq(self.tokenizer.pad_token_id)
            probability_matrix.masked_fill_(padding_mask, value=0.0)

        # 使用伯努利分布生成一个掩码的布尔张量
        masked_indices = torch.bernoulli(probability_matrix).bool()

        # 根据模型的需求,调整注意力掩码的值(有些模型中,0表示被掩码)
        attention_mask = (~masked_indices).float()

        # 如果存在填充标记,则在注意力掩码中将填充标记位置的值设为1.0
        if self.tokenizer._pad_token is not None:
            attention_padding_mask = labels.eq(self.tokenizer.pad_token_id)
            attention_mask.masked_fill_(attention_padding_mask, value=1.0)

        # 将非掩码的位置的标签值设为-100,用于计算交叉熵损失时忽略这些位置
        labels[~masked_indices] = -100

        # 80%的情况下,将掩码的输入标记替换为特定的掩码标记(如[MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10%的情况下,将掩码的输入标记替换为随机的单词
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # 剩余10%的情况下,保持掩码的输入标记不变

        # 返回处理后的输入、标签和注意力掩码
        return inputs, labels, attention_mask
# 使用 dataclass 装饰器定义一个数据类 DataCollatorForPermutationLanguageModeling,
# 用于处理排列语言建模的数据。
@dataclass
class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
    """
    Data collator used for permutation language modeling.

    - collates batches of tensors, honoring their tokenizer's pad_token
    - preprocesses batches for permutation language modeling with procedures specific to XLNet
    """

    # 初始化函数参数:tokenizer 表示预训练的分词器,plm_probability 表示置换语言建模的概率,默认为 1/6,
    # max_span_length 表示最大掩码标记序列的长度,默认为 5,
    # return_tensors 表示返回的张量类型,默认为 "pt"(PyTorch 张量)。
    tokenizer: PreTrainedTokenizerBase
    plm_probability: float = 1 / 6
    max_span_length: int = 5  # maximum length of a span of masked tokens
    return_tensors: str = "pt"

    # 定义 torch_call 方法,接收一个例子列表 examples,
    # 如果 examples 中的第一个元素是字典类型,则提取它们的 "input_ids" 字段作为例子列表的新内容。
    # 然后使用 _torch_collate_batch 函数对 examples 进行批量处理,结合 tokenizer 进行处理。
    # 最后调用 torch_mask_tokens 方法生成输入、掩码、目标映射和标签,并以字典形式返回结果。
    def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        if isinstance(examples[0], Mapping):
            examples = [e["input_ids"] for e in examples]
        batch = _torch_collate_batch(examples, self.tokenizer)
        inputs, perm_mask, target_mapping, labels = self.torch_mask_tokens(batch)
        return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}

    # 定义 tf_call 方法,功能与 torch_call 方法类似,不同之处在于使用 _tf_collate_batch 函数处理 examples。
    def tf_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        if isinstance(examples[0], Mapping):
            examples = [e["input_ids"] for e in examples]
        batch = _tf_collate_batch(examples, self.tokenizer)
        inputs, perm_mask, target_mapping, labels = self.tf_mask_tokens(batch)
        return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}

    # 定义 numpy_call 方法,功能与前两者相似,使用 _numpy_collate_batch 处理 examples,
    # 并调用 numpy_mask_tokens 方法生成相应的输入、掩码、目标映射和标签,以字典形式返回结果。
    def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
        if isinstance(examples[0], Mapping):
            examples = [e["input_ids"] for e in examples]
        batch = _numpy_collate_batch(examples, self.tokenizer)
        inputs, perm_mask, target_mapping, labels = self.numpy_mask_tokens(batch)
        return {"input_ids": inputs, "perm_mask": perm_mask, "target_mapping": target_mapping, "labels": labels}

.\data\metrics\squad_metrics.py

# 导入必要的模块和库
import collections  # 导入collections模块,用于处理数据集合
import json  # 导入json模块,用于处理JSON格式数据
import math  # 导入math模块,提供数学运算函数
import re  # 导入re模块,提供正则表达式操作
import string  # 导入string模块,提供字符串处理功能

from ...models.bert import BasicTokenizer  # 从bert模型中导入BasicTokenizer类
from ...utils import logging  # 从工具模块中导入logging模块

# 获取logger对象用于记录日志
logger = logging.get_logger(__name__)


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        # 定义函数移除文本中的冠词(a, an, the)
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        # 移除文本中多余的空白符
        return " ".join(text.split())

    def remove_punc(text):
        # 移除文本中的标点符号
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        # 将文本转换为小写
        return text.lower()

    # 对输入文本s进行规范化处理,依次去除冠词、修复空白符、移除标点、转换为小写
    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s):
    # 如果输入文本s为空,则返回空列表
    if not s:
        return []
    # 对规范化后的文本s进行分词处理,并返回分词结果列表
    return normalize_answer(s).split()


def compute_exact(a_gold, a_pred):
    # 计算精确匹配得分,如果规范化后的答案a_gold与a_pred相同则返回1,否则返回0
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))


def compute_f1(a_gold, a_pred):
    # 计算F1得分
    gold_toks = get_tokens(a_gold)  # 获取标准答案的分词列表
    pred_toks = get_tokens(a_pred)  # 获取预测答案的分词列表
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)  # 计算分词列表的交集
    num_same = sum(common.values())  # 计算交集中元素的总数

    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # 如果标准答案或预测答案的分词列表为空,则如果它们相等返回1,否则返回0
        return int(gold_toks == pred_toks)

    if num_same == 0:
        # 如果交集中没有相同的分词,则返回0
        return 0

    precision = 1.0 * num_same / len(pred_toks)  # 计算精确率
    recall = 1.0 * num_same / len(gold_toks)  # 计算召回率
    f1 = (2 * precision * recall) / (precision + recall)  # 计算F1得分
    return f1
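
用一个简单的例子说明上述归一化与 F1 的计算(数值为手算结果,仅作示意):

gold = "The cat sat on the mat"   # 归一化后为 "cat sat on mat",共 4 个词
pred = "a cat on a mat"           # 归一化后为 "cat on mat",共 3 个词

print(compute_exact(gold, pred))  # 0,归一化后两者不完全相同
# 交集为 {cat, on, mat},precision = 3/3,recall = 3/4,
# F1 = 2 * 1.0 * 0.75 / (1.0 + 0.75) ≈ 0.857
print(round(compute_f1(gold, pred), 3))  # 0.857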


def get_raw_scores(examples, preds):
    """
    Computes the exact and f1 scores from the examples and the model predictions
    """
    exact_scores = {}  # 初始化精确匹配得分字典
    f1_scores = {}  # 初始化F1得分字典

    # 遍历所有示例,逐个计算精确匹配分数和 F1 分数
    for example in examples:
        # 获取该问题的唯一标识符
        qas_id = example.qas_id
        # 获取示例中所有答案的文本,仅保留标准化后不为空的答案
        gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])]

        # 如果没有标准化后不为空的答案,则对于不可回答的问题,正确答案设置为空字符串
        if not gold_answers:
            gold_answers = [""]

        # 如果预测结果中没有当前问题的预测值,则输出一条缺失预测的提示信息并跳过当前问题
        if qas_id not in preds:
            print(f"Missing prediction for {qas_id}")
            continue

        # 获取当前问题的预测值
        prediction = preds[qas_id]
        # 计算所有标准答案与预测值之间的最大精确匹配分数
        exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers)
        # 计算所有标准答案与预测值之间的最大 F1 分数
        f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers)

    # 返回所有问题的精确匹配分数和 F1 分数
    return exact_scores, f1_scores
# 根据预测分数、不可回答概率、问题ID到是否有答案的映射以及阈值,应用无答案阈值,并返回新的分数字典
def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
    new_scores = {}
    # 遍历每个问题ID和对应的分数
    for qid, s in scores.items():
        # 预测该问题是否为无答案,根据不可回答概率和设定的阈值
        pred_na = na_probs[qid] > na_prob_thresh
        if pred_na:
            # 如果预测为无答案,将该问题的分数设为0或1(取决于是否有答案)
            new_scores[qid] = float(not qid_to_has_ans[qid])
        else:
            # 如果预测为有答案,则保持原始分数
            new_scores[qid] = s
    return new_scores


# 根据精确匹配分数和F1分数以及指定的问题ID列表(如果没有提供,则使用全部问题ID),生成评估结果字典
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
    if not qid_list:
        total = len(exact_scores)
        return collections.OrderedDict(
            [
                ("exact", 100.0 * sum(exact_scores.values()) / total),
                ("f1", 100.0 * sum(f1_scores.values()) / total),
                ("total", total),
            ]
        )
    else:
        total = len(qid_list)
        return collections.OrderedDict(
            [
                ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total),
                ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total),
                ("total", total),
            ]
        )


# 将新的评估结果合并到主要评估结果字典中,使用指定的前缀
def merge_eval(main_eval, new_eval, prefix):
    for k in new_eval:
        main_eval[f"{prefix}_{k}"] = new_eval[k]


# 找到最佳阈值的版本2,根据预测、分数、不可回答概率和问题ID到是否有答案的映射来确定
def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
    cur_score = num_no_ans
    best_score = cur_score
    best_thresh = 0.0
    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
    # 遍历排序后的问题ID列表,计算当前分数和最佳分数以及最佳阈值
    for i, qid in enumerate(qid_list):
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            diff = scores[qid]
        else:
            if preds[qid]:
                diff = -1
            else:
                diff = 0
        cur_score += diff
        if cur_score > best_score:
            best_score = cur_score
            best_thresh = na_probs[qid]

    has_ans_score, has_ans_cnt = 0, 0
    # 统计有答案的问题的分数总和和数量
    for qid in qid_list:
        if not qid_to_has_ans[qid]:
            continue
        has_ans_cnt += 1

        if qid not in scores:
            continue
        has_ans_score += scores[qid]

    return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt


# 找到所有版本2的最佳阈值,将精确匹配和F1分数的最佳结果合并到主要评估结果中
def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
    best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
    main_eval["best_exact"] = best_exact
    main_eval["best_exact_thresh"] = exact_thresh
    main_eval["best_f1"] = best_f1
    main_eval["best_f1_thresh"] = f1_thresh
    main_eval["has_ans_exact"] = has_ans_exact
    main_eval["has_ans_f1"] = has_ans_f1


# 找到最佳阈值,根据预测、分数、不可回答概率和问题ID到是否有答案的映射来确定
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
    cur_score = num_no_ans
    best_score = cur_score
    best_thresh = 0.0
    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
    # 使用 enumerate 函数遍历 qid_list 中的元素,索引不关心
    for _, qid in enumerate(qid_list):
        # 如果 qid 不在 scores 字典中,跳过当前循环,继续下一个 qid
        if qid not in scores:
            continue
        # 如果 qid 对应的问题有答案(True),取出其对应的分数
        if qid_to_has_ans[qid]:
            diff = scores[qid]
        else:
            # 如果 qid 对应的问题没有答案:
            # 如果 preds[qid] 为真,则设置 diff 为 -1
            if preds[qid]:
                diff = -1
            # 否则,设置 diff 为 0
            else:
                diff = 0
        # 将 diff 加到当前得分 cur_score 上
        cur_score += diff
        # 如果当前得分 cur_score 大于最佳得分 best_score
        if cur_score > best_score:
            # 更新最佳得分为当前得分
            best_score = cur_score
            # 更新最佳阈值为 na_probs[qid]
            best_thresh = na_probs[qid]
    # 计算最终得分比例并返回,乘以 100.0 以得到百分比
    return 100.0 * best_score / len(scores), best_thresh
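
用一个很小的例子说明该阈值搜索的行为(数值为手算结果,仅作示意):

preds = {"q1": "paris", "q2": ""}        # q2 被预测为无答案(空字符串)
exact_raw = {"q1": 1, "q2": 1}           # 原始精确匹配分数
na_probs = {"q1": 0.1, "q2": 0.9}        # 各问题的"无答案"概率
qid_to_has_ans = {"q1": True, "q2": False}

# 初始分数为无答案问题数 1;扫描到 q1 时加上其得分 1,总分 2 为最优,对应阈值 na_probs["q1"] = 0.1
print(find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans))  # (100.0, 0.1)
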
def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    # 调用 find_best_thresh 函数获取最佳的 exact 和 exact 阈值
    best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
    # 调用 find_best_thresh 函数获取最佳的 f1 和 f1 阈值
    best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)

    # 将计算得到的最佳 exact 和 exact 阈值存储到 main_eval 字典中
    main_eval["best_exact"] = best_exact
    main_eval["best_exact_thresh"] = exact_thresh
    # 将计算得到的最佳 f1 和 f1 阈值存储到 main_eval 字典中
    main_eval["best_f1"] = best_f1
    main_eval["best_f1_thresh"] = f1_thresh


def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
    # 创建一个字典,记录每个示例的 qas_id 是否有答案
    qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples}
    # 获取有答案的 qas_id 列表和没有答案的 qas_id 列表
    has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer]
    no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer]

    # 如果没有提供 no_answer_probs,则初始化为所有预测结果的概率为 0.0
    if no_answer_probs is None:
        no_answer_probs = {k: 0.0 for k in preds}

    # 计算 exact 和 f1 得分
    exact, f1 = get_raw_scores(examples, preds)

    # 应用 no_answer_probability_threshold 进行 exact 和 f1 阈值处理
    exact_threshold = apply_no_ans_threshold(
        exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold
    )
    f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)

    # 根据处理后的 exact_threshold 和 f1_threshold 创建评估字典
    evaluation = make_eval_dict(exact_threshold, f1_threshold)

    # 如果存在有答案的 qas_id,则对有答案的部分进行评估
    if has_answer_qids:
        has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids)
        merge_eval(evaluation, has_ans_eval, "HasAns")

    # 如果存在没有答案的 qas_id,则对没有答案的部分进行评估
    if no_answer_qids:
        no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids)
        merge_eval(evaluation, no_ans_eval, "NoAns")

    # 查找所有最佳的阈值,并更新到 evaluation 字典中
    if no_answer_probs:
        find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer)

    # 返回最终的评估结果字典
    return evaluation
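
squad_evaluate 要求 examples 中的对象提供 qas_id 与 answers 两个属性,下面用 SimpleNamespace 构造一个最小示例(非官方用法,仅作演示):

from types import SimpleNamespace

examples = [
    SimpleNamespace(qas_id="q1", answers=[{"text": "Paris"}]),  # 有答案的问题
    SimpleNamespace(qas_id="q2", answers=[]),                   # 无答案的问题(SQuAD v2 风格)
]
preds = {"q1": "paris", "q2": ""}

results = squad_evaluate(examples, preds)
print(results["exact"], results["f1"], results["HasAns_exact"], results["NoAns_exact"])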


def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""

    # 在创建数据时,我们记录了原始(按空格分词的)tokens和我们的WordPiece分词tokens之间的对齐。
    # 现在,`orig_text`包含了我们预测的原始文本对应的原始文本段。
    #
    # 但是,`orig_text`可能包含我们不想要的额外字符。
    #
    # 例如,假设:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # 我们不希望返回`orig_text`,因为它包含额外的"'s"。
    #
    # 我们也不希望返回`pred_text`,因为它已经被标准化了
    # (SQuAD评估脚本也会去除标点符号/小写化,但我们的分词器会进行额外的标准化,比如去除重音字符)。
    #
    # 我们真正想返回的是"Steve Smith"。
    #
    # 因此,我们必须应用一种半复杂的对齐启发式方法,使`pred_text`和`orig_text`之间的字符对齐。
    # 在某些情况下可能会失败,此时我们只返回 `orig_text`。

    def _strip_spaces(text):
        # 初始化一个空列表,用于存储非空格字符
        ns_chars = []
        # 使用有序字典记录非空格字符在原始文本中的索引映射关系
        ns_to_s_map = collections.OrderedDict()
        # 遍历原始文本的字符和索引
        for i, c in enumerate(text):
            # 如果字符是空格,则跳过
            if c == " ":
                continue
            # 记录非空格字符在新文本中的索引,原始索引为 i
            ns_to_s_map[len(ns_chars)] = i
            # 将非空格字符添加到列表中
            ns_chars.append(c)
        # 构建新的没有空格的文本
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # 首先对 `orig_text` 和 `pred_text` 进行分词,去除空格,并检查它们是否长度相同。
    # 如果它们长度不相同,则启发式方法失败。如果它们长度相同,则假定字符是一对一对齐的。
    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)

    # 对 `orig_text` 进行分词
    tok_text = " ".join(tokenizer.tokenize(orig_text))

    # 查找 `pred_text` 在 `tok_text` 中的起始位置
    start_position = tok_text.find(pred_text)
    if start_position == -1:
        # 如果找不到 `pred_text`,且启用了详细日志记录,则记录日志并返回原始文本
        if verbose_logging:
            logger.info(f"Unable to find text: '{pred_text}' in '{orig_text}'")
        return orig_text
    # 计算 `pred_text` 在 `tok_text` 中的结束位置
    end_position = start_position + len(pred_text) - 1

    # 去除 `orig_text` 和 `tok_text` 中的空格,获取新的文本及其字符映射关系
    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)

    # 如果去除空格后 `orig_text` 和 `tok_text` 长度不相等,则记录日志并返回原始文本
    if len(orig_ns_text) != len(tok_ns_text):
        if verbose_logging:
            logger.info(f"Length not equal after stripping spaces: '{orig_ns_text}' vs '{tok_ns_text}'")
        return orig_text

    # 使用字符对齐映射将 `pred_text` 的字符映射回 `orig_text`
    tok_s_to_ns_map = {}
    for i, tok_index in tok_ns_to_s_map.items():
        tok_s_to_ns_map[tok_index] = i

    orig_start_position = None
    # 如果起始位置在映射表中,则获取原始文本中的起始位置
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]

    # 如果无法映射起始位置,则记录日志并返回原始文本
    if orig_start_position is None:
        if verbose_logging:
            logger.info("Couldn't map start position")
        return orig_text

    orig_end_position = None
    # 如果结束位置在映射表中,则获取原始文本中的结束位置
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]

    # 如果无法映射结束位置,则记录日志并返回原始文本
    if orig_end_position is None:
        if verbose_logging:
            logger.info("Couldn't map end position")
        return orig_text

    # 根据映射的起始和结束位置从 `orig_text` 中提取输出文本
    output_text = orig_text[orig_start_position : (orig_end_position + 1)]
    return output_text
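
对应上面注释中的例子,可以直接验证这一启发式对齐的效果:

print(get_final_text("steve smith", "Steve Smith's", do_lower_case=True))  # 输出 "Steve Smith"
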
# 从给定的logits列表中获取前n_best_size个最高的索引
def _get_best_indexes(logits, n_best_size):
    # 对(索引, 分数)对进行排序,按照分数降序排列
    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)

    best_indexes = []
    for i in range(len(index_and_score)):
        if i >= n_best_size:
            break
        best_indexes.append(index_and_score[i][0])
    return best_indexes


# 计算原始logits的softmax概率
def _compute_softmax(scores):
    # 如果scores为空,则返回空列表
    if not scores:
        return []

    max_score = None
    # 找出scores中的最大值
    for score in scores:
        if max_score is None or score > max_score:
            max_score = score

    exp_scores = []
    total_sum = 0.0
    # 计算softmax的分子部分(exp(score - max_score))
    for score in scores:
        x = math.exp(score - max_score)
        exp_scores.append(x)
        total_sum += x

    probs = []
    # 计算softmax概率值
    for score in exp_scores:
        probs.append(score / total_sum)
    return probs


# 写最终预测结果到JSON文件,并在需要时记录空结果的log-odds
def compute_predictions_logits(
    all_examples,
    all_features,
    all_results,
    n_best_size,
    max_answer_length,
    do_lower_case,
    output_prediction_file,
    output_nbest_file,
    output_null_log_odds_file,
    verbose_logging,
    version_2_with_negative,
    null_score_diff_threshold,
    tokenizer,
):
    # 如果需要,记录预测结果到output_prediction_file
    if output_prediction_file:
        logger.info(f"Writing predictions to: {output_prediction_file}")
    # 如果需要,记录nbest结果到output_nbest_file
    if output_nbest_file:
        logger.info(f"Writing nbest to: {output_nbest_file}")
    # 如果version_2_with_negative为True且需要,记录null_log_odds到output_null_log_odds_file
    if output_null_log_odds_file and version_2_with_negative:
        logger.info(f"Writing null_log_odds to: {output_null_log_odds_file}")

    # 根据example_index将all_features分组
    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    # 将all_results转换为unique_id到result的映射
    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    # 定义用于存储预测结果的命名元组类型_PrelimPrediction
    _PrelimPrediction = collections.namedtuple(
        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]
    )

    # 用于存储所有预测结果的有序字典
    all_predictions = collections.OrderedDict()
    # 用于存储所有nbest结果的有序字典
    all_nbest_json = collections.OrderedDict()
    # 用于存储scores_diff的JSON结果的有序字典
    scores_diff_json = collections.OrderedDict()

    # 如果需要,将all_predictions写入到output_prediction_file中
    if output_prediction_file:
        with open(output_prediction_file, "w") as writer:
            writer.write(json.dumps(all_predictions, indent=4) + "\n")

    # 如果需要,将all_nbest_json写入到output_nbest_file中
    if output_nbest_file:
        with open(output_nbest_file, "w") as writer:
            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

    # 如果version_2_with_negative为True且需要,将scores_diff_json写入到output_null_log_odds_file中
    if output_null_log_odds_file and version_2_with_negative:
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

    # 返回所有预测结果的有序字典
    return all_predictions


# 基于模型输出的起止位置对数概率(XLNet 风格)计算并写出最终预测结果
def compute_predictions_log_probs(
    all_examples,
    all_features,
    all_results,
    n_best_size,
    max_answer_length,
    output_prediction_file,
    output_nbest_file,
    output_null_log_odds_file,
    start_n_top,
    end_n_top,  # XLNet 风格输出中,每个起始位置保留的结束位置 top-k 数量
    version_2_with_negative,  # 是否为包含不可回答问题的 SQuAD v2 格式
    tokenizer,  # 用于把预测的 token 片段映射回原文的分词器
    verbose_logging,  # 是否输出详细日志信息
):
    # 注意:原文此处省略了遍历所有示例、由 top-k 起止对数概率生成候选答案的主循环,只保留了输入输出的骨架

    # 定义一个命名元组 `_PrelimPrediction`,用于表示预测结果的初步信息,包括特征索引、起始位置索引、结束位置索引、起始对数概率和结束对数概率
    _PrelimPrediction = collections.namedtuple(
        "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"]
    )

    # 定义一个命名元组 `_NbestPrediction`,用于表示最终的预测结果,包括文本、起始对数概率和结束对数概率
    _NbestPrediction = collections.namedtuple(
        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]
    )

    # 记录器输出信息,指示将预测结果写入指定的 JSON 文件
    logger.info(f"Writing predictions to: {output_prediction_file}")

    # 创建一个 defaultdict,用于按照示例索引将特征对象分组存储
    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    # 创建一个字典,将每个结果对象按照其唯一标识存储
    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    # 创建有序字典,分别用于存储所有的预测结果、nbest 结果以及空答案分数差
    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()

    # 将所有预测结果写入指定的 JSON 文件中,格式化输出并换行
    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")

    # 将所有最佳预测结果写入指定的 JSON 文件中,格式化输出并换行
    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")

    # 如果设置了包含不可回答问题的版本标志,将分数差异信息写入指定的 JSON 文件中,格式化输出并换行
    if version_2_with_negative:
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

    # 返回存储所有预测结果的有序字典
    return all_predictions

.\data\metrics\__init__.py

# 导入警告模块,用于发出关于未来版本不推荐使用的警告信息
import warnings

# 从工具包中导入检查函数和后端依赖的函数
from ...utils import is_sklearn_available, requires_backends

# 如果检测到 sklearn 可用,则导入相关的指标函数
if is_sklearn_available():
    from scipy.stats import pearsonr, spearmanr
    from sklearn.metrics import f1_score, matthews_corrcoef

# 警告信息,指出当前指标将在未来版本中移除,推荐使用 Evaluate 库处理指标
DEPRECATION_WARNING = (
    "This metric will be removed from the library soon, metrics should be handled with the 🤗 Evaluate "
    "library. You can have a look at this example script for pointers: "
    "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
)

# 计算简单准确率的函数定义,发出未来版本警告,并检查 sklearn 后端依赖
def simple_accuracy(preds, labels):
    warnings.warn(DEPRECATION_WARNING, FutureWarning)
    requires_backends(simple_accuracy, "sklearn")
    return (preds == labels).mean()

# 计算准确率和 F1 分数的函数定义,发出未来版本警告,并检查 sklearn 后端依赖
def acc_and_f1(preds, labels):
    warnings.warn(DEPRECATION_WARNING, FutureWarning)
    requires_backends(acc_and_f1, "sklearn")
    acc = simple_accuracy(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds)
    return {
        "acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2,
    }

# 计算 Pearson 相关系数和 Spearman 秩相关系数的函数定义,发出未来版本警告,并检查 sklearn 后端依赖
def pearson_and_spearman(preds, labels):
    warnings.warn(DEPRECATION_WARNING, FutureWarning)
    requires_backends(pearson_and_spearman, "sklearn")
    pearson_corr = pearsonr(preds, labels)[0]
    spearman_corr = spearmanr(preds, labels)[0]
    return {
        "pearson": pearson_corr,
        "spearmanr": spearman_corr,
        "corr": (pearson_corr + spearman_corr) / 2,
    }

# 计算 GLUE 任务中指标的函数定义,发出未来版本警告,并检查 sklearn 后端依赖
def glue_compute_metrics(task_name, preds, labels):
    warnings.warn(DEPRECATION_WARNING, FutureWarning)
    requires_backends(glue_compute_metrics, "sklearn")
    assert len(preds) == len(labels), f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}"
    if task_name == "cola":
        return {"mcc": matthews_corrcoef(labels, preds)}
    elif task_name == "sst-2":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "mrpc":
        return acc_and_f1(preds, labels)
    elif task_name == "sts-b":
        return pearson_and_spearman(preds, labels)
    elif task_name == "qqp":
        return acc_and_f1(preds, labels)
    elif task_name == "mnli":
        return {"mnli/acc": simple_accuracy(preds, labels)}
    elif task_name == "mnli-mm":
        return {"mnli-mm/acc": simple_accuracy(preds, labels)}
    elif task_name == "qnli":
        return {"acc": simple_accuracy(preds, labels)}
    elif task_name == "rte":
        return {"acc": simple_accuracy(preds, labels)}
    # 如果任务名为 "wnli",则返回一个包含准确率的字典,使用 simple_accuracy 函数计算
    elif task_name == "wnli":
        return {"acc": simple_accuracy(preds, labels)}
    # 如果任务名为 "hans",则返回一个包含准确率的字典,使用 simple_accuracy 函数计算
    elif task_name == "hans":
        return {"acc": simple_accuracy(preds, labels)}
    # 如果任务名不属于以上任何一种已支持的 GLUE 任务,则抛出 KeyError 异常
    else:
        raise KeyError(task_name)
# 计算 xnli 任务的评估指标
def xnli_compute_metrics(task_name, preds, labels):
    # 发出警告,指示此函数即将弃用
    warnings.warn(DEPRECATION_WARNING, FutureWarning)
    # 确保需要的后端库被加载,这里是 sklearn
    requires_backends(xnli_compute_metrics, "sklearn")
    # 检查预测值和标签的长度是否一致,如果不一致则引发数值错误
    if len(preds) != len(labels):
        raise ValueError(f"Predictions and labels have mismatched lengths {len(preds)} and {len(labels)}")
    # 如果任务名为 "xnli",返回精度(accuracy)指标的字典
    if task_name == "xnli":
        return {"acc": simple_accuracy(preds, labels)}
    else:
        # 否则引发任务名错误
        raise KeyError(task_name)
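
下面是调用上述指标函数的一个简单示例(假设本地已安装 scikit-learn 与 scipy;数组为演示用的手工构造值):

```python
import numpy as np

from transformers.data.metrics import glue_compute_metrics, xnli_compute_metrics

preds = np.array([1, 0, 1, 1, 0, 1])
labels = np.array([1, 0, 0, 1, 0, 1])

# MRPC 同时报告准确率与 F1,输出形如 {'acc': ..., 'f1': ..., 'acc_and_f1': ...}
print(glue_compute_metrics("mrpc", preds, labels))

# XNLI 只报告准确率,输出形如 {'acc': ...}
print(xnli_compute_metrics("xnli", preds, labels))
```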

.\data\processors\glue.py

# 设置文件编码为 UTF-8
# 版权声明,包括 Google AI Language Team 和 HuggingFace Inc. 团队的版权声明
# 版权声明,包括 NVIDIA CORPORATION 的版权声明
#
# 根据 Apache 许可证 2.0 版本使用本文件
# 除非符合许可证的要求,否则不得使用本文件
# 可以从以下链接获取许可证副本:
# http://www.apache.org/licenses/LICENSE-2.0
#
# 如果不符合适用法律或书面同意,则不得分发本软件
# 本软件基于"原样"提供,无任何明示或暗示的保证或条件
# 更多详细信息,请参阅许可证
""" GLUE processors and helpers"""

# 导入操作系统相关模块
import os
# 导入警告相关模块
import warnings
# 导入数据类相关模块
from dataclasses import asdict
# 导入枚举类型相关模块
from enum import Enum
# 导入列表、可选值和联合类型相关模块
from typing import List, Optional, Union

# 导入令牌化工具相关模块
from ...tokenization_utils import PreTrainedTokenizer
# 导入 TensorFlow 是否可用相关模块
from ...utils import is_tf_available, logging
# 导入数据处理器、输入示例、输入特征相关模块
from .utils import DataProcessor, InputExample, InputFeatures

# 如果 TensorFlow 可用,则导入 TensorFlow 模块
if is_tf_available():
    import tensorflow as tf

# 获取日志记录器
logger = logging.get_logger(__name__)

# 警告信息:此函数将很快从库中移除,预处理应使用 🤗 Datasets 库处理
DEPRECATION_WARNING = (
    "This {0} will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets "
    "library. You can have a look at this example script for pointers: "
    "https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py"
)

# 函数:将输入示例转换为特征列表
def glue_convert_examples_to_features(
    examples: Union[List[InputExample], "tf.data.Dataset"],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    """
    Loads a data file into a list of `InputFeatures`

    Args:
        examples: List of `InputExamples` or `tf.data.Dataset` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length. Defaults to the tokenizer's max_len
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the `processor.get_labels()` method
        output_mode: String indicating the output mode. Either `regression` or `classification`

    Returns:
        If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the task-specific
        features. If the input is a list of `InputExamples`, will return a list of task-specific `InputFeatures` which
        can be fed to the model.

    """
    # 发出警告:此函数将很快从库中移除
    warnings.warn(DEPRECATION_WARNING.format("function"), FutureWarning)
    # 如果 TensorFlow 可用且输入示例为 tf.data.Dataset 类型,则调用对应的 TensorFlow 版本的转换函数
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        if task is None:
            raise ValueError("When calling glue_convert_examples_to_features from TF, the task parameter is required.")
        return _tf_glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
    # 调用一个函数来将示例转换为特征并返回结果
    return _glue_convert_examples_to_features(
        examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode
    )
if is_tf_available():
    # 如果 TensorFlow 可用,则定义一个私有函数 _tf_glue_convert_examples_to_features
    def _tf_glue_convert_examples_to_features(
        examples: tf.data.Dataset,
        tokenizer: PreTrainedTokenizer,
        task=str,
        max_length: Optional[int] = None,
    ) -> tf.data.Dataset:
        """
        将示例转换为特征集合的 TensorFlow 数据集。

        Returns:
            包含特定任务特征的 `tf.data.Dataset` 对象。
        """
        # 根据任务选择对应的处理器
        processor = glue_processors[task]()
        # 转换示例为 TensorFlow 数据集格式,并使用处理器处理每个示例
        examples = [processor.tfds_map(processor.get_example_from_tensor_dict(example)) for example in examples]
        # 将处理后的示例转换为特征集合
        features = glue_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
        # 根据任务类型确定标签类型
        label_type = tf.float32 if task == "sts-b" else tf.int64

        def gen():
            # 生成器函数,为 TensorFlow 数据集生成特征和标签对
            for ex in features:
                d = {k: v for k, v in asdict(ex).items() if v is not None}
                label = d.pop("label")
                yield (d, label)

        input_names = tokenizer.model_input_names

        # 返回基于生成器的 TensorFlow 数据集对象
        return tf.data.Dataset.from_generator(
            gen,
            ({k: tf.int32 for k in input_names}, label_type),
            ({k: tf.TensorShape([None]) for k in input_names}, tf.TensorShape([])),
        )


def _glue_convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    if max_length is None:
        max_length = tokenizer.model_max_length

    if task is not None:
        # 如果指定了任务,选择对应的处理器和标签列表
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info(f"Using label list {label_list} for task {task}")
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info(f"Using output mode {output_mode} for task {task}")

    # 构建标签映射字典
    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        # 根据示例获取标签
        if example.label is None:
            return None
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    # 获取所有示例的标签列表
    labels = [label_from_example(example) for example in examples]

    # 使用 tokenizer 批量编码文本对
    batch_encoding = tokenizer(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    features = []
    # 构建输入特征对象列表
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    # 输出前五个示例的日志信息
    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info(f"guid: {example.guid}")
        logger.info(f"features: {features[i]}")

    return features
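
一个小示意(数据为手工构造),说明 `output_mode` 对标签转换的影响:分类任务把字符串标签映射为 `label_map` 中的索引,回归任务(如 sts-b)则直接把标签转换为浮点数。

```python
from transformers import InputExample

# 分类任务:字符串标签被映射为整数索引
label_list = ["contradiction", "entailment", "neutral"]
label_map = {label: i for i, label in enumerate(label_list)}
example = InputExample(guid="dev-1", text_a="A man is eating.", text_b="A person eats.", label="entailment")
print(label_map[example.label])  # 1

# 回归任务(如 sts-b):标签被直接转换为浮点数
sts_example = InputExample(guid="dev-2", text_a="A", text_b="B", label="3.8")
print(float(sts_example.label))  # 3.8
```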


class OutputMode(Enum):
    # 定义输出模式枚举类
    classification = "classification"
    regression = "regression"
class MrpcProcessor(DataProcessor):
    """Processor for the MRPC data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 发出关于过时警告的警告消息
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        # 从张量字典中获取示例
        return InputExample(
            tensor_dict["idx"].numpy(),
            tensor_dict["sentence1"].numpy().decode("utf-8"),
            tensor_dict["sentence2"].numpy().decode("utf-8"),
            str(tensor_dict["label"].numpy()),
        )

    def get_train_examples(self, data_dir):
        """See base class."""
        # 获取训练集示例
        logger.info(f"LOOKING AT {os.path.join(data_dir, 'train.tsv')}")
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        # 获取开发集示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        # 获取测试集示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """See base class."""
        # 返回数据集的标签列表
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets."""
        # 为训练、开发和测试集创建示例
        examples = []
        for i, line in enumerate(lines):
            if i == 0:
                continue
            guid = f"{set_type}-{i}"
            text_a = line[3]
            text_b = line[4]
            label = None if set_type == "test" else line[0]
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples


class MnliProcessor(DataProcessor):
    """Processor for the MultiNLI data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 发出关于过时警告的警告消息
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        # 从张量字典中获取示例
        return InputExample(
            tensor_dict["idx"].numpy(),
            tensor_dict["premise"].numpy().decode("utf-8"),
            tensor_dict["hypothesis"].numpy().decode("utf-8"),
            str(tensor_dict["label"].numpy()),
        )

    def get_train_examples(self, data_dir):
        """See base class."""
        # 获取训练集示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        # 获取匹配开发集示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched")

    def get_test_examples(self, data_dir):
        """See base class."""
        # 获取匹配测试集示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test_matched")
    # 返回一个包含标签列表的字符串数组,作为基类方法的实现
    def get_labels(self):
        """See base class."""
        return ["contradiction", "entailment", "neutral"]

    # 根据给定的行列表和集合类型创建示例
    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets."""
        examples = []
        for i, line in enumerate(lines):
            # 跳过第一行,因为它通常包含标题而非数据行
            if i == 0:
                continue
            # 使用行的第一个字段与集合类型结合创建唯一标识符
            guid = f"{set_type}-{line[0]}"
            # 获取文本 A,通常在行的第 8 列
            text_a = line[8]
            # 获取文本 B,通常在行的第 9 列
            text_b = line[9]
            # 如果是测试集合的一部分,标签置为 None;否则使用行的最后一列作为标签
            label = None if set_type.startswith("test") else line[-1]
            # 创建一个 InputExample 实例并将其添加到示例列表中
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        # 返回创建的示例列表
        return examples
class MnliMismatchedProcessor(MnliProcessor):
    """Processor for the MultiNLI Mismatched data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        # 调用父类构造函数,并发出未来警告
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_dev_examples(self, data_dir):
        """See base class."""
        # 读取 dev_mismatched.tsv 文件并创建示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_mismatched")

    def get_test_examples(self, data_dir):
        """See base class."""
        # 读取 test_mismatched.tsv 文件并创建示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test_mismatched.tsv")), "test_mismatched")


class ColaProcessor(DataProcessor):
    """Processor for the CoLA data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        # 调用父类构造函数,并发出未来警告
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        # 从 tensor 字典中提取数据并创建 InputExample 对象
        return InputExample(
            tensor_dict["idx"].numpy(),
            tensor_dict["sentence"].numpy().decode("utf-8"),
            None,
            str(tensor_dict["label"].numpy()),
        )

    def get_train_examples(self, data_dir):
        """See base class."""
        # 读取 train.tsv 文件并创建训练示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        # 读取 dev.tsv 文件并创建 dev 示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        # 读取 test.tsv 文件并创建 test 示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """See base class."""
        # 返回标签列表 ["0", "1"]
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets."""
        # 根据 set_type 创建相应数据集的示例
        test_mode = set_type == "test"
        if test_mode:
            lines = lines[1:]  # 如果是测试模式,跳过表头
        text_index = 1 if test_mode else 3  # 确定文本在行中的索引
        examples = []
        for i, line in enumerate(lines):
            guid = f"{set_type}-{i}"  # 构建全局唯一标识符
            text_a = line[text_index]  # 获取文本 A
            label = None if test_mode else line[1]  # 获取标签(训练和验证集有标签,测试集没有)
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples


class Sst2Processor(DataProcessor):
    """Processor for the SST-2 data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        # 调用父类构造函数,并发出未来警告
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        # 从 tensor 字典中提取数据并创建 InputExample 对象
        return InputExample(
            tensor_dict["idx"].numpy(),
            tensor_dict["sentence"].numpy().decode("utf-8"),
            None,
            str(tensor_dict["label"].numpy()),
        )
    # 从数据目录中读取 train.tsv 文件并创建训练集的示例
    def get_train_examples(self, data_dir):
        """See base class."""
        # 调用内部方法 _read_tsv 读取 train.tsv 文件内容,并调用 _create_examples 创建示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    # 从数据目录中读取 dev.tsv 文件并创建开发集的示例
    def get_dev_examples(self, data_dir):
        """See base class."""
        # 调用内部方法 _read_tsv 读取 dev.tsv 文件内容,并调用 _create_examples 创建示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    # 从数据目录中读取 test.tsv 文件并创建测试集的示例
    def get_test_examples(self, data_dir):
        """See base class."""
        # 调用内部方法 _read_tsv 读取 test.tsv 文件内容,并调用 _create_examples 创建示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    # 返回数据集的标签,这里是二分类任务,标签为 ["0", "1"]
    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    # 根据给定的 lines 和数据集类型 set_type 创建示例
    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets."""
        examples = []
        # 确定文本在行数据中的索引,对于测试集来说是第二列(index 1),对于其他是第一列(index 0)
        text_index = 1 if set_type == "test" else 0
        # 遍历每一行数据
        for i, line in enumerate(lines):
            # 跳过表头行(第一行)
            if i == 0:
                continue
            # 每个示例有一个全局唯一的 ID
            guid = f"{set_type}-{i}"
            # 根据 text_index 取出文本内容;非测试集时再从第二列(index 1)取出标签
            text_a = line[text_index]
            label = None if set_type == "test" else line[1]
            # 创建 InputExample 对象并添加到示例列表中
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
class StsbProcessor(DataProcessor):
    """Processor for the STS-B data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 发出关于过时警告的警告
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        # 从张量字典中创建输入示例对象
        return InputExample(
            tensor_dict["idx"].numpy(),
            tensor_dict["sentence1"].numpy().decode("utf-8"),
            tensor_dict["sentence2"].numpy().decode("utf-8"),
            str(tensor_dict["label"].numpy()),
        )

    def get_train_examples(self, data_dir):
        """See base class."""
        # 从指定目录中读取训练集文件并创建示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        # 从指定目录中读取开发集文件并创建示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        # 从指定目录中读取测试集文件并创建示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """See base class."""
        # STS-B 是回归任务,没有离散标签,因此这里返回 [None]
        return [None]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets."""
        examples = []
        for i, line in enumerate(lines):
            if i == 0:
                continue
            # 构建示例的唯一标识符
            guid = f"{set_type}-{line[0]}"
            text_a = line[7]  # 第一个文本字段
            text_b = line[8]  # 第二个文本字段
            label = None if set_type == "test" else line[-1]  # 标签,测试集时为None
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples


class QqpProcessor(DataProcessor):
    """Processor for the QQP data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 发出关于过时警告的警告
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        # 从张量字典中创建输入示例对象
        return InputExample(
            tensor_dict["idx"].numpy(),
            tensor_dict["question1"].numpy().decode("utf-8"),
            tensor_dict["question2"].numpy().decode("utf-8"),
            str(tensor_dict["label"].numpy()),
        )

    def get_train_examples(self, data_dir):
        """See base class."""
        # 从指定目录中读取训练集文件并创建示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        # 从指定目录中读取开发集文件并创建示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        # 从指定目录中读取测试集文件并创建示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """See base class."""
        # 返回数据集的标签列表
        return ["0", "1"]
    # 定义一个方法用于创建训练、开发和测试集的样本示例
    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets."""
        # 根据传入的set_type确定是否为测试模式
        test_mode = set_type == "test"
        # 根据测试模式确定问题1和问题2在每行数据中的索引位置
        q1_index = 1 if test_mode else 3
        q2_index = 2 if test_mode else 4
        examples = []
        # 遍历所有行数据
        for i, line in enumerate(lines):
            # 跳过第一行(标题行)
            if i == 0:
                continue
            # 每个样本的全局唯一标识(guid)格式为"{set_type}-{line[0]}"
            guid = f"{set_type}-{line[0]}"
            try:
                # 获取问题1和问题2的文本内容
                text_a = line[q1_index]
                text_b = line[q2_index]
                # 如果是测试模式,标签为None;否则取出第5列作为标签
                label = None if test_mode else line[5]
            except IndexError:
                # 如果索引错误(行数据不足),跳过该行
                continue
            # 创建一个InputExample对象,并加入到examples列表中
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        # 返回创建的样本示例列表
        return examples
class QnliProcessor(DataProcessor):
    """Processor for the QNLI data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        # 调用父类的初始化方法,并发出未来警告
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        # 从张量字典中获取数据并创建输入示例
        return InputExample(
            tensor_dict["idx"].numpy(),  # 获取索引并转换为NumPy数组
            tensor_dict["question"].numpy().decode("utf-8"),  # 获取问题字符串并解码为UTF-8格式
            tensor_dict["sentence"].numpy().decode("utf-8"),  # 获取句子字符串并解码为UTF-8格式
            str(tensor_dict["label"].numpy()),  # 获取标签并转换为字符串
        )

    def get_train_examples(self, data_dir):
        """See base class."""
        # 获取训练集示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        # 获取验证集示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        # 获取测试集示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """See base class."""
        # 返回标签列表
        return ["entailment", "not_entailment"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets."""
        examples = []
        for i, line in enumerate(lines):
            if i == 0:
                continue  # 跳过标题行
            guid = f"{set_type}-{line[0]}"  # 创建全局唯一ID
            text_a = line[1]  # 获取第一个文本
            text_b = line[2]  # 获取第二个文本
            label = None if set_type == "test" else line[-1]  # 如果是测试集则标签为空,否则为最后一列
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))  # 添加输入示例
        return examples


class RteProcessor(DataProcessor):
    """Processor for the RTE data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        # 调用父类的初始化方法,并发出未来警告
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        # 从张量字典中获取数据并创建输入示例
        return InputExample(
            tensor_dict["idx"].numpy(),  # 获取索引并转换为NumPy数组
            tensor_dict["sentence1"].numpy().decode("utf-8"),  # 获取第一个句子并解码为UTF-8格式
            tensor_dict["sentence2"].numpy().decode("utf-8"),  # 获取第二个句子并解码为UTF-8格式
            str(tensor_dict["label"].numpy()),  # 获取标签并转换为字符串
        )

    def get_train_examples(self, data_dir):
        """See base class."""
        # 获取训练集示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        # 获取验证集示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        # 获取测试集示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """See base class."""
        # 返回标签列表
        return ["entailment", "not_entailment"]
    # 创建用于训练、开发和测试集的示例
    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets."""
        # 初始化空列表来存储示例
        examples = []
        # 遍历输入的每一行数据
        for i, line in enumerate(lines):
            # 跳过第一行(通常是标题行)
            if i == 0:
                continue
            # 根据数据集类型和行索引创建全局唯一标识符
            guid = f"{set_type}-{line[0]}"
            # 获取第一列文本作为 text_a
            text_a = line[1]
            # 获取第二列文本作为 text_b
            text_b = line[2]
            # 如果是测试集,标签设为 None;否则使用行数据的最后一列作为标签
            label = None if set_type == "test" else line[-1]
            # 创建一个输入示例并添加到示例列表中
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        # 返回创建的示例列表
        return examples
class WnliProcessor(DataProcessor):
    """Processor for the WNLI data set (GLUE version)."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        warnings.warn(DEPRECATION_WARNING.format("processor"), FutureWarning)

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        # 根据张量字典创建输入示例对象
        return InputExample(
            tensor_dict["idx"].numpy(),  # 使用张量中的索引值并转换为 numpy 数组
            tensor_dict["sentence1"].numpy().decode("utf-8"),  # 将张量中的句子1数据转换为 UTF-8 编码字符串
            tensor_dict["sentence2"].numpy().decode("utf-8"),  # 将张量中的句子2数据转换为 UTF-8 编码字符串
            str(tensor_dict["label"].numpy()),  # 使用张量中的标签值并转换为字符串
        )

    def get_train_examples(self, data_dir):
        """See base class."""
        # 获取训练集示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        # 获取开发集示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        # 获取测试集示例
        return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """See base class."""
        # 返回标签列表
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets."""
        examples = []
        for i, line in enumerate(lines):
            if i == 0:
                continue
            guid = f"{set_type}-{line[0]}"  # 使用数据集类型和行索引创建全局唯一标识符
            text_a = line[1]  # 获取第一个文本
            text_b = line[2]  # 获取第二个文本
            label = None if set_type == "test" else line[-1]  # 如果是测试集,标签设为 None;否则使用数据中的标签值
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples


glue_tasks_num_labels = {
    "cola": 2,
    "mnli": 3,
    "mrpc": 2,
    "sst-2": 2,
    "sts-b": 1,
    "qqp": 2,
    "qnli": 2,
    "rte": 2,
    "wnli": 2,
}

glue_processors = {
    "cola": ColaProcessor,  # 对应的处理器类
    "mnli": MnliProcessor,  # 对应的处理器类
    "mnli-mm": MnliMismatchedProcessor,  # 对应的处理器类
    "mrpc": MrpcProcessor,  # 对应的处理器类
    "sst-2": Sst2Processor,  # 对应的处理器类
    "sts-b": StsbProcessor,  # 对应的处理器类
    "qqp": QqpProcessor,  # 对应的处理器类
    "qnli": QnliProcessor,  # 对应的处理器类
    "rte": RteProcessor,  # 对应的处理器类
    "wnli": WnliProcessor,  # 对应的处理器类,本身就是 WnliProcessor 类
}

glue_output_modes = {
    "cola": "classification",  # 输出模式为分类
    "mnli": "classification",  # 输出模式为分类
    "mnli-mm": "classification",  # 输出模式为分类
    "mrpc": "classification",  # 输出模式为分类
    "sst-2": "classification",  # 输出模式为分类
    "sts-b": "regression",  # 输出模式为回归
    "qqp": "classification",  # 输出模式为分类
    "qnli": "classification",  # 输出模式为分类
    "rte": "classification",  # 输出模式为分类
    "wnli": "classification",  # 输出模式为分类
}
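
下面是把上述处理器与 `glue_convert_examples_to_features` 串起来的一个用法示意(假设 `glue_data/MRPC` 目录下已有官方的 train.tsv/dev.tsv,目录名与模型名均为示例):

```python
from transformers import BertTokenizer
from transformers.data.processors.glue import (
    glue_convert_examples_to_features,
    glue_output_modes,
    glue_processors,
)

task = "mrpc"
processor = glue_processors[task]()
label_list = processor.get_labels()      # ["0", "1"]
output_mode = glue_output_modes[task]    # "classification"

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
examples = processor.get_dev_examples("glue_data/MRPC")  # 路径仅为示例

features = glue_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=128,
    label_list=label_list,
    output_mode=output_mode,
)
print(len(features), features[0].input_ids[:10])
```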

.\data\processors\squad.py

# 版权声明和许可信息
#
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json  # 导入json模块
import os  # 导入os模块
from functools import partial  # 导入functools模块中的partial函数
from multiprocessing import Pool, cpu_count  # 导入multiprocessing模块中的Pool和cpu_count函数

import numpy as np  # 导入numpy库,并使用np作为别名
from tqdm import tqdm  # 从tqdm库中导入tqdm函数

from ...models.bert.tokenization_bert import whitespace_tokenize  # 从bert模型的tokenization_bert模块导入whitespace_tokenize函数
from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy  # 导入tokenization_utils_base模块中的BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy类
from ...utils import is_tf_available, is_torch_available, logging  # 从utils模块导入is_tf_available, is_torch_available, logging函数
from .utils import DataProcessor  # 从当前目录的utils模块中导入DataProcessor类

# 存储插入2个分隔符令牌的标记器集合
MULTI_SEP_TOKENS_TOKENIZERS_SET = {"roberta", "camembert", "bart", "mpnet"}

if is_torch_available():  # 如果torch可用
    import torch  # 导入torch库

    from torch.utils.data import TensorDataset  # 从torch.utils.data模块导入TensorDataset类

if is_tf_available():  # 如果tensorflow可用
    import tensorflow as tf  # 导入tensorflow库

logger = logging.get_logger(__name__)  # 获取当前模块的logger实例


def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""
    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))  # 使用tokenizer对原始答案文本进行分词

    for new_start in range(input_start, input_end + 1):  # 遍历起始和结束位置之间的所有可能起始位置
        for new_end in range(input_end, new_start - 1, -1):  # 从结束位置向前遍历到起始位置
            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])  # 根据新的起始和结束位置获取文本片段
            if text_span == tok_answer_text:  # 如果文本片段与tokenized答案文本匹配
                return (new_start, new_end)  # 返回新的起始和结束位置作为改进后的答案文本位置

    return (input_start, input_end)  # 如果找不到更好的匹配,返回原始的起始和结束位置
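
一个最小示意,说明 `_improve_answer_span` 的作用:当基于字符位置得到的粗粒度区间比标注答案更宽时,函数会在该区间内寻找与分词后答案完全一致的更紧子区间。这里用一个只按空格切分的假 tokenizer 演示(并非库中真实的 tokenizer):

```python
from transformers.data.processors.squad import _improve_answer_span


class _WhitespaceTokenizer:
    """演示用的极简 tokenizer,仅做小写化并按空格切分。"""

    def tokenize(self, text):
        return text.lower().split()


doc_tokens = ["the", "japanese", "wrote", "in", "1644", "."]
# 假设粗粒度定位给出的区间是 [1, 4],而标注答案只是 "1644"
new_start, new_end = _improve_answer_span(doc_tokens, 1, 4, _WhitespaceTokenizer(), "1644")
print(new_start, new_end)  # 4 4,区间被收紧到只包含答案本身
```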


def _check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""
    best_score = None  # 初始化最佳分数
    best_span_index = None  # 初始化最佳span的索引
    for span_index, doc_span in enumerate(doc_spans):  # 遍历所有文档span的索引和文档span
        end = doc_span.start + doc_span.length - 1  # 计算span的结束位置
        if position < doc_span.start:  # 如果当前位置小于span的起始位置,则跳过
            continue
        if position > end:  # 如果当前位置大于span的结束位置,则跳过
            continue
        num_left_context = position - doc_span.start  # 计算当前位置左侧的上下文数量
        num_right_context = end - position  # 计算当前位置右侧的上下文数量
        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length  # 计算当前span的分数
        if best_score is None or score > best_score:  # 如果当前分数是最佳分数或者比最佳分数更高
            best_score = score  # 更新最佳分数
            best_span_index = span_index  # 更新最佳span的索引

    return cur_span_index == best_span_index  # 返回当前span索引是否是最佳span的索引


def _new_check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""
    # if len(doc_spans) == 1:
    # return True
    best_score = None  # 初始化最佳分数
    best_span_index = None  # 初始化最佳span的索引
    # 下面的循环与上面旧版函数的逻辑相同,区别在于这里的 doc_spans 以字典(含 "start"/"length" 键)的形式访问
    # 遍历文档片段列表,获取每个片段的索引和内容
    for span_index, doc_span in enumerate(doc_spans):
        # 计算当前文档片段的结束位置
        end = doc_span["start"] + doc_span["length"] - 1
        # 如果当前位置在当前文档片段之前,继续下一个片段
        if position < doc_span["start"]:
            continue
        # 如果当前位置在当前文档片段之后,继续下一个片段
        if position > end:
            continue
        # 计算当前位置相对于文档片段起始位置的左侧上下文长度
        num_left_context = position - doc_span["start"]
        # 计算当前位置相对于文档片段结束位置的右侧上下文长度
        num_right_context = end - position
        # 计算当前片段的得分,考虑左右上下文和片段长度的加权
        score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"]
        # 如果当前得分是最佳得分或者是第一个评分,更新最佳得分和最佳片段索引
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index

    # 返回当前片段索引是否等于最佳片段索引
    return cur_span_index == best_span_index
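
下面用两个重叠的文档片段演示"最大上下文"的含义(数据为演示用,只构造包含 start/length 两个键的简化字典):位置 7 同时出现在两个片段中,但在第二个片段里左右上下文更均衡,因此只有第二个片段认为自己是该位置的最大上下文片段。

```python
from transformers.data.processors.squad import _new_check_is_max_context

# 两个相互重叠的文档片段
doc_spans = [
    {"start": 0, "length": 8},   # 覆盖位置 0-7
    {"start": 4, "length": 8},   # 覆盖位置 4-11
]

# 位置 7 在第一个片段里紧贴右边界,在第二个片段里上下文更均衡
print(_new_check_is_max_context(doc_spans, 0, 7))  # False
print(_new_check_is_max_context(doc_spans, 1, 7))  # True
```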
# 判断字符 c 是否为空白字符,包括空格、制表符、回车符、换行符和特定的不间断空白符(0x202F)
def _is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
        return True
    return False

# 将示例转换为特征集合
def squad_convert_example_to_features(
    example, max_seq_length, doc_stride, max_query_length, padding_strategy, is_training
):
    features = []
    
    # 如果是训练模式且示例不是不可能的情况
    if is_training and not example.is_impossible:
        # 获取答案的起始和结束位置
        start_position = example.start_position
        end_position = example.end_position
        
        # 如果在文本中找不到答案,则跳过该示例
        actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
        cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
        if actual_text.find(cleaned_answer_text) == -1:
            logger.warning(f"Could not find answer: '{actual_text}' vs. '{cleaned_answer_text}'")
            return []

    # 映射表:tokenized 后的索引到原始 token 的索引
    tok_to_orig_index = []
    # 原始 token 的索引到 tokenized 后的索引
    orig_to_tok_index = []
    # 所有的文档 token
    all_doc_tokens = []
    
    # 遍历示例的每个 token
    for i, token in enumerate(example.doc_tokens):
        orig_to_tok_index.append(len(all_doc_tokens))
        
        # 根据不同的 tokenizer 类型进行 tokenization
        if tokenizer.__class__.__name__ in [
            "RobertaTokenizer",
            "LongformerTokenizer",
            "BartTokenizer",
            "RobertaTokenizerFast",
            "LongformerTokenizerFast",
            "BartTokenizerFast",
        ]:
            sub_tokens = tokenizer.tokenize(token, add_prefix_space=True)
        else:
            sub_tokens = tokenizer.tokenize(token)
        
        # 遍历 tokenization 后的每个子 token
        for sub_token in sub_tokens:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(sub_token)

    # 如果是训练模式且示例不是不可能的情况
    if is_training and not example.is_impossible:
        # 确定答案在 tokenized 后的起始和结束位置
        tok_start_position = orig_to_tok_index[example.start_position]
        if example.end_position < len(example.doc_tokens) - 1:
            tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
        else:
            tok_end_position = len(all_doc_tokens) - 1
        
        # 改进答案跨度
        (tok_start_position, tok_end_position) = _improve_answer_span(
            all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text
        )

    # spans 是一个空列表
    spans = []

    # 截断后的查询 token 序列
    truncated_query = tokenizer.encode(
        example.question_text, add_special_tokens=False, truncation=True, max_length=max_query_length
    )

    # 对于在 <context> 与 <question> 之间插入两个 SEP token 的 tokenizer,需要特殊计算所添加的特殊 token 数量
    tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower()
    sequence_added_tokens = (
        tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1
        if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET
        else tokenizer.model_max_length - tokenizer.max_len_single_sentence
    )
    sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair

    # 文档 token 序列
    span_doc_tokens = all_doc_tokens
    while len(spans) * doc_stride < len(all_doc_tokens):
        # 如果 span 数组乘以文档步幅小于所有文档标记的长度,则继续执行循环

        # 定义我们希望截断/填充的一侧和文本/配对的排序
        if tokenizer.padding_side == "right":
            # 如果填充在右侧,则将截断后的查询设置为 texts,文档标记为 pairs
            texts = truncated_query
            pairs = span_doc_tokens
            truncation = TruncationStrategy.ONLY_SECOND.value
        else:
            # 否则,将文档标记设置为 texts,截断后的查询设置为 pairs
            texts = span_doc_tokens
            pairs = truncated_query
            truncation = TruncationStrategy.ONLY_FIRST.value

        encoded_dict = tokenizer.encode_plus(  # 使用 tokenizer 编码文本对
            texts,
            pairs,
            truncation=truncation,  # 设置截断策略
            padding=padding_strategy,  # 使用指定的填充策略
            max_length=max_seq_length,  # 设置最大序列长度
            return_overflowing_tokens=True,  # 返回溢出的标记
            stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,  # 设置步幅
            return_token_type_ids=True,  # 返回 token 类型 ID
        )

        paragraph_len = min(
            len(all_doc_tokens) - len(spans) * doc_stride,  # 段落长度限制
            max_seq_length - len(truncated_query) - sequence_pair_added_tokens,  # 最大序列长度限制
        )

        if tokenizer.pad_token_id in encoded_dict["input_ids"]:
            # 如果填充 token 在输入 IDs 中
            if tokenizer.padding_side == "right":
                # 如果填充在右侧,则获取非填充的 ID
                non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
            else:
                # 如果填充在左侧,则找到最后一个填充 token 的位置
                last_padding_id_position = (
                    len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id)
                )
                non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :]
        else:
            # 如果填充 token 不在输入 IDs 中,则所有 token 都是非填充的
            non_padded_ids = encoded_dict["input_ids"]

        tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)  # 将非填充的 ID 转换为 token

        token_to_orig_map = {}
        for i in range(paragraph_len):
            # 创建 token 到原始文档标记索引的映射
            index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
            token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]

        encoded_dict["paragraph_len"] = paragraph_len  # 记录段落长度
        encoded_dict["tokens"] = tokens  # 记录 tokens
        encoded_dict["token_to_orig_map"] = token_to_orig_map  # 记录 token 到原始文档标记的映射
        encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens
        encoded_dict["token_is_max_context"] = {}  # 记录 token 是否是最大上下文
        encoded_dict["start"] = len(spans) * doc_stride  # 记录起始位置
        encoded_dict["length"] = paragraph_len  # 记录段落长度

        spans.append(encoded_dict)  # 将编码后的字典添加到 spans 列表中

        if "overflowing_tokens" not in encoded_dict or (
            "overflowing_tokens" in encoded_dict and len(encoded_dict["overflowing_tokens"]) == 0
        ):
            # 如果没有溢出的 token,或者存在溢出的 token 且长度为 0,则跳出循环
            break
        span_doc_tokens = encoded_dict["overflowing_tokens"]  # 更新文档标记为溢出的 token
    # 遍历每一个文档片段的索引
    for doc_span_index in range(len(spans)):
        # 遍历当前文档片段中的段落长度
        for j in range(spans[doc_span_index]["paragraph_len"]):
            # 调用函数检查当前位置是否为最大上下文位置
            is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
            # 根据填充方式确定当前 token 的索引位置
            index = (
                j
                if tokenizer.padding_side == "left"  # 如果填充在左侧,则直接使用 j 作为索引
                else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j  # 如果填充在右侧,则加上查询和特殊标记的长度
            )
            # 记录当前 token 是否为最大上下文
            spans[doc_span_index]["token_is_max_context"][index] = is_max_context
    for span in spans:
        # 对于每一个文本片段,执行以下操作:

        # 找到CLS标记的位置
        cls_index = span["input_ids"].index(tokenizer.cls_token_id)

        # p_mask: 用于标记不能作为答案的token(0表示可以作为答案)
        # 原始的TF实现也保留了分类标记(设为0)
        p_mask = np.ones_like(span["token_type_ids"])
        if tokenizer.padding_side == "right":
            # 如果padding在右侧,设置超出截断查询部分后的token为0
            p_mask[len(truncated_query) + sequence_added_tokens :] = 0
        else:
            # 如果padding在左侧,设置从右侧截断token直到超出截断查询部分为止的token为0
            p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0

        # 找到所有的pad token的索引并将其标记为1
        pad_token_indices = np.where(span["input_ids"] == tokenizer.pad_token_id)
        # 找到所有的特殊token的索引并将其标记为1
        special_token_indices = np.asarray(
            tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True)
        ).nonzero()
        p_mask[pad_token_indices] = 1
        p_mask[special_token_indices] = 1

        # 将CLS标记的索引设为0,表示CLS标记可以用于不可能的答案
        p_mask[cls_index] = 0

        # 判断当前文本片段是否不可能有答案
        span_is_impossible = example.is_impossible
        start_position = 0
        end_position = 0
        if is_training and not span_is_impossible:
            # 对于训练集,如果文档片段不包含注释,丢弃该片段,因为无法预测。
            doc_start = span["start"]
            doc_end = span["start"] + span["length"] - 1
            out_of_span = False

            # 如果起始和结束位置不在文档片段范围内,则丢弃该片段
            if not (tok_start_position >= doc_start and tok_end_position <= doc_end):
                out_of_span = True

            if out_of_span:
                # 如果超出文档片段范围,则将起始和结束位置设为CLS标记的位置,标记为不可能有答案
                start_position = cls_index
                end_position = cls_index
                span_is_impossible = True
            else:
                if tokenizer.padding_side == "left":
                    doc_offset = 0
                else:
                    doc_offset = len(truncated_query) + sequence_added_tokens

                # 计算起始和结束位置相对于文档片段的偏移量
                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

        # 将当前文本片段的特征添加到列表中
        features.append(
            SquadFeatures(
                span["input_ids"],
                span["attention_mask"],
                span["token_type_ids"],
                cls_index,
                p_mask.tolist(),
                example_index=0,  # unique_id 和 example_index 这里先填 0 占位,会在后续汇总阶段统一设置
                unique_id=0,
                paragraph_len=span["paragraph_len"],
                token_is_max_context=span["token_is_max_context"],
                tokens=span["tokens"],
                token_to_orig_map=span["token_to_orig_map"],
                start_position=start_position,
                end_position=end_position,
                is_impossible=span_is_impossible,
                qas_id=example.qas_id,
            )
        )
    return features
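
上面的 while 循环按 `doc_stride` 滑动窗口,把过长的上下文切成多个相互重叠的片段。下面是一个纯 Python 的小示意(不调用 tokenizer,与库中依赖溢出 token 的真实逻辑只是近似),用来估算一个长文档大致会被切成几个片段:

```python
def estimate_num_spans(num_doc_tokens, max_tokens_for_doc, doc_stride):
    """粗略估算片段数:每个片段最多容纳 max_tokens_for_doc 个文档 token,
    窗口每次向前推进 doc_stride 个 token(与上面 while 循环的推进方式一致)。"""
    num_spans = 0
    start = 0
    while start < num_doc_tokens:
        num_spans += 1
        if start + max_tokens_for_doc >= num_doc_tokens:
            break
        start += doc_stride
    return num_spans


# 假设上下文有 800 个 token,每个片段最多放 320 个文档 token,步幅为 128
print(estimate_num_spans(800, 320, 128))  # 5
```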
def squad_convert_example_to_features_init(tokenizer_for_convert: PreTrainedTokenizerBase):
    # 定义全局变量 tokenizer,用于存储传入的 tokenizer 实例,供各个 worker 进程共享使用
    global tokenizer
    tokenizer = tokenizer_for_convert

# 将给定的示例列表转换为可以直接用作模型输入的特征列表。此函数依赖于特定模型,并利用 tokenizer 的多个特性来创建模型的输入。
# 参数:
#   examples: [`~data.processors.squad.SquadExample`] 的列表
#   tokenizer: [`PreTrainedTokenizer`] 的子类实例
#   max_seq_length: 输入的最大序列长度
#   doc_stride: 当上下文过大并被拆分为多个特征时使用的步幅
#   max_query_length: 查询的最大长度
#   is_training: 是否为模型训练创建特征,还是为评估创建特征
#   padding_strategy: 填充策略,默认为 "max_length"
#   return_dataset: 默认为 False。可以是 'pt' 或 'tf'。
#       如果为 'pt':返回一个 torch.data.TensorDataset
#       如果为 'tf':返回一个 tf.data.Dataset
#   threads: 多处理线程数
#   tqdm_enabled: 是否启用 tqdm 进度条,默认为 True
# 返回:
#   [`~data.processors.squad.SquadFeatures`] 的列表

# 示例:
# ```
# processor = SquadV2Processor()
# examples = processor.get_dev_examples(data_dir)
#
# features = squad_convert_examples_to_features(
#     examples=examples,
#     tokenizer=tokenizer,
#     max_seq_length=args.max_seq_length,
#     doc_stride=args.doc_stride,
#     max_query_length=args.max_query_length,
#     is_training=not evaluate,
# )
# ```
    ):
        # 如果 example_features 为空列表,则跳过当前循环
        if not example_features:
            continue
        # 遍历 example_features 列表中的每个元素
        for example_feature in example_features:
            # 设置 example_feature 的 example_index 属性为当前的 example_index 值
            example_feature.example_index = example_index
            # 设置 example_feature 的 unique_id 属性为当前的 unique_id 值
            example_feature.unique_id = unique_id
            # 将 example_feature 添加到 new_features 列表中
            new_features.append(example_feature)
            # 增加 unique_id 的值,用于下一个 example_feature 的 unique_id
            unique_id += 1
        # 增加 example_index 的值,用于下一个 example_features 的 example_index
        example_index += 1
    # 将 new_features 赋值给 features,更新 features 到新的特征列表
    features = new_features
    # 删除 new_features 列表,释放内存
    del new_features
    # 如果 return_dataset 等于 "pt"
    if return_dataset == "pt":
        # 检查是否有可用的 PyTorch 环境,如果没有则抛出 RuntimeError
        if not is_torch_available():
            raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.")

        # 将 features 中的各项属性转换为 PyTorch 的 Tensor 类型,并构建数据集
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
        all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float)

        # 如果不是训练模式,则创建 TensorDataset
        if not is_training:
            # 创建包含 all_input_ids 大小范围的索引的 Tensor
            all_feature_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            # 创建 TensorDataset 包含所有的 input_ids, attention_masks, token_type_ids, feature_index, cls_index, p_mask
            dataset = TensorDataset(
                all_input_ids, all_attention_masks, all_token_type_ids, all_feature_index, all_cls_index, all_p_mask
            )
        else:
            # 如果是训练模式,还需要包含 start_positions 和 end_positions
            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            # 创建 TensorDataset 包含所有的 input_ids, attention_masks, token_type_ids, start_positions, end_positions, cls_index, p_mask, is_impossible
            dataset = TensorDataset(
                all_input_ids,
                all_attention_masks,
                all_token_type_ids,
                all_start_positions,
                all_end_positions,
                all_cls_index,
                all_p_mask,
                all_is_impossible,
            )

        # 返回 features 和构建好的 dataset
        return features, dataset
    else:
        # 如果 return_dataset 不等于 "pt",则直接返回 features 列表
        return features
    """
    Processor for the SQuAD data set. overridden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and
    version 2.0 of SQuAD, respectively.
    """
    
    train_file = None
    dev_file = None

    def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
        # 如果不是评估模式,从张量字典中获取答案文本的第一个值并解码成 UTF-8 格式
        if not evaluate:
            answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8")
            # 获取答案起始位置的第一个值
            answer_start = tensor_dict["answers"]["answer_start"][0].numpy()
            # 初始化答案列表
            answers = []
        else:
            # 如果是评估模式,从张量字典中获取所有答案起始位置和文本并解码成 UTF-8 格式,存放在字典列表中
            answers = [
                {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")}
                for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"])
            ]

            answer = None
            answer_start = None

        # 返回 SquadExample 对象,包含问题ID、问题文本、上下文文本、答案文本等信息
        return SquadExample(
            qas_id=tensor_dict["id"].numpy().decode("utf-8"),
            question_text=tensor_dict["question"].numpy().decode("utf-8"),
            context_text=tensor_dict["context"].numpy().decode("utf-8"),
            answer_text=answer,
            start_position_character=answer_start,
            title=tensor_dict["title"].numpy().decode("utf-8"),
            answers=answers,
        )

    def get_examples_from_dataset(self, dataset, evaluate=False):
        """
        Creates a list of [`~data.processors.squad.SquadExample`] using a TFDS dataset.

        Args:
            dataset: The tfds dataset loaded from *tensorflow_datasets.load("squad")*
            evaluate: Boolean specifying if in evaluation mode or in training mode

        Returns:
            List of SquadExample

        Examples:

        ```
        >>> import tensorflow_datasets as tfds

        >>> dataset = tfds.load("squad")

        >>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
        >>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
        ```"""

        # 根据评估模式选择数据集的子集(训练集或验证集)
        if evaluate:
            dataset = dataset["validation"]
        else:
            dataset = dataset["train"]

        examples = []
        # 遍历数据集中的每个张量字典,并将其转换为 SquadExample 对象,存入 examples 列表中
        for tensor_dict in tqdm(dataset):
            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))

        # 返回转换后的 SquadExample 对象列表
        return examples
    # 返回训练集示例,从指定的数据目录中获取数据文件。
    def get_train_examples(self, data_dir, filename=None):
        """
        Returns the training examples from the data directory.

        Args:
            data_dir: Directory containing the data files used for training and evaluating.
            filename: None by default, specify this if the training file has a different name than the original one
                which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.

        """
        # 如果 data_dir 为 None,则设为空字符串
        if data_dir is None:
            data_dir = ""

        # 如果 self.train_file 为 None,则抛出数值错误
        if self.train_file is None:
            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

        # 打开指定的训练数据文件,使用 utf-8 编码读取
        with open(
            os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8"
        ) as reader:
            # 加载 JSON 数据并提取其 "data" 字段
            input_data = json.load(reader)["data"]
        # 使用提取的数据创建示例,并标识为训练集
        return self._create_examples(input_data, "train")

    # 返回开发集示例,从指定的数据目录中获取数据文件。
    def get_dev_examples(self, data_dir, filename=None):
        """
        Returns the evaluation example from the data directory.

        Args:
            data_dir: Directory containing the data files used for training and evaluating.
            filename: None by default, specify this if the evaluation file has a different name than the original one
                which is `dev-v1.1.json` and `dev-v2.0.json` for squad versions 1.1 and 2.0 respectively.
        """
        # 如果 data_dir 为 None,则设为空字符串
        if data_dir is None:
            data_dir = ""

        # 如果 self.dev_file 为 None,则抛出数值错误
        if self.dev_file is None:
            raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")

        # 打开指定的开发数据文件,使用 utf-8 编码读取
        with open(
            os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8"
        ) as reader:
            # 加载 JSON 数据并提取其 "data" 字段
            input_data = json.load(reader)["data"]
        # 使用提取的数据创建示例,并标识为开发集
        return self._create_examples(input_data, "dev")
    # 定义一个私有方法,用于根据输入数据和设置类型创建示例列表
    def _create_examples(self, input_data, set_type):
        # 根据设置类型确定是否为训练模式
        is_training = set_type == "train"
        # 初始化示例列表为空
        examples = []
        # 遍历输入数据中的每一个条目
        for entry in tqdm(input_data):
            # 获取条目的标题
            title = entry["title"]
            # 遍历条目中的每一个段落
            for paragraph in entry["paragraphs"]:
                # 获取段落的文本内容
                context_text = paragraph["context"]
                # 遍历段落中的每一个问答对
                for qa in paragraph["qas"]:
                    # 获取问答对的唯一标识符
                    qas_id = qa["id"]
                    # 获取问答对的问题文本
                    question_text = qa["question"]
                    # 初始化答案起始位置和答案文本为 None
                    start_position_character = None
                    answer_text = None
                    # 初始化答案列表为空
                    answers = []

                    # 检查问答对是否为不可能的情况(较少见的情况)
                    is_impossible = qa.get("is_impossible", False)
                    if not is_impossible:
                        # 如果不是不可能的情况,根据训练模式选择处理方式
                        if is_training:
                            # 如果是训练模式,获取第一个答案作为标准答案
                            answer = qa["answers"][0]
                            answer_text = answer["text"]
                            start_position_character = answer["answer_start"]
                        else:
                            # 如果不是训练模式,获取所有可能的答案列表
                            answers = qa["answers"]

                    # 创建一个新的 SquadExample 对象,并将其加入示例列表
                    example = SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        context_text=context_text,
                        answer_text=answer_text,
                        start_position_character=start_position_character,
                        title=title,
                        is_impossible=is_impossible,
                        answers=answers,
                    )
                    examples.append(example)
        # 返回创建好的示例列表
        return examples


# 定义处理SQuAD V1.1数据集的处理器,继承自SquadProcessor类
class SquadV1Processor(SquadProcessor):
    # 训练数据文件名
    train_file = "train-v1.1.json"
    # 开发数据文件名
    dev_file = "dev-v1.1.json"





# 定义处理SQuAD V2.0数据集的处理器,继承自SquadProcessor类
class SquadV2Processor(SquadProcessor):
    # 训练数据文件名
    train_file = "train-v2.0.json"
    # 开发数据文件名
    dev_file = "dev-v2.0.json"



class SquadExample:
    """
    A single training/test example for the Squad dataset, as loaded from disk.

    Args:
        qas_id: The example's unique identifier
        question_text: The question string
        context_text: The context string
        answer_text: The answer string
        start_position_character: The character position of the start of the answer
        title: The title of the example
        answers: None by default, this is used during evaluation. Holds answers as well as their start positions.
        is_impossible: False by default, set to True if the example has no possible answer.
    """

    def __init__(
        self,
        qas_id,
        question_text,
        context_text,
        answer_text,
        start_position_character,
        title,
        answers=[],
        is_impossible=False,
    ):
        self.qas_id = qas_id  # 唯一标识符
        self.question_text = question_text  # 问题文本
        self.context_text = context_text  # 上下文文本
        self.answer_text = answer_text  # 答案文本
        self.title = title  # 示例的标题
        self.is_impossible = is_impossible  # 是否不可能存在答案,默认为False
        self.answers = answers  # 答案及其起始位置,用于评估时使用,默认为空列表

        self.start_position, self.end_position = 0, 0

        doc_tokens = []  # 存储上下文文本的标记列表
        char_to_word_offset = []  # 字符到单词偏移量映射
        prev_is_whitespace = True

        # 根据空白字符分割文本,将不同的标记归属于原始位置
        for c in self.context_text:
            if _is_whitespace(c):  # 判断字符是否为空白字符的辅助函数
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        self.doc_tokens = doc_tokens  # 上下文文本的标记列表
        self.char_to_word_offset = char_to_word_offset  # 字符到单词偏移量映射

        # 仅当提供了答案起始字符位置且样本并非不可回答时(即训练模式下给出了标准答案),才计算起始和结束词位置
        if start_position_character is not None and not is_impossible:
            self.start_position = char_to_word_offset[start_position_character]
            self.end_position = char_to_word_offset[
                min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
            ]
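
下面用一个最小的例子演示上述按空白字符切分上下文并计算答案起止词位置的逻辑(示例中的问题、上下文与答案均为演示用的虚构数据):

```
from transformers.data.processors.squad import SquadExample

example = SquadExample(
    qas_id="demo-0",
    question_text="What does Hugging Face make?",
    context_text="Hugging Face makes Transformers",
    answer_text="Transformers",
    start_position_character=19,  # "Transformers" 在上下文中的起始字符位置
    title="demo",
)
print(example.doc_tokens)                            # ['Hugging', 'Face', 'makes', 'Transformers']
print(example.start_position, example.end_position)  # 3 3:答案对应切分后的第 4 个词
```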



# 单个SQuAD示例的特征,用于模型输入。这些特征是特定于模型的,
# 可以从 [`~data.processors.squad.SquadExample`] 使用
# squad_convert_examples_to_features 方法进行构建。
class SquadFeatures:
    """
    Single squad example features to be fed to a model. Those features are model-specific and can be crafted from
    [`~data.processors.squad.SquadExample`] using the
    :method:*~transformers.data.processors.squad.squad_convert_examples_to_features* method.
    """

    # 初始化函数,用于创建一个新的对象来存储输入特征和答案相关信息
    def __init__(
        self,
        input_ids,                    # 输入序列的token索引列表
        attention_mask,               # 避免在填充token索引上执行注意力计算的掩码
        token_type_ids,               # 指示输入中第一部分和第二部分的段落token索引
        cls_index,                    # CLS(分类)token的索引位置
        p_mask,                       # 用于标识可以作为答案和不可以作为答案的token的掩码
                                      # 为不可作为答案的token设置为1,可以作为答案的设置为0
        example_index,                # 示例的索引
        unique_id,                    # 特征的唯一标识符
        paragraph_len,                # 上下文段落的长度
        token_is_max_context,         # 布尔值列表,标识哪些token在此特征对象中具有最大上下文。
                                      # 如果一个token没有在此特征对象中具有最大上下文,则意味着另一个特征对象对该token有更多相关信息,应优先考虑那个特征对象。
        tokens,                       # 输入ids对应的token列表
        token_to_orig_map,            # token到原始文本的映射,用于识别答案
        start_position,               # 答案起始token索引
        end_position,                 # 答案结束token索引
        is_impossible,                # 标识答案是否不可行
        qas_id: str = None,           # 问题-答案对的唯一标识符(可选)
        encoding: BatchEncoding = None,  # 可选,存储使用快速分词器对齐方法的BatchEncoding
    ):
        self.input_ids = input_ids                # 初始化对象属性:输入ids
        self.attention_mask = attention_mask      # 初始化对象属性:注意力掩码
        self.token_type_ids = token_type_ids      # 初始化对象属性:段落token类型ids
        self.cls_index = cls_index                # 初始化对象属性:CLS token索引
        self.p_mask = p_mask                      # 初始化对象属性:答案标记掩码
        self.example_index = example_index        # 初始化对象属性:示例索引
        self.unique_id = unique_id                # 初始化对象属性:唯一标识符
        self.paragraph_len = paragraph_len        # 初始化对象属性:段落长度
        self.token_is_max_context = token_is_max_context  # 初始化对象属性:最大上下文标记
        self.tokens = tokens                      # 初始化对象属性:tokens
        self.token_to_orig_map = token_to_orig_map  # 初始化对象属性:token到原始文本的映射
        self.start_position = start_position      # 初始化对象属性:答案起始位置
        self.end_position = end_position          # 初始化对象属性:答案结束位置
        self.is_impossible = is_impossible        # 初始化对象属性:是否不可行的标记
        self.qas_id = qas_id                      # 初始化对象属性:问题-答案对的唯一标识符
        self.encoding = encoding                  # 初始化对象属性:BatchEncoding对象
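
SquadFeatures 一般不需要手工构造,而是由 squad_convert_examples_to_features 从 SquadExample 批量生成。下面是一个简化的示意(需要联网下载 bert-base-uncased 分词器;各参数取值仅为演示):

```
from transformers import AutoTokenizer
from transformers.data.processors.squad import SquadExample, squad_convert_examples_to_features

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
example = SquadExample(
    qas_id="demo-0",
    question_text="Who makes Transformers?",
    context_text="Hugging Face makes Transformers.",
    answer_text="Hugging Face",
    start_position_character=0,
    title="demo",
)

# 将示例转换为模型可用的特征(截断、滑动窗口、p_mask 等都在该函数内部完成)
features = squad_convert_examples_to_features(
    examples=[example],
    tokenizer=tokenizer,
    max_seq_length=64,
    doc_stride=32,
    max_query_length=16,
    is_training=True,
)
print(len(features), features[0].start_position, features[0].end_position)
```
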
class SquadResult:
    """
    Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset.

    Args:
        unique_id: The unique identifier corresponding to that example.
        start_logits: The logits corresponding to the start of the answer
        end_logits: The logits corresponding to the end of the answer
    """

    # 定义 SquadResult 类,用于存储 SQuAD 数据集上模型输出的结果
    def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
        # 初始化实例属性
        self.start_logits = start_logits  # 存储回答开始位置的 logits
        self.end_logits = end_logits  # 存储回答结束位置的 logits
        self.unique_id = unique_id  # 存储唯一标识符

        # 如果提供了 start_top_index 参数,则初始化以下属性
        if start_top_index:
            self.start_top_index = start_top_index  # 存储开始位置的 top index
            self.end_top_index = end_top_index  # 存储结束位置的 top index
            self.cls_logits = cls_logits  # 存储对应的 cls logits
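
SquadResult 通常在评估阶段按特征逐条构造,这里给出一个最小的构造示意(其中的 logits 为随意填写的占位数值):

```
from transformers.data.processors.squad import SquadResult

result = SquadResult(unique_id=1000000001, start_logits=[0.1, 2.3, -1.0], end_logits=[0.0, 1.5, 3.2])
print(result.unique_id, max(result.start_logits))
```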

.\data\processors\utils.py

# 导入所需的库和模块:csv、dataclasses、json,以及从typing中导入List、Optional和Union
import csv
import dataclasses
import json
from dataclasses import dataclass
from typing import List, Optional, Union

# 导入日志记录工具,这里的logging来自于上层utils模块
from ...utils import is_tf_available, is_torch_available, logging

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)


@dataclass
class InputExample:
    """
    用于简单序列分类的单个训练/测试示例。

    Args:
        guid: 示例的唯一标识符。
        text_a: 字符串。第一个序列的未分词文本。对于单序列任务,只需指定此序列。
        text_b: (可选) 字符串。第二个序列的未分词文本。仅对序列对任务必须指定。
        label: (可选) 示例的标签。对于训练和开发示例应指定,但对于测试示例不应指定。
    """

    guid: str
    text_a: str
    text_b: Optional[str] = None
    label: Optional[str] = None

    def to_json_string(self):
        """将该实例序列化为JSON字符串。"""
        return json.dumps(dataclasses.asdict(self), indent=2) + "\n"
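
InputExample 是一个简单的数据类,下面的示例展示其构造与 JSON 序列化(文本与标签均为演示用的假设数据):

```
from transformers.data.processors.utils import InputExample

example = InputExample(guid="train-0", text_a="今天的天气真不错", text_b=None, label="positive")
print(example.to_json_string())
```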


@dataclass(frozen=True)
class InputFeatures:
    """
    单个数据特征集合。属性名称与模型输入的相应名称相同。

    Args:
        input_ids: 序列标记在词汇表中的索引。
        attention_mask: 避免对填充标记索引执行注意力的掩码。
            掩码值选择在 `[0, 1]` 范围内:通常为 `1` 表示非MASKED的标记, `0` 表示MASKED的标记(填充)。
        token_type_ids: (可选) 段标记索引,指示输入的第一和第二部分。只有某些模型会使用。
        label: (可选) 输入对应的标签。对于分类问题为整数,对于回归问题为浮点数。
    """

    input_ids: List[int]
    attention_mask: Optional[List[int]] = None
    token_type_ids: Optional[List[int]] = None
    label: Optional[Union[int, float]] = None

    def to_json_string(self):
        """将该实例序列化为JSON字符串。"""
        return json.dumps(dataclasses.asdict(self)) + "\n"


class DataProcessor:
    """用于序列分类数据集的数据转换器的基类。"""
    # 抽象方法,子类需要实现从一个包含 TensorFlow 张量的字典中获取一个示例
    def get_example_from_tensor_dict(self, tensor_dict):
        """
        Gets an example from a dict with tensorflow tensors.

        Args:
            tensor_dict: Keys and values should match the corresponding Glue
                tensorflow_dataset examples.
        """
        raise NotImplementedError()

    # 抽象方法,子类需要实现从训练数据目录中获取一个 [`InputExample`] 集合
    def get_train_examples(self, data_dir):
        """Gets a collection of [`InputExample`] for the train set."""
        raise NotImplementedError()

    # 抽象方法,子类需要实现从开发数据目录中获取一个 [`InputExample`] 集合
    def get_dev_examples(self, data_dir):
        """Gets a collection of [`InputExample`] for the dev set."""
        raise NotImplementedError()

    # 抽象方法,子类需要实现从测试数据目录中获取一个 [`InputExample`] 集合
    def get_test_examples(self, data_dir):
        """Gets a collection of [`InputExample`] for the test set."""
        raise NotImplementedError()

    # 抽象方法,子类需要实现获取数据集的标签列表
    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    # 方法用于将 TensorFlow 数据集的示例转换为 GLUE 数据集的正确格式
    def tfds_map(self, example):
        """
        Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. This method converts
        examples to the correct format.
        """
        # 如果数据集有多个标签,则将示例的标签转换为标签列表中对应的索引值
        if len(self.get_labels()) > 1:
            example.label = self.get_labels()[int(example.label)]
        return example

    @classmethod
    # 读取一个以制表符分隔的值文件(TSV 文件)
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
        with open(input_file, "r", encoding="utf-8-sig") as f:
            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))
# 单句分类数据集的通用处理器类,继承自DataProcessor基类
class SingleSentenceClassificationProcessor(DataProcessor):
    """Generic processor for a single sentence classification data set."""

    def __init__(self, labels=None, examples=None, mode="classification", verbose=False):
        # 初始化处理器对象,设置标签、示例、处理模式和详细输出选项
        self.labels = [] if labels is None else labels
        self.examples = [] if examples is None else examples
        self.mode = mode
        self.verbose = verbose

    def __len__(self):
        # 返回处理器中示例的数量
        return len(self.examples)

    def __getitem__(self, idx):
        # 根据索引获取单个示例或切片,返回新的SingleSentenceClassificationProcessor对象
        if isinstance(idx, slice):
            return SingleSentenceClassificationProcessor(labels=self.labels, examples=self.examples[idx])
        return self.examples[idx]

    @classmethod
    def create_from_csv(
        cls, file_name, split_name="", column_label=0, column_text=1, column_id=None, skip_first_row=False, **kwargs
    ):
        # 从CSV文件创建处理器的类方法,返回新的处理器对象
        processor = cls(**kwargs)
        processor.add_examples_from_csv(
            file_name,
            split_name=split_name,
            column_label=column_label,
            column_text=column_text,
            column_id=column_id,
            skip_first_row=skip_first_row,
            overwrite_labels=True,
            overwrite_examples=True,
        )
        return processor

    @classmethod
    def create_from_examples(cls, texts_or_text_and_labels, labels=None, **kwargs):
        # 从示例文本或文本与标签列表创建处理器的类方法,返回新的处理器对象
        processor = cls(**kwargs)
        processor.add_examples(texts_or_text_and_labels, labels=labels)
        return processor

    def add_examples_from_csv(
        self,
        file_name,
        split_name="",
        column_label=0,
        column_text=1,
        column_id=None,
        skip_first_row=False,
        overwrite_labels=False,
        overwrite_examples=False,
    ):
        # 从CSV文件中读取行,并根据指定的列提取文本、标签和ID
        lines = self._read_tsv(file_name)
        if skip_first_row:
            lines = lines[1:]
        texts = []
        labels = []
        ids = []
        for i, line in enumerate(lines):
            texts.append(line[column_text])
            labels.append(line[column_label])
            if column_id is not None:
                ids.append(line[column_id])
            else:
                guid = f"{split_name}-{i}" if split_name else str(i)
                ids.append(guid)

        return self.add_examples(
            texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples
        )

    def add_examples(
        self, texts_or_text_and_labels, labels=None, ids=None, overwrite_labels=False, overwrite_examples=False
    ):
        # 向处理器中添加示例文本、标签和可选ID,可以选择是否覆盖已有的标签和示例
        # 检查标签和文本或文本与标签列表的长度是否匹配,如果标签不为空且长度不一致则引发 ValueError 异常
        if labels is not None and len(texts_or_text_and_labels) != len(labels):
            raise ValueError(
                f"Text and labels have mismatched lengths {len(texts_or_text_and_labels)} and {len(labels)}"
            )
        # 检查 IDs 和文本或文本与标签列表的长度是否匹配,如果 IDs 不为空且长度不一致则引发 ValueError 异常
        if ids is not None and len(texts_or_text_and_labels) != len(ids):
            raise ValueError(f"Text and ids have mismatched lengths {len(texts_or_text_and_labels)} and {len(ids)}")
        # 如果 IDs 为空,则用 None 填充与文本或文本与标签列表长度相同的列表
        if ids is None:
            ids = [None] * len(texts_or_text_and_labels)
        # 如果标签为空,则用 None 填充与文本或文本与标签列表长度相同的列表
        if labels is None:
            labels = [None] * len(texts_or_text_and_labels)
        # 初始化空列表 examples,用于存储处理后的文本示例
        examples = []
        # 初始化集合 added_labels,用于存储添加的标签,确保每个标签只添加一次
        added_labels = set()
        # 遍历文本或文本与标签列表、标签列表和 IDs 列表的并行组合
        for text_or_text_and_label, label, guid in zip(texts_or_text_and_labels, labels, ids):
            # 如果文本或文本与标签是元组或列表且标签为空,则将其解包赋值给 text 和 label
            if isinstance(text_or_text_and_label, (tuple, list)) and label is None:
                text, label = text_or_text_and_label
            else:
                text = text_or_text_and_label
            # 将当前标签添加到 added_labels 集合中
            added_labels.add(label)
            # 创建一个 InputExample 实例,并将其添加到 examples 列表中
            examples.append(InputExample(guid=guid, text_a=text, text_b=None, label=label))

        # 更新 examples 列表
        if overwrite_examples:
            self.examples = examples
        else:
            self.examples.extend(examples)

        # 更新 labels 列表
        if overwrite_labels:
            self.labels = list(added_labels)
        else:
            # 将当前 labels 列表与 added_labels 集合的并集转换为列表,更新 self.labels
            self.labels = list(set(self.labels).union(added_labels))

        # 返回更新后的 examples 列表
        return self.examples

    def get_features(
        self,
        tokenizer,
        max_length=None,
        pad_on_left=False,
        pad_token=0,
        mask_padding_with_zero=True,
        return_tensors=None,
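
上面的 SingleSentenceClassificationProcessor 可以直接从“文本-标签”对列表构建,下面是一个简短的使用示意(示例文本与标签为虚构数据;labels 由集合去重得到,顺序不保证):

```
from transformers.data.processors.utils import SingleSentenceClassificationProcessor

processor = SingleSentenceClassificationProcessor.create_from_examples(
    [("这部电影很好看", "pos"), ("剧情实在太拖沓了", "neg")]
)
print(len(processor), sorted(processor.labels))   # 2 ['neg', 'pos']
print(processor[0].text_a, processor[0].label)
```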

.\data\processors\xnli.py

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" XNLI utils (dataset loading and evaluation)"""

import os

from ...utils import logging
from .utils import DataProcessor, InputExample

# 获取日志记录器实例
logger = logging.get_logger(__name__)

class XnliProcessor(DataProcessor):
    """
    Processor for the XNLI dataset. Adapted from
    https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/run_classifier.py#L207
    """

    def __init__(self, language, train_language=None):
        # 初始化 XNLIProcessor 类的实例
        self.language = language
        self.train_language = train_language

    def get_train_examples(self, data_dir):
        """See base class."""
        # 如果没有指定 train_language,则使用 language
        lg = self.language if self.train_language is None else self.train_language
        # 读取并解析训练数据的每一行,从指定路径读取文件
        lines = self._read_tsv(os.path.join(data_dir, f"XNLI-MT-1.0/multinli/multinli.train.{lg}.tsv"))
        examples = []
        for i, line in enumerate(lines):
            if i == 0:
                continue
            # 创建全局唯一标识符,形如 "train-{i}"
            guid = f"train-{i}"
            # 第一列是文本 A
            text_a = line[0]
            # 第二列是文本 B
            text_b = line[1]
            # 第三列是标签,如果是 "contradictory" 则映射为 "contradiction"
            label = "contradiction" if line[2] == "contradictory" else line[2]
            # 确保 text_a 是字符串类型
            if not isinstance(text_a, str):
                raise ValueError(f"Training input {text_a} is not a string")
            # 确保 text_b 是字符串类型
            if not isinstance(text_b, str):
                raise ValueError(f"Training input {text_b} is not a string")
            # 确保 label 是字符串类型
            if not isinstance(label, str):
                raise ValueError(f"Training label {label} is not a string")
            # 创建 InputExample 对象并添加到 examples 列表中
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        # 返回构建的训练示例列表
        return examples
    def get_test_examples(self, data_dir):
        """See base class."""
        # 从指定路径读取测试数据集的 TSV 文件并返回每行内容组成的列表
        lines = self._read_tsv(os.path.join(data_dir, "XNLI-1.0/xnli.test.tsv"))

        # 初始化一个空列表,用于存储处理后的样本数据
        examples = []

        # 遍历每行数据,i 是行索引,line 是行内容的列表
        for i, line in enumerate(lines):
            # 跳过第一行标题行
            if i == 0:
                continue

            # 获取语言标签,该数据集中位于行的第一个位置
            language = line[0]

            # 如果语言标签不等于当前实例的语言,则跳过此条数据
            if language != self.language:
                continue

            # 构建一个唯一的全局标识符,格式为 "test-索引"
            guid = f"test-{i}"

            # 获取第一个文本句子,位于行的第七个位置
            text_a = line[6]

            # 获取第二个文本句子,位于行的第八个位置
            text_b = line[7]

            # 获取标签信息,位于行的第二个位置
            label = line[1]

            # 如果 text_a 不是字符串类型,则引发数值错误异常
            if not isinstance(text_a, str):
                raise ValueError(f"Training input {text_a} is not a string")

            # 如果 text_b 不是字符串类型,则引发数值错误异常
            if not isinstance(text_b, str):
                raise ValueError(f"Training input {text_b} is not a string")

            # 如果 label 不是字符串类型,则引发数值错误异常
            if not isinstance(label, str):
                raise ValueError(f"Training label {label} is not a string")

            # 创建一个输入样本对象,并将其添加到 examples 列表中
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))

        # 返回处理后的样本列表
        return examples

    # 返回一个包含可能的标签的列表,这些标签用于表示不同的语言推理结果
    def get_labels(self):
        """See base class."""
        return ["contradiction", "entailment", "neutral"]


# 定义一个字典,将任务名称映射到处理器类上
xnli_processors = {
    "xnli": XnliProcessor,
}

# 定义一个字典,将任务名称映射到输出模式上,这里输出模式为分类
xnli_output_modes = {
    "xnli": "classification",
}

# 定义一个字典,将任务名称映射到标签数量上,这里 "xnli" 任务有 3 个标签
xnli_tasks_num_labels = {
    "xnli": 3,
}
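
这些映射字典通常配合 XnliProcessor 一起使用,下面是一个简单的示意(仅展示标签接口;读取训练/测试数据还需要本地的 XNLI 数据文件):

```
from transformers.data.processors.xnli import xnli_processors, xnli_tasks_num_labels

processor = xnli_processors["xnli"](language="zh", train_language="en")
print(processor.get_labels())          # ['contradiction', 'entailment', 'neutral']
print(xnli_tasks_num_labels["xnli"])   # 3
```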

.\data\processors\__init__.py

# 导入 HuggingFace 库中的各种模块和函数

from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels
from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor
from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels

.\data\__init__.py

# 导入数据收集器模块,包括多个特定用途的数据收集器类和函数
from .data_collator import (
    DataCollatorForLanguageModeling,  # 语言建模数据收集器
    DataCollatorForPermutationLanguageModeling,  # 排列语言建模数据收集器
    DataCollatorForSeq2Seq,  # 序列到序列数据收集器
    DataCollatorForSOP,  # SOP(句子顺序预测)数据收集器
    DataCollatorForTokenClassification,  # 标记分类数据收集器
    DataCollatorForWholeWordMask,  # 全词蒙版数据收集器
    DataCollatorWithPadding,  # 带填充功能的数据收集器
    DefaultDataCollator,  # 默认数据收集器类
    default_data_collator,  # 默认数据收集器函数
)

# 导入指标计算函数,用于 GLUE 和 XNLI 数据集的评估
from .metrics import glue_compute_metrics, xnli_compute_metrics

# 导入数据处理器和相关类、函数,用于处理输入样本和特征
from .processors import (
    DataProcessor,  # 数据处理器基类
    InputExample,  # 输入样本类
    InputFeatures,  # 输入特征类
    SingleSentenceClassificationProcessor,  # 单句分类任务处理器
    SquadExample,  # SQuAD 样本类
    SquadFeatures,  # SQuAD 特征类
    SquadV1Processor,  # SQuAD v1 处理器
    SquadV2Processor,  # SQuAD v2 处理器
    glue_convert_examples_to_features,  # 将 GLUE 样本转换为特征的函数
    glue_output_modes,  # GLUE 数据集的输出模式
    glue_processors,  # GLUE 数据集的处理器
    glue_tasks_num_labels,  # GLUE 任务对应的标签数量
    squad_convert_examples_to_features,  # 将 SQuAD 样本转换为特征的函数
    xnli_output_modes,  # XNLI 数据集的输出模式
    xnli_processors,  # XNLI 数据集的处理器
    xnli_tasks_num_labels,  # XNLI 任务对应的标签数量
)
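
以 DataCollatorWithPadding 为例,简单演示这些数据收集器的用法(需要联网下载 bert-base-uncased 分词器,模型名仅为演示选择):

```
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 将两条长度不同的样本填充成同一长度的批次张量
batch = collator([tokenizer("hello world"), tokenizer("hi")])
print(batch["input_ids"].shape)   # 例如 torch.Size([2, 4])
```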

.\debug_utils.py

# 导入collections模块,用于管理特定数据结构的集合
import collections

# 从当前目录下的utils模块中导入ExplicitEnum、is_torch_available和logging对象
from .utils import ExplicitEnum, is_torch_available, logging

# 如果torch可用,则导入torch模块
if is_torch_available():
    import torch

# 获取当前模块的日志记录器对象
logger = logging.get_logger(__name__)

# 定义一个调试类DebugUnderflowOverflow,用于检测和理解模型何时开始变得非常大或非常小,以及重要的nan或inf权重和激活元素
class DebugUnderflowOverflow:
    """
    This debug class helps detect and understand where the model starts getting very large or very small, and more
    importantly `nan` or `inf` weight and activation elements.

    There are 2 working modes:

    1. Underflow/overflow detection (default)
    2. Specific batch absolute min/max tracing without detection

    Mode 1: Underflow/overflow detection

    To activate the underflow/overflow detection, initialize the object with the model :

    ```
    debug_overflow = DebugUnderflowOverflow(model)
    ```

    then run the training as normal and if `nan` or `inf` gets detected in at least one of the weight, input or output
    elements this module will throw an exception and will print `max_frames_to_save` frames that lead to this event,
    each frame reporting

    1. the fully qualified module name plus the class name whose `forward` was run
    2. the absolute min and max value of all elements for each module weights, and the inputs and output

    For example, here is the header and the last few frames in detection report for `google/mt5-small` run in fp16
    mixed precision :

    ```
    Detected inf/nan during batch_number=0
    Last 21 forward frames:
    abs min  abs max  metadata
    [...]
                      encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
    2.17e-07 4.50e+00 weight
    1.79e-06 4.65e+00 input[0]
    2.68e-06 3.70e+01 output
                      encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
    8.08e-07 2.66e+01 weight
    1.79e-06 4.65e+00 input[0]
    1.27e-04 2.37e+02 output
                      encoder.block.2.layer.1.DenseReluDense.wo Linear
    1.01e-06 6.44e+00 weight
    0.00e+00 9.74e+03 input[0]
    3.18e-04 6.27e+04 output
                      encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
    1.79e-06 4.65e+00 input[0]
    3.18e-04 6.27e+04 output
                      encoder.block.2.layer.1.dropout Dropout
    3.18e-04 6.27e+04 input[0]
    0.00e+00      inf output
    ```

    You can see here, that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value
    was around 62.7K, which is very close to fp16's top limit of 64K. In the next frame `Dropout` zeroes some of the
    elements and renormalizes the rest, which pushes the absolute max value past 64K and produces the overflow
    (`inf`) shown in the last line of the report.

    As this module measures absolute `min`/`max` of each weight of the model on every forward it'll slow the training
    down. Therefore remember to turn it off once the debugging needs have been met.

    Args:
        model (`nn.Module`):
            The model to debug.
        max_frames_to_save (`int`, *optional*, defaults to 21):
            How many frames back to record
        trace_batch_nums(`List[int]`, *optional*, defaults to `[]`):
            Which batch numbers to trace (turns detection off)
        abort_after_batch_num (`int`, *optional*):
            Whether to abort after a certain batch number has finished
    """
    # 初始化函数,用于设置对象的初始状态和属性
    def __init__(self, model, max_frames_to_save=21, trace_batch_nums=[], abort_after_batch_num=None):
        self.model = model  # 将传入的模型对象保存到实例属性中
        self.trace_batch_nums = trace_batch_nums  # 保存需要跟踪的批次号列表
        self.abort_after_batch_num = abort_after_batch_num  # 设置在哪个批次号之后终止运行

        # 创建一个LIFO(后进先出)的缓冲区,用于存储帧以便在遇到inf/nan时立即转储,以提供问题发生的上下文
        self.frames = collections.deque([], max_frames_to_save)  # 初始化一个空的固定大小的双端队列
        self.frame = []  # 初始化一个空的帧列表
        self.batch_number = 0  # 初始化批次号为0
        self.total_calls = 0  # 初始化总调用次数为0
        self.detected_overflow = False  # 初始化检测到溢出标志为False
        self.prefix = "                 "  # 初始化前缀字符串

        # 分析模型,可能是提取模型中模块的全限定名以便在运行时报告
        self.analyse_model()

        # 注册前向钩子(hook),用于在模型的前向传播过程中记录信息
        self.register_forward_hook()

    # 保存帧的方法,将当前帧内容转换为字符串并存入缓存队列中
    def save_frame(self, frame=None):
        if frame is not None:
            self.expand_frame(frame)  # 如果有指定帧,则扩展当前帧
        self.frames.append("\n".join(self.frame))  # 将当前帧转换为字符串并添加到帧缓存队列中
        self.frame = []  # 清空当前帧,以便开始新的帧记录

    # 扩展当前帧的方法,将一行文本添加到当前帧中
    def expand_frame(self, line):
        self.frame.append(line)  # 将一行文本添加到当前帧的末尾

    # 跟踪帧的方法,将所有缓存的帧内容打印出来
    def trace_frames(self):
        print("\n".join(self.frames))  # 打印所有缓存的帧内容
        self.frames = []  # 清空帧缓存队列

    # 重置保存的帧的方法,清空所有缓存的帧内容
    def reset_saved_frames(self):
        self.frames = []  # 清空帧缓存队列

    # 转储保存的帧的方法,打印检测到inf/nan时的批次号和最后保存的前向帧信息
    def dump_saved_frames(self):
        print(f"\nDetected inf/nan during batch_number={self.batch_number}")  # 打印检测到inf/nan时的批次号
        print(f"Last {len(self.frames)} forward frames:")  # 打印最后保存的前向帧数量
        print(f"{'abs min':8} {'abs max':8} metadata")  # 打印帧数据表头
        print("\n".join(self.frames))  # 打印所有缓存的前向帧内容
        print("\n\n")  # 打印额外的空行以分隔输出内容
        self.frames = []  # 清空帧缓存队列

    # 分析模型的方法,提取模型中模块的全限定名并保存到实例属性中
    def analyse_model(self):
        # 提取模型中所有模块的全限定名,保存为字典形式,键为模块名,值为全限定名
        self.module_names = {m: name for name, m in self.model.named_modules()}
        # self.longest_module_name = max(len(v) for v in self.module_names.values())

    # 分析变量的方法,根据变量类型进行不同的处理,如打印最大最小值或检测溢出
    def analyse_variable(self, var, ctx):
        if torch.is_tensor(var):  # 如果是张量
            self.expand_frame(get_abs_min_max(var, ctx))  # 获取张量的绝对最小值和最大值,并扩展当前帧
            if detect_overflow(var, ctx):  # 检测张量是否溢出
                self.detected_overflow = True  # 标记检测到溢出
        elif var is None:  # 如果变量为None
            self.expand_frame(f"{'None':>17} {ctx}")  # 扩展当前帧记录为"None"
        else:  # 变量不是张量也不是None
            self.expand_frame(f"{'not a tensor':>17} {ctx}")  # 扩展当前帧记录为"not a tensor"

    # 批次开始时记录帧的方法,记录批次号和开始信息到当前帧中
    def batch_start_frame(self):
        self.expand_frame(f"\n\n{self.prefix} *** Starting batch number={self.batch_number} ***")  # 扩展当前帧记录批次开始信息
        self.expand_frame(f"{'abs min':8} {'abs max':8} metadata")  # 扩展当前帧记录帧数据表头

    # 批次结束时记录帧的方法,记录批次号和结束信息到当前帧中
    def batch_end_frame(self):
        self.expand_frame(f"{self.prefix} *** Finished batch number={self.batch_number-1} ***\n\n")  # 扩展当前帧记录批次结束信息
    def create_frame(self, module, input, output):
        # 扩展调用栈帧,包括模块名称和类名
        self.expand_frame(f"{self.prefix} {self.module_names[module]} {module.__class__.__name__}")

        # 分析模块的参数
        for name, p in module.named_parameters(recurse=False):
            self.analyse_variable(p, name)

        # 分析输入变量
        if isinstance(input, tuple):
            for i, x in enumerate(input):
                self.analyse_variable(x, f"input[{i}]")
        else:
            self.analyse_variable(input, "input")

        # 分析输出变量
        if isinstance(output, tuple):
            for i, x in enumerate(output):
                # 如果输出是元组,进一步分析内部元素
                if isinstance(x, tuple):
                    for j, y in enumerate(x):
                        self.analyse_variable(y, f"output[{i}][{j}]")
                else:
                    self.analyse_variable(x, f"output[{i}]")
        else:
            self.analyse_variable(output, "output")

        # 保存当前帧信息
        self.save_frame()

    def register_forward_hook(self):
        # 对模型应用前向钩子
        self.model.apply(self._register_forward_hook)

    def _register_forward_hook(self, module):
        # 注册前向钩子到指定模块
        module.register_forward_hook(self.forward_hook)

    def forward_hook(self, module, input, output):
        # - input 是一个打包输入的元组(可能包含非张量)
        # - output 可能是一个张量或者张量和非张量的元组

        last_frame_of_batch = False

        # 判断是否在跟踪批次号中
        trace_mode = True if self.batch_number in self.trace_batch_nums else False
        if trace_mode:
            self.reset_saved_frames()

        # 如果是第一次调用,则开始一个新批次的帧
        if self.total_calls == 0:
            self.batch_start_frame()
        self.total_calls += 1

        # 如果模块是整个模型,增加批次号并标记为批次的最后一帧
        if module == self.model:
            self.batch_number += 1
            last_frame_of_batch = True

        # 创建调用帧
        self.create_frame(module, input, output)

        # 如果是批次的最后一帧,则为下一个批次提前写入新的批次起始帧头
        if last_frame_of_batch:
            self.batch_start_frame()

        # 如果在跟踪模式中,追踪帧
        if trace_mode:
            self.trace_frames()

        # 如果检测到溢出或下溢,并且不在跟踪模式中,则转储保存的帧信息并抛出异常
        if self.detected_overflow and not trace_mode:
            self.dump_saved_frames()
            raise ValueError(
                "DebugUnderflowOverflow: inf/nan detected, aborting as there is no point running further. "
                "Please scroll up above this traceback to see the activation values prior to this event."
            )

        # 如果请求在特定批次之后中止,则抛出异常
        if self.abort_after_batch_num is not None and self.batch_number > self.abort_after_batch_num:
            raise ValueError(
                f"DebugUnderflowOverflow: aborting after {self.batch_number} batches due to"
                f" `abort_after_batch_num={self.abort_after_batch_num}` arg"
            )
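
下面给出一个最小的使用示意:把调试器挂到任意 nn.Module 上,并用 trace_batch_nums 打开“仅追踪”的模式(示例模型为随意搭建的小网络,仅用于演示):

```
import torch
from torch import nn

from transformers.debug_utils import DebugUnderflowOverflow

model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
debug = DebugUnderflowOverflow(model, trace_batch_nums=[0])  # 只追踪第 0 个批次,不做 inf/nan 检测

# 每次前向传播都会打印各子模块权重、输入与输出的绝对最小值/最大值
model(torch.randn(2, 4))
```
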
# 计算变量的绝对值的最小值和最大值,并返回格式化的字符串
def get_abs_min_max(var, ctx):
    # 计算变量的绝对值
    abs_var = var.abs()
    # 返回格式化的字符串,包括绝对值的最小值和最大值,以及上下文信息 ctx
    return f"{abs_var.min():8.2e} {abs_var.max():8.2e} {ctx}"


# 检测张量变量中是否包含 `nan` 或 `inf` 条目,并打印相关消息
def detect_overflow(var, ctx):
    """
    Report whether the tensor contains any `nan` or `inf` entries.

    This is useful for detecting overflows/underflows and best to call right after the function that did some math that
    modified the tensor in question.

    This function contains a few other helper features that you can enable and tweak directly if you want to track
    various other things.

    Args:
        var: the tensor variable to check
        ctx: the message to print as a context

    Return:
        `True` if `inf` or `nan` was detected, `False` otherwise
    """
    detected = False
    # 检测是否存在 `nan` 条目,若存在则设置 detected 为 True 并打印包含 `nans` 的上下文信息 ctx
    if torch.isnan(var).any().item():
        detected = True
        print(f"{ctx} has nans")
    # 检测是否存在 `inf` 条目,若存在则设置 detected 为 True 并打印包含 `infs` 的上下文信息 ctx
    if torch.isinf(var).any().item():
        detected = True
        print(f"{ctx} has infs")

    # 如果需要监视大的元素,可以启用以下功能
    if 0:  # and detected:
        # 打印绝对值大于等于 100 的元素数量
        n100 = var[torch.ge(var.abs(), 100)]
        if n100.numel() > 0:
            print(f"{ctx}:  n100={n100.numel()}")
        # 打印绝对值大于等于 1000 的元素数量
        n1000 = var[torch.ge(var.abs(), 1000)]
        if n1000.numel() > 0:
            print(f"{ctx}: n1000={n1000.numel()}")
        # 打印绝对值大于等于 10000 的元素数量
        n10000 = var[torch.ge(var.abs(), 10000)]
        if n10000.numel() > 0:
            print(f"{ctx}: n10000={n10000.numel()}")

    # 如果需要打印最小值和最大值,可以启用以下功能
    if 0:
        print(f"min={var.min():9.2e} max={var.max():9.2e}")

    # 如果需要打印最小值、最大值、方差和均值,可以启用以下功能
    if 0:
        print(f"min={var.min():9.2e} max={var.max():9.2e} var={var.var():9.2e} mean={var.mean():9.2e} ({ctx})")

    # 返回是否检测到 `inf` 或 `nan` 的布尔值
    return detected
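
detect_overflow 也可以在自定义代码中单独调用,用来快速检查某个张量里是否出现了 inf/nan,例如:

```
import torch

from transformers.debug_utils import detect_overflow

t = torch.tensor([1.0, float("inf"), 3.0])
print(detect_overflow(t, "demo tensor"))   # 先打印 "demo tensor has infs",随后输出 True
```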


# 调试选项的枚举类,列出了一些调试选项
class DebugOption(ExplicitEnum):
    # 检测下溢和上溢
    UNDERFLOW_OVERFLOW = "underflow_overflow"
    # TPU 指标调试
    TPU_METRICS_DEBUG = "tpu_metrics_debug"