Transformers-源码解析-一百二十三-

Transformers 源码解析（一百二十三）

`.\models\wav2vec2_conformer\init.py`

# 导入所需的依赖和模块
from typing import TYPE_CHECKING  # 导入类型检查模块

from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available  # 导入自定义工具函数和类


_import_structure = {
    "configuration_wav2vec2_conformer": [
        "WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",  # 预训练配置映射
        "Wav2Vec2ConformerConfig",  # Wav2Vec2Conformer 的配置类
    ],
}

# 检查是否有 torch 可用，如果不可用则抛出异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果 torch 可用，添加下列模块到导入结构中
    _import_structure["modeling_wav2vec2_conformer"] = [
        "WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",  # 预训练模型存档列表
        "Wav2Vec2ConformerForAudioFrameClassification",  # 用于音频帧分类的 Wav2Vec2Conformer 模型
        "Wav2Vec2ConformerForCTC",  # 用于 CTC 的 Wav2Vec2Conformer 模型
        "Wav2Vec2ConformerForPreTraining",  # 用于预训练的 Wav2Vec2Conformer 模型
        "Wav2Vec2ConformerForSequenceClassification",  # 用于序列分类的 Wav2Vec2Conformer 模型
        "Wav2Vec2ConformerForXVector",  # 用于 XVector 的 Wav2Vec2Conformer 模型
        "Wav2Vec2ConformerModel",  # Wav2Vec2Conformer 模型
        "Wav2Vec2ConformerPreTrainedModel",  # 预训练的 Wav2Vec2Conformer 模型
    ]

# 如果是类型检查模式，从相应的模块中导入特定的符号
if TYPE_CHECKING:
    from .configuration_wav2vec2_conformer import (
        WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,  # 预训练配置映射
        Wav2Vec2ConformerConfig,  # Wav2Vec2Conformer 的配置类
    )

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_wav2vec2_conformer import (
            WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,  # 预训练模型存档列表
            Wav2Vec2ConformerForAudioFrameClassification,  # 用于音频帧分类的 Wav2Vec2Conformer 模型
            Wav2Vec2ConformerForCTC,  # 用于 CTC 的 Wav2Vec2Conformer 模型
            Wav2Vec2ConformerForPreTraining,  # 用于预训练的 Wav2Vec2Conformer 模型
            Wav2Vec2ConformerForSequenceClassification,  # 用于序列分类的 Wav2Vec2Conformer 模型
            Wav2Vec2ConformerForXVector,  # 用于 XVector 的 Wav2Vec2Conformer 模型
            Wav2Vec2ConformerModel,  # Wav2Vec2Conformer 模型
            Wav2Vec2ConformerPreTrainedModel,  # 预训练的 Wav2Vec2Conformer 模型
        )

# 如果不是类型检查模式，将当前模块设为懒加载模块
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\wav2vec2_phoneme\tokenization_wav2vec2_phoneme.py`

# 设置文件编码为UTF-8

# 版权声明，版权归Facebook Inc.和HuggingFace Inc.团队所有

# 引入必要的库和模块
import json  # 导入处理JSON格式的模块
import os  # 导入操作系统功能的模块
import sys  # 导入系统相关的模块
from dataclasses import dataclass  # 导入用于定义数据类的装饰器
from itertools import groupby  # 导入用于迭代操作的工具函数
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union  # 导入类型提示相关的工具函数

import numpy as np  # 导入处理数值数组的库

# 导入HuggingFace库中的相关工具和类
from ...tokenization_utils import PreTrainedTokenizer  # 导入预训练分词器的基类
from ...tokenization_utils_base import AddedToken  # 导入添加的特殊标记类
from ...utils import (  # 导入HuggingFace库中的一些实用工具函数和类
    ModelOutput,  # 导入模型输出的基类
    is_flax_available,  # 判断是否可以使用Flax库
    is_tf_available,  # 判断是否可以使用TensorFlow库
    is_torch_available,  # 判断是否可以使用PyTorch库
    logging,  # 日志记录工具
    requires_backends,  # 判断所需的后端库是否可用
    to_py_obj,  # 将对象转换为Python对象
)

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 如果类型检查开启，则根据当前可用的深度学习框架导入相应的库
if TYPE_CHECKING:
    if is_torch_available():
        import torch  # 导入PyTorch库
    if is_tf_available():
        import tensorflow as tf  # 导入TensorFlow库
    if is_flax_available():
        import jax.numpy as jnp  # 导入Flax库中的NumPy模块（忽略Flax库导入的警告）

# 定义词汇文件和分词器配置文件的名称映射
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",  # 词汇表文件名
    "tokenizer_config_file": "tokenizer_config.json",  # 分词器配置文件名
}

# 预训练模型的词汇文件映射，包括对应的下载链接
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "facebook/wav2vec2-lv-60-espeak-cv-ft": (
            "https://huggingface.co/facebook/wav2vec2-lv-60-espeak-cv-ft/resolve/main/vocab.json"
        ),
    },
    "tokenizer_config_file": {
        "facebook/wav2vec2-lv-60-espeak-cv-ft": (
            "https://huggingface.co/facebook/wav2vec2-lv-60-espeak-cv-ft/resolve/main/tokenizer_config.json"
        ),
    },
}

# 预训练模型的位置编码嵌入大小映射，这里给出了一个特定模型的最大输入长度
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/wav2vec2-lv-60-espeak-cv-ft": sys.maxsize}

# 定义一种数据类型，表示列表中包含字典的结构
ListOfDict = List[Dict[str, Union[int, str]]]

@dataclass
class Wav2Vec2PhonemeCTCTokenizerOutput(ModelOutput):
    """
    [`Wav2Vec2PhonemeCTCTokenizer`]的输出类型，带有音素。

    Args:
        text (list of `str` or `str`):
            解码的文本，通常是语音转录。
        char_offsets (list of `List[Dict[str, Union[int, str]]]` or `List[Dict[str, Union[int, str]]]`):
            解码字符的偏移量。结合采样率和模型下采样率，可以用来计算每个字符的时间戳。
    """
    text: Union[List[str], str]  # 文本内容，可以是字符串或字符串列表
    char_offsets: Union[List[ListOfDict], ListOfDict] = None  # 字符的偏移量，可以是列表的列表或列表


class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
    """
    构造一个Wav2Vec2PhonemeCTC分词器。
    """
    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to
    the superclass for more information regarding such methods.

    Args:
        vocab_file (`str`):
            File containing the vocabulary.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sentence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sentence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        do_phonemize (`bool`, *optional*, defaults to `True`):
            Whether the tokenizer should phonetize the input or not. Only if a sequence of phonemes is passed to the
            tokenizer, `do_phonemize` should be set to `False`.
        phonemizer_lang (`str`, *optional*, defaults to `"en-us"`):
            The language of the phoneme set to which the tokenizer should phonetize the input text to.
        phonemizer_backend (`str`, *optional*. defaults to `"espeak"`):
            The backend phonetization library that shall be used by the phonemizer library. Defaults to `espeak-ng`.
            See the [phonemizer package](https://github.com/bootphon/phonemizer#readme). for more information.

        **kwargs
            Additional keyword arguments passed along to [`PreTrainedTokenizer`]
    ):
        self._word_delimiter_token = word_delimiter_token  # 设置单词分隔符令牌
        self._phone_delimiter_token = phone_delimiter_token  # 设置电话分隔符令牌
        self.do_phonemize = do_phonemize  # 是否执行音素化操作的标志
        self.phonemizer_lang = phonemizer_lang  # 音素化使用的语言
        self.phonemizer_backend = phonemizer_backend  # 音素化使用的后端

        if do_phonemize:
            self.init_backend(self.phonemizer_lang)  # 若需执行音素化，则初始化音素化后端

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)  # 从文件中加载编码器映射
        self.decoder = {v: k for k, v in self.encoder.items()}  # 创建解码器映射，反转编码器的键值对

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            word_delimiter_token=word_delimiter_token,
            phone_delimiter_token=phone_delimiter_token,
            do_phonemize=do_phonemize,
            phonemizer_lang=phonemizer_lang,
            phonemizer_backend=phonemizer_backend,
            **kwargs,
        )  # 调用父类的初始化方法，传入相关参数

    @property
    def vocab_size(self) -> int:
        return len(self.decoder)  # 返回解码器的大小作为词汇表大小

    def get_vocab(self) -> Dict:
        vocab = dict(self.encoder.copy())  # 复制编码器的内容作为词汇表
        vocab.update(self.added_tokens_encoder)  # 添加额外的编码器映射
        return vocab  # 返回完整的词汇表

    def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int:
        # 覆盖方法以避免去除空格！
        to_add = []
        for token in new_tokens:
            if isinstance(token, str):
                to_add.append(AddedToken(token, rstrip=False, lstrip=False, normalized=True, special=special_tokens))
            else:
                to_add.append(token)

        return super()._add_tokens(to_add, special_tokens)  # 调用父类的添加令牌方法，添加新令牌

    def init_backend(self, phonemizer_lang: str):
        """
        Initializes the backend.

        Args:
            phonemizer_lang (`str`): The language to be used.
        """
        requires_backends(self, "phonemizer")  # 检查必要的后端
        from phonemizer.backend import BACKENDS

        self.backend = BACKENDS[self.phonemizer_backend](phonemizer_lang, language_switch="remove-flags")  # 初始化音素化后端

    def prepare_for_tokenization(
        self,
        text: str,
        is_split_into_words: bool = False,
        phonemizer_lang: Optional[str] = None,
        do_phonemize: Optional[bool] = None,
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize. This is useful for NER or token classification.
            phonemizer_lang (`str`, *optional*):
                The language of the phoneme set to which the tokenizer should phonetize the input text to.
            do_phonemize (`bool`, *optional*):
                Whether the tokenizer should phonetize the input text or not. Only if a sequence of phonemes is passed
                to the tokenizer, `do_phonemize` should be set to `False`.

        Returns:
            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
        """
        # If `is_split_into_words` is True, prepend a space to `text`
        if is_split_into_words:
            text = " " + text

        # Set the instance variable `self.do_phonemize` based on the provided `do_phonemize`
        if do_phonemize is not None:
            self.do_phonemize = do_phonemize

        # Set the instance variable `self.phonemizer_lang` and initialize backend if `phonemizer_lang` is provided
        if phonemizer_lang is not None:
            self.phonemizer_lang = phonemizer_lang
            self.init_backend(phonemizer_lang)

        # Return the modified `text` and an empty dictionary (unused kwargs)
        return (text, {})

    def _tokenize(self, text, **kwargs):
        """
        Converts a string into a sequence of tokens (string), using the tokenizer.
        """

        # Remove leading and trailing whitespace from `text`
        text = text.strip()

        # Phonemize the `text` if `self.do_phonemize` is True
        if self.do_phonemize:
            # Convert `text` to lowercase
            text = text.lower()

            # Generate a list of phonemes for the `text` in `self.phonemizer_lang`
            text = self.phonemize(text, self.phonemizer_lang)

        # Split `text` into tokens using whitespace as delimiter
        tokens = text.split(" ")

        # Remove empty tokens from the list
        tokens = list(filter(lambda p: p.strip() != "", tokens))

        # Return the list of tokens
        return tokens
    def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str:
        # 导入分离器模块
        from phonemizer.separator import Separator
        
        # 如果设置了单词分隔符标记，并且不为 None，则加上空格
        word_delimiter = self.word_delimiter_token + " " if self.word_delimiter_token is not None else ""
        
        # 如果指定了语言且与当前使用的语言不同，则重新初始化后端
        if phonemizer_lang is not None and phonemizer_lang != self.phonemizer_lang:
            self.init_backend(phonemizer_lang)
        else:
            # 否则使用当前的语言设置
            phonemizer_lang = self.phonemizer_lang
        
        # 创建分隔符对象，用于指定音素之间的分隔符
        separator = Separator(phone=self.phone_delimiter_token, word=word_delimiter, syllable="")
        
        # 对输入的文本进行音素化处理，返回一个包含音素的列表，取第一个元素并去除两端空白
        phonemes = self.backend.phonemize(
            [text],
            separator=separator,
        )
        phonemes = phonemes[0].strip()

        return phonemes

    @property
    def word_delimiter_token(self) -> str:
        """
        `str`: 单词分隔符标记。如果在尚未设置时使用，则记录错误日志。
        """
        if self._word_delimiter_token is None:
            if self.verbose:
                logger.error("Using word_delimiter_token, but it is not set yet.")
            return None
        return str(self._word_delimiter_token)

    @property
    def word_delimiter_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: 单词分隔符标记在词汇表中的ID。如果尚未设置，则返回 `None`。
        """
        if self._word_delimiter_token is None:
            return None
        return self.convert_tokens_to_ids(self.word_delimiter_token)

    @word_delimiter_token.setter
    def word_delimiter_token(self, value):
        # 设置单词分隔符标记的值
        self._word_delimiter_token = value

    @word_delimiter_token_id.setter
    def word_delimiter_token_id(self, value):
        # 根据给定的值将其转换为词汇表中的ID，并设置为单词分隔符标记
        self._word_delimiter_token = self.convert_tokens_to_ids(value)

    @property
    def phone_delimiter_token(self) -> str:
        """
        `str`: 音素分隔符标记。如果在尚未设置时使用，则记录错误日志。
        """
        if self._phone_delimiter_token is None:
            if self.verbose:
                logger.error("Using phone_delimiter_token, but it is not set yet.")
            return None
        return str(self._phone_delimiter_token)

    @property
    def phone_delimiter_token_id(self) -> Optional[int]:
        """
        `Optional[int]`: 音素分隔符标记在词汇表中的ID。如果尚未设置，则返回 `None`。
        """
        if self._phone_delimiter_token is None:
            return None
        return self.convert_tokens_to_ids(self.phone_delimiter_token)

    @phone_delimiter_token.setter
    def phone_delimiter_token(self, value):
        # 设置音素分隔符标记的值
        self._phone_delimiter_token = value

    @phone_delimiter_token_id.setter
    def phone_delimiter_token_id(self, value):
        # 根据给定的值将其转换为词汇表中的ID，并设置为音素分隔符标记
        self._phone_delimiter_token = self.convert_tokens_to_ids(value)

    def _convert_token_to_id(self, token: str) -> int:
        """将给定的 token（字符串）转换为索引（整数），使用词汇表进行映射。"""
        return self.encoder.get(token, self.encoder.get(self.unk_token))
    # 将索引转换为标记字符串，使用词汇表进行解码
    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) into a token (str) using the vocabulary."""
        # 从解码器中获取索引对应的标记，如果不存在则使用未知标记（unk_token）
        result = self.decoder.get(index, self.unk_token)
        return result

    # 将连接主义时间分类（CTC）输出的标记列表转换为单个字符串
    def convert_tokens_to_string(
        self,
        tokens: List[str],
        group_tokens: bool = True,
        spaces_between_special_tokens: bool = False,
        filter_word_delimiter_token: bool = True,
        output_char_offsets: bool = False,
    ) -> str:
        """
        Converts a connectionist-temporal-classification (CTC) output tokens into a single string.
        """
        # 将相同的标记组合成非重复的标记，用于CTC风格的解码
        if group_tokens:
            # 使用itertools.groupby按标记分组，并记录每组的长度
            chars, char_repetitions = zip(*((token, len(list(group_iter))) for token, group_iter in groupby(tokens)))
        else:
            chars = tokens
            char_repetitions = len(tokens) * [1]

        # 过滤掉self.pad_token，这是用作CTC空白标记的特殊标记
        processed_chars = list(filter(lambda char: char != self.pad_token, chars))

        # 如果设置了过滤单词分隔符标记并且存在self.word_delimiter_token，则也过滤该标记
        if filter_word_delimiter_token and self.word_delimiter_token is not None:
            processed_chars = list(filter(lambda token: token != self.word_delimiter_token, processed_chars))

        # 如果需要输出字符偏移量，则计算偏移量
        char_offsets = None
        if output_char_offsets:
            # 计算字符偏移量，需要考虑CTC标记和单词分隔符标记
            word_delimiter_token_for_offsets = (
                self.word_delimiter_token if filter_word_delimiter_token is True else None
            )
            char_offsets = self._compute_offsets(
                char_repetitions, chars, self.pad_token, word_delimiter_token=word_delimiter_token_for_offsets
            )

            # 检查偏移量和处理后的标记列表长度是否一致
            if len(char_offsets) != len(processed_chars):
                raise ValueError(
                    f"`char_offsets`: {char_offsets} and `processed_tokens`: {processed_chars}"
                    " have to be of the same length, but are: `len(offsets)`: "
                    f"{len(char_offsets)} and `len(processed_tokens)`: {len(processed_chars)}"
                )

            # 将偏移量中的标记字段设置为正确的处理后的标记
            for i, char in enumerate(processed_chars):
                char_offsets[i]["char"] = char

        # 将处理后的标记列表连接成字符串，并去除首尾空格
        string = " ".join(processed_chars).strip()

        # 返回包含文本字符串和字符偏移量的字典
        return {"text": string, "char_offsets": char_offsets}

    # 计算字符偏移量的静态方法
    @staticmethod
    def _compute_offsets(
        char_repetitions: List[int], chars: List[str], ctc_token: int, word_delimiter_token: Optional[int] = None
    ):
    ) -> List[Dict[str, Union[str, int]]]:
        # 将字符重复次数数组转换为累积和数组，用于计算起始和结束索引
        end_indices = np.asarray(char_repetitions).cumsum()
        start_indices = np.concatenate(([0], end_indices[:-1]))

        # 根据字符、起始索引、结束索引创建偏移量字典列表
        offsets = [
            {"char": t, "start_offset": s, "end_offset": e} for t, s, e in zip(chars, start_indices, end_indices)
        ]

        # 过滤掉 CTC 标记的偏移量
        offsets = list(filter(lambda offsets: offsets["char"] != ctc_token, offsets))

        # 如果需要，过滤掉单词分隔符标记的偏移量
        if word_delimiter_token is not None:
            offsets = list(filter(lambda offsets: offsets["char"] != word_delimiter_token, offsets))

        # 返回偏移量列表
        return offsets

    def _decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        group_tokens: bool = True,
        filter_word_delimiter_token: bool = True,
        spaces_between_special_tokens: bool = False,
        output_char_offsets: bool = False,
    ) -> str:
        """
        特殊的 _decode 函数用于 Wav2Vec2PhonemeTokenizer，因为添加的特殊标记应该与基础词汇表中的标记完全相同，
        因此必须在整个标记列表上调用 `convert_tokens_to_string` 函数，而不是单独处理添加的标记
        """
        # 将 token_ids 转换为 tokens 列表，跳过特殊标记（如果设置）
        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

        result = []
        for token in filtered_tokens:
            # 如果设置了跳过特殊标记且 token 是特殊标记之一，则跳过该 token
            if skip_special_tokens and token in self.all_special_ids:
                continue
            result.append(token)

        # 将过滤后的 tokens 列表转换为字符串输出
        string_output = self.convert_tokens_to_string(
            result,
            group_tokens=group_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
            filter_word_delimiter_token=filter_word_delimiter_token,
            output_char_offsets=output_char_offsets,
        )

        text = string_output["text"]

        # 如果需要清除标记化空格，则调用清除空格的函数
        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )
        if clean_up_tokenization_spaces:
            text = self.clean_up_tokenization(text)

        # 如果需要输出字符偏移量，则返回带偏移量的特定类型的对象
        if output_char_offsets:
            return Wav2Vec2PhonemeCTCTokenizerOutput(text=text, char_offsets=string_output["char_offsets"])
        else:
            return text

    # 重写自 `tokenization_utils_base.py`，因为这里需要文档说明 `output_char_offsets`
    def decode(
        self,
        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        output_char_offsets: bool = False,
        **kwargs,
    ) -> str:
        """
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces.
            output_char_offsets (`bool`, *optional*, defaults to `False`):
                Whether or not to output character offsets. Character offsets can be used in combination with the
                sampling rate and model downsampling rate to compute the time-stamps of transcribed characters.

                <Tip>

                Please take a look at the Example of [`~models.wav2vec2.tokenization_wav2vec2.decode`] to better
                understand how to make use of `output_word_offsets`.
                [`~model.wav2vec2_phoneme.tokenization_wav2vec2_phoneme.batch_decode`] works the same way with
                phonemes.

                </Tip>

            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str` or [`~models.wav2vec2.tokenization_wav2vec2_phoneme.Wav2Vec2PhonemeCTCTokenizerOutput`]: The decoded
            sentence. Will be a [`~models.wav2vec2.tokenization_wav2vec2_phoneme.Wav2Vec2PhonemeCTCTokenizerOutput`]
            when `output_char_offsets == True`.
        """
        # Convert inputs to python lists
        token_ids = to_py_obj(token_ids)

        # Call the internal _decode method with specified parameters
        return self._decode(
            token_ids=token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            output_char_offsets=output_char_offsets,
            **kwargs,
        )

    # overwritten from `tokenization_utils_base.py` because tokenizer can output
    # `ModelOutput` which should not be a list for batched output and because
    # we need docs for `output_char_offsets` here
    def batch_decode(
        self,
        sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        output_char_offsets: bool = False,
        **kwargs,
    ):
        """
        Batch decodes sequences of token ids into strings or `ModelOutput` objects.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List or batch of tokenized input sequences.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces.
            output_char_offsets (`bool`, *optional*, defaults to `False`):
                Whether or not to output character offsets.

        Returns:
            `List[str]` or `List[~transformers.file_utils.ModelOutput]`: List of decoded sentences or model outputs.
        """
    ) -> List[str]:
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces.
            output_char_offsets (`bool`, *optional*, defaults to `False`):
                Whether or not to output character offsets. Character offsets can be used in combination with the
                sampling rate and model downsampling rate to compute the time-stamps of transcribed characters.

                <Tip>

                Please take a look at the Example of [`~models.wav2vec2.tokenization_wav2vec2.decode`] to better
                understand how to make use of `output_word_offsets`.
                [`~model.wav2vec2_phoneme.tokenization_wav2vec2_phoneme.batch_decode`] works analogous with phonemes
                and batched output.

                </Tip>

            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `List[str]` or [`~models.wav2vec2.tokenization_wav2vec2_phoneme.Wav2Vec2PhonemeCTCTokenizerOutput`]: The
            decoded sentence. Will be a
            [`~models.wav2vec2.tokenization_wav2vec2_phoneme.Wav2Vec2PhonemeCTCTokenizerOutput`] when
            `output_char_offsets == True`.
        """
        # Perform batch decoding using self.decode for each sequence in sequences
        batch_decoded = [
            self.decode(
                seq,
                skip_special_tokens=skip_special_tokens,
                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                output_char_offsets=output_char_offsets,
                **kwargs,
            )
            for seq in sequences
        ]
        # Check if output_char_offsets is True
        if output_char_offsets:
            # Transform list of dictionaries to a dictionary of lists
            return Wav2Vec2PhonemeCTCTokenizerOutput({k: [d[k] for d in batch_decoded] for k in batch_decoded[0]})

        # Return the batch_decoded list
        return batch_decoded

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # Check if save_directory exists; log an error and return if not
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        # Construct the full path for the vocabulary file
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Write the vocabulary (self.encoder) to the vocab_file in JSON format
        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        # Return the tuple containing the vocab_file path
        return (vocab_file,)

`.\models\wav2vec2_phoneme\init.py`

# 版权声明和许可证信息
# 版权声明，版权归 HuggingFace 团队所有，保留所有权利
# 根据 Apache 许可证版本 2.0 进行许可
# 除非符合许可证的要求，否则不得使用本文件
# 可以通过访问指定网址获得许可证的副本
#
# 如果适用法律要求或书面同意，软件将按"原样"分发
# 没有任何明示或暗示的保证或条件，包括但不限于
# 特定用途和适销性的保证。
# 有关详细信息，请参阅许可证内容。
from typing import TYPE_CHECKING

# 从 utils 模块中导入 _LazyModule 类
from ...utils import _LazyModule

# 定义需要延迟加载的模块结构
_import_structure = {"tokenization_wav2vec2_phoneme": ["Wav2Vec2PhonemeCTCTokenizer"]}

# 如果是类型检查阶段
if TYPE_CHECKING:
    # 从 tokenization_wav2vec2_phoneme 模块导入 Wav2Vec2PhonemeCTCTokenizer 类
    from .tokenization_wav2vec2_phoneme import Wav2Vec2PhonemeCTCTokenizer
# 如果不是类型检查阶段
else:
    # 导入 sys 模块
    import sys

    # 将当前模块指定为 LazyModule 类的实例
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\wav2vec2_with_lm\processing_wav2vec2_with_lm.py`

# coding=utf-8
# 定义了代码文件的编码格式为 UTF-8

# 版权声明，指出该代码由 HuggingFace Inc. 团队编写
# 根据 Apache 许可证 2.0 版本发布
# 您可以在符合许可证条件下使用此文件，详见许可证文档
"""
Speech processor class for Wav2Vec2
"""
# 导入必要的库和模块
import os  # 导入操作系统功能模块
import warnings  # 导入警告处理模块
from contextlib import contextmanager, nullcontext  # 导入上下文管理器和空上下文
from dataclasses import dataclass  # 导入用于定义数据类的装饰器
from multiprocessing import Pool, get_context, get_start_method  # 导入多进程相关模块
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union  # 导入类型提示相关模块

import numpy as np  # 导入 NumPy 库

# 导入相关的自定义模块和类
from ...processing_utils import ProcessorMixin
from ...utils import ModelOutput, logging, requires_backends  # 导入模型输出、日志和后端要求

# 获取日志记录器
logger = logging.get_logger(__name__)

# 如果是类型检查阶段
if TYPE_CHECKING:
    from pyctcdecode import BeamSearchDecoderCTC  # 导入 BeamSearchDecoderCTC 类

    from ...feature_extraction_utils import FeatureExtractionMixin  # 导入特征提取混合类
    from ...tokenization_utils import PreTrainedTokenizerBase  # 导入预训练分词器基类

# 定义一个列表字典类型的别名
ListOfDict = List[Dict[str, Union[int, str]]]


@dataclass
class Wav2Vec2DecoderWithLMOutput(ModelOutput):
    """
    Output type of [`Wav2Vec2DecoderWithLM`], with transcription.

    Args:
        text (list of `str` or `str`):
            Decoded logits in text from. Usually the speech transcription.
        logit_score (list of `float` or `float`):
            Total logit score of the beams associated with produced text.
        lm_score (list of `float`):
            Fused lm_score of the beams associated with produced text.
        word_offsets (list of `List[Dict[str, Union[int, str]]]` or `List[Dict[str, Union[int, str]]]`):
            Offsets of the decoded words. In combination with sampling rate and model downsampling rate word offsets
            can be used to compute time stamps for each word.
    """

    text: Union[List[List[str]], List[str], str]  # 文本结果，可以是列表的列表、列表或字符串形式
    logit_score: Union[List[List[float]], List[float], float] = None  # 对数得分，可以是列表的列表、列表或浮点数形式，默认为 None
    lm_score: Union[List[List[float]], List[float], float] = None  # 语言模型得分，可以是列表的列表、列表或浮点数形式，默认为 None
    word_offsets: Union[List[List[ListOfDict]], List[ListOfDict], ListOfDict] = None  # 单词偏移量，可以是列表的列表的列表、列表的列表或列表字典形式，默认为 None


class Wav2Vec2ProcessorWithLM(ProcessorMixin):
    r"""
    Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor, a Wav2Vec2 CTC tokenizer and a decoder
    with language model support into a single processor for language model boosted speech recognition decoding.
    """
    Args:
        feature_extractor ([`Wav2Vec2FeatureExtractor`]):
            An instance of [`Wav2Vec2FeatureExtractor`]. The feature extractor is a required input.
        tokenizer ([`Wav2Vec2CTCTokenizer`]):
            An instance of [`Wav2Vec2CTCTokenizer`]. The tokenizer is a required input.
        decoder (`pyctcdecode.BeamSearchDecoderCTC`):
            An instance of [`pyctcdecode.BeamSearchDecoderCTC`]. The decoder is a required input.
    """
    
    # 定义字符串常量，表示特征提取器和分词器的类名
    feature_extractor_class = "Wav2Vec2FeatureExtractor"
    tokenizer_class = "Wav2Vec2CTCTokenizer"

    def __init__(
        self,
        feature_extractor: "FeatureExtractionMixin",
        tokenizer: "PreTrainedTokenizerBase",
        decoder: "BeamSearchDecoderCTC",
    ):
        from pyctcdecode import BeamSearchDecoderCTC
        
        # 调用父类的初始化方法，传入特征提取器和分词器实例
        super().__init__(feature_extractor, tokenizer)
        
        # 检查解码器是否为正确的类型，若不是则抛出异常
        if not isinstance(decoder, BeamSearchDecoderCTC):
            raise ValueError(f"`decoder` has to be of type {BeamSearchDecoderCTC.__class__}, but is {type(decoder)}")

        # 确保解码器的字母表与分词器的词汇表内容匹配
        missing_decoder_tokens = self.get_missing_alphabet_tokens(decoder, tokenizer)
        if len(missing_decoder_tokens) > 0:
            raise ValueError(
                f"The tokens {missing_decoder_tokens} are defined in the tokenizer's "
                "vocabulary, but not in the decoder's alphabet. "
                f"Make sure to include {missing_decoder_tokens} in the decoder's alphabet."
            )

        # 将解码器、当前处理器和目标上下文管理器的初始状态设置为属性
        self.decoder = decoder
        self.current_processor = self.feature_extractor
        self._in_target_context_manager = False

    # 保存预训练模型至指定目录
    def save_pretrained(self, save_directory):
        super().save_pretrained(save_directory)  # 调用父类方法保存预训练模型
        self.decoder.save_to_dir(save_directory)  # 调用解码器的保存方法保存至指定目录

    # 设置语言模型属性的静态方法，用于设置解码器的模型属性
    @classmethod
    @staticmethod
    def _set_language_model_attribute(decoder: "BeamSearchDecoderCTC", attribute: str, value: float):
        setattr(decoder.model_container[decoder._model_key], attribute, value)

    # 返回解码器的语言模型属性作为属性方法
    @property
    def language_model(self):
        return self.decoder.model_container[self.decoder._model_key]

    @staticmethod
    def get_missing_alphabet_tokens(decoder, tokenizer):
        from pyctcdecode.alphabet import BLANK_TOKEN_PTN, UNK_TOKEN, UNK_TOKEN_PTN

        # 确保解码器的字母表中包含所有除特殊标记外的标记，检索缺失的字母表标记
        tokenizer_vocab_list = list(tokenizer.get_vocab().keys())

        # 替换特殊标记
        for i, token in enumerate(tokenizer_vocab_list):
            if BLANK_TOKEN_PTN.match(token):
                tokenizer_vocab_list[i] = ""
            if token == tokenizer.word_delimiter_token:
                tokenizer_vocab_list[i] = " "
            if UNK_TOKEN_PTN.match(token):
                tokenizer_vocab_list[i] = UNK_TOKEN

        # 检查哪些额外标记不是特殊的标记
        missing_tokens = set(tokenizer_vocab_list) - set(decoder._alphabet.labels)

        return missing_tokens

    def __call__(self, *args, **kwargs):
        """
        在普通模式下使用时，该方法将所有参数转发到Wav2Vec2FeatureExtractor的[`~Wav2Vec2FeatureExtractor.__call__`]，并返回其输出。
        如果在上下文[`~Wav2Vec2ProcessorWithLM.as_target_processor`]中使用，则将所有参数转发到Wav2Vec2CTCTokenizer的[`~Wav2Vec2CTCTokenizer.__call__`]。
        有关更多信息，请参阅上述两个方法的文档字符串。
        """
        # 为了向后兼容性
        if self._in_target_context_manager:
            return self.current_processor(*args, **kwargs)

        if "raw_speech" in kwargs:
            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
            audio = kwargs.pop("raw_speech")
        else:
            audio = kwargs.pop("audio", None)
        sampling_rate = kwargs.pop("sampling_rate", None)
        text = kwargs.pop("text", None)
        if len(args) > 0:
            audio = args[0]
            args = args[1:]

        if audio is None and text is None:
            raise ValueError("You need to specify either an `audio` or `text` input to process.")

        if audio is not None:
            inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
        if text is not None:
            encodings = self.tokenizer(text, **kwargs)

        if text is None:
            return inputs
        elif audio is None:
            return encodings
        else:
            inputs["labels"] = encodings["input_ids"]
            return inputs
    # 定义一个方法 `pad`，用于数据填充
    def pad(self, *args, **kwargs):
        """
        When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
        [`~Wav2Vec2FeatureExtractor.pad`] and returns its output. If used in the context
        [`~Wav2Vec2ProcessorWithLM.as_target_processor`] this method forwards all its arguments to
        Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.pad`]. Please refer to the docstring of the above two methods
        for more information.
        """
        # 如果在目标处理器的上下文中使用，则调用当前处理器的 `pad` 方法
        if self._in_target_context_manager:
            return self.current_processor.pad(*args, **kwargs)

        # 从 `kwargs` 中弹出 `input_features` 和 `labels` 参数
        input_features = kwargs.pop("input_features", None)
        labels = kwargs.pop("labels", None)

        # 如果有额外的位置参数，将第一个位置参数作为 `input_features`，其余作为 `args`
        if len(args) > 0:
            input_features = args[0]
            args = args[1:]

        # 如果 `input_features` 不为 `None`，使用特征提取器的 `pad` 方法进行填充
        if input_features is not None:
            input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
        # 如果 `labels` 不为 `None`，使用标记器的 `pad` 方法进行填充
        if labels is not None:
            labels = self.tokenizer.pad(labels, **kwargs)

        # 根据是否有 `labels` 和 `input_features` 返回不同的结果
        if labels is None:
            return input_features
        elif input_features is None:
            return labels
        else:
            # 如果两者都有，将 `labels` 的 `input_ids` 添加到 `input_features` 的 `"labels"` 键中
            input_features["labels"] = labels["input_ids"]
            return input_features

    # 定义一个方法 `batch_decode`，用于批量解码 logits
    def batch_decode(
        self,
        logits: np.ndarray,
        pool: Optional[Pool] = None,
        num_processes: Optional[int] = None,
        beam_width: Optional[int] = None,
        beam_prune_logp: Optional[float] = None,
        token_min_logp: Optional[float] = None,
        hotwords: Optional[Iterable[str]] = None,
        hotword_weight: Optional[float] = None,
        alpha: Optional[float] = None,
        beta: Optional[float] = None,
        unk_score_offset: Optional[float] = None,
        lm_score_boundary: Optional[bool] = None,
        output_word_offsets: bool = False,
        n_best: int = 1,
    ):
        # 方法用于批量解码 logits 并返回结果
        pass

    # 定义一个方法 `decode`，用于解码 logits
    def decode(
        self,
        logits: np.ndarray,
        beam_width: Optional[int] = None,
        beam_prune_logp: Optional[float] = None,
        token_min_logp: Optional[float] = None,
        hotwords: Optional[Iterable[str]] = None,
        hotword_weight: Optional[float] = None,
        alpha: Optional[float] = None,
        beta: Optional[float] = None,
        unk_score_offset: Optional[float] = None,
        lm_score_boundary: Optional[bool] = None,
        output_word_offsets: bool = False,
        n_best: int = 1,
    ):
        # 方法用于解码 logits 并返回结果
        pass

    @contextmanager
    # 定义一个方法 `as_target_processor`，用于临时设置处理目标的处理器。在微调 Wav2Vec2 模型时，用于对标签进行编码。
    def as_target_processor(self):
        """
        Temporarily sets the processor for processing the target. Useful for encoding the labels when fine-tuning
        Wav2Vec2.
        """
        # 发出警告信息，提醒用户 `as_target_processor` 方法将在 Transformers v5 中移除，建议使用 `__call__` 方法的 `text` 参数处理标签。
        warnings.warn(
            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
            "your audio inputs, or in a separate call."
        )
        # 设置目标处理上下文管理器为真
        self._in_target_context_manager = True
        # 将当前处理器设置为分词器 tokenizer
        self.current_processor = self.tokenizer
        # 返回一个生成器，用于临时设置目标处理器
        yield
        # 在生成器中，将当前处理器设置为特征提取器 feature_extractor
        self.current_processor = self.feature_extractor
        # 设置目标处理上下文管理器为假，表示处理结束
        self._in_target_context_manager = False

`.\models\wav2vec2_with_lm\init.py`

# 版权声明和版权许可信息
# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 引入类型检查模块的导入声明
from typing import TYPE_CHECKING

# 引入延迟加载模块工具
from ...utils import _LazyModule

# 定义模块的导入结构
_import_structure = {"processing_wav2vec2_with_lm": ["Wav2Vec2ProcessorWithLM"]}

# 如果是类型检查模式
if TYPE_CHECKING:
    # 从具体的子模块中导入 Wav2Vec2ProcessorWithLM 类型
    from .processing_wav2vec2_with_lm import Wav2Vec2ProcessorWithLM
# 如果不是类型检查模式
else:
    # 导入 sys 模块
    import sys

    # 将当前模块替换为一个延迟加载模块，使用 LazyModule 类型
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\wavlm\configuration_wavlm.py`

# coding=utf-8
# Copyright 2021 The Fairseq Authors, Microsoft Research, and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
WavLM model configuration

This module contains the configuration class `WavLMConfig` which defines the model architecture
and inherits from `PretrainedConfig`.
"""

import functools
import operator

# Import logger from utils for logging purposes
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# Get logger instance for this module
logger = logging.get_logger(__name__)

# Mapping of pretrained model names to their configuration file URLs
WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "microsoft/wavlm-base": "https://huggingface.co/microsoft/wavlm-base/resolve/main/config.json",
    # See all WavLM models at https://huggingface.co/models?filter=wavlm
}


class WavLMConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`WavLMModel`]. It is used to instantiate an WavLM
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the WavLM
    [microsoft/wavlm-base](https://huggingface.co/microsoft/wavlm-base) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Example:

    ```

    ```

    Example:

    ```
    >>> from transformers import WavLMConfig, WavLMModel

    >>> # Initializing a WavLM facebook/wavlm-base-960h style configuration
    >>> configuration = WavLMConfig()

    >>> # Initializing a model (with random weights) from the facebook/wavlm-base-960h style configuration
    >>> model = WavLMModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    
    # Specify the model type as "wavlm"
    model_type = "wavlm"
    # 初始化函数，用于创建一个新的对象实例，设置模型的各种参数
    def __init__(
        self,
        vocab_size=32,  # 词汇表大小，默认为32
        hidden_size=768,  # 隐藏层大小，默认为768
        num_hidden_layers=12,  # Transformer模型中的隐藏层数，默认为12
        num_attention_heads=12,  # 注意力头的数量，默认为12
        intermediate_size=3072,  # Transformer中间层的大小，默认为3072
        hidden_act="gelu",  # 隐藏层激活函数，默认为GELU
        hidden_dropout=0.1,  # 隐藏层的Dropout率，默认为0.1
        activation_dropout=0.1,  # 激活函数的Dropout率，默认为0.1
        attention_dropout=0.1,  # 注意力机制的Dropout率，默认为0.1
        feat_proj_dropout=0.0,  # 特征投影层的Dropout率，默认为0.0
        final_dropout=0.1,  # 最终层的Dropout率，默认为0.1
        layerdrop=0.1,  # LayerDrop的概率，默认为0.1
        initializer_range=0.02,  # 参数初始化范围，默认为0.02
        layer_norm_eps=1e-5,  # LayerNorm的epsilon值，默认为1e-5
        feat_extract_norm="group",  # 特征提取层的归一化方式，默认为"group"
        feat_extract_activation="gelu",  # 特征提取层的激活函数，默认为GELU
        conv_dim=(512, 512, 512, 512, 512, 512, 512),  # 卷积层的通道数，默认为(512, 512, 512, 512, 512, 512, 512)
        conv_stride=(5, 2, 2, 2, 2, 2, 2),  # 卷积层的步幅，默认为(5, 2, 2, 2, 2, 2, 2)
        conv_kernel=(10, 3, 3, 3, 3, 2, 2),  # 卷积层的核大小，默认为(10, 3, 3, 3, 3, 2, 2)
        conv_bias=False,  # 卷积层是否使用偏置，默认为False
        num_conv_pos_embeddings=128,  # 卷积位置嵌入的数量，默认为128
        num_conv_pos_embedding_groups=16,  # 卷积位置嵌入的组数，默认为16
        num_buckets=320,  # 桶的数量，默认为320
        max_bucket_distance=800,  # 桶之间的最大距离，默认为800
        do_stable_layer_norm=False,  # 是否使用稳定的LayerNorm，默认为False
        apply_spec_augment=True,  # 是否应用音频增强，默认为True
        mask_time_prob=0.05,  # 时间掩码的概率，默认为0.05
        mask_time_length=10,  # 时间掩码的长度，默认为10
        mask_time_min_masks=2,  # 时间掩码的最小数量，默认为2
        mask_feature_prob=0.0,  # 特征掩码的概率，默认为0.0
        mask_feature_length=10,  # 特征掩码的长度，默认为10
        num_codevectors_per_group=320,  # 每组码向量的数量，默认为320
        num_codevector_groups=2,  # 码向量组的数量，默认为2
        contrastive_logits_temperature=0.1,  # 对比损失的温度参数，默认为0.1
        num_negatives=100,  # 负样本的数量，默认为100
        codevector_dim=256,  # 码向量的维度，默认为256
        proj_codevector_dim=256,  # 投影码向量的维度，默认为256
        diversity_loss_weight=0.1,  # 多样性损失的权重，默认为0.1
        ctc_loss_reduction="mean",  # CTC损失的减少方式，默认为"mean"
        ctc_zero_infinity=False,  # CTC损失中是否使用零无穷，默认为False
        use_weighted_layer_sum=False,  # 是否使用加权层求和，默认为False
        classifier_proj_size=256,  # 分类器投影层的大小，默认为256
        tdnn_dim=(512, 512, 512, 512, 1500),  # TDNN层的通道数，默认为(512, 512, 512, 512, 1500)
        tdnn_kernel=(5, 3, 3, 1, 1),  # TDNN层的核大小，默认为(5, 3, 3, 1, 1)
        tdnn_dilation=(1, 2, 3, 1, 1),  # TDNN层的扩张率，默认为(1, 2, 3, 1, 1)
        xvector_output_dim=512,  # x-vector的输出维度，默认为512
        num_ctc_classes=80,  # CTC输出的类别数，默认为80
        pad_token_id=0,  # 填充标记的ID，默认为0
        bos_token_id=1,  # 开始标记的ID，默认为1
        eos_token_id=2,  # 结束标记的ID，默认为2
        add_adapter=False,  # 是否添加适配器，默认为False
        adapter_kernel_size=3,  # 适配器的核大小，默认为3
        adapter_stride=2,  # 适配器的步幅，默认为2
        num_adapter_layers=3,  # 适配器的层数，默认为3
        output_hidden_size=None,  # 输出层的隐藏大小，默认为None
        **kwargs,  # 其他关键字参数
    ):
        # 属性：输入到logits比例
        @property
        def inputs_to_logits_ratio(self):
            # 计算输入到logits的比例，即卷积步幅的乘积
            return functools.reduce(operator.mul, self.conv_stride, 1)

`.\models\wavlm\convert_wavlm_original_pytorch_checkpoint_to_pytorch.py`

# coding=utf-8
# 声明文件编码格式为UTF-8

# Copyright 2021 The HuggingFace Inc. team.
# 版权声明，指出代码版权属于HuggingFace团队，日期为2021年

# Licensed under the Apache License, Version 2.0 (the "License");
# 根据Apache许可证2.0版（"许可证"）授权使用本文件

# you may not use this file except in compliance with the License.
# 除非遵守许可证，否则不得使用此文件

# You may obtain a copy of the License at
# 您可以在以下网址获取许可证的副本

# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0的许可证文本

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# 除非适用法律要求或书面同意，否则根据许可证分发的软件是按"原样"分发的，
# 没有任何明示或暗示的担保或条件

# See the License for the specific language governing permissions and
# limitations under the License.
# 查看许可证以了解特定语言的权限和限制

"""Convert WavLM checkpoint."""
# 脚本用于转换WavLM检查点

import argparse
# 导入命令行参数解析模块

import torch
# 导入PyTorch库

# Step 1. clone https://github.com/microsoft/unilm
# 步骤1：克隆https://github.com/microsoft/unilm

# Step 2. git checkout to https://github.com/microsoft/unilm/commit/b94ec76c36f02fb2b0bf0dcb0b8554a2185173cd
# 步骤2：切换到https://github.com/microsoft/unilm/commit/b94ec76c36f02fb2b0bf0dcb0b8554a2185173cd

# Step 3. cd unilm
# 步骤3：进入unilm目录

# Step 4. ln -s $(realpath wavlm/modules.py) ./  # create simlink
# 步骤4：创建符号链接指向realpath wavlm/modules.py

# import classes
# 导入自定义类

from unilm.wavlm.WavLM import WavLM as WavLMOrig
# 从unilm.wavlm.WavLM模块中导入WavLM类，并重命名为WavLMOrig

from unilm.wavlm.WavLM import WavLMConfig as WavLMConfigOrig
# 从unilm.wavlm.WavLM模块中导入WavLMConfig类，并重命名为WavLMConfigOrig

from transformers import WavLMConfig, WavLMModel, logging
# 从transformers库中导入WavLMConfig、WavLMModel类和logging模块

logging.set_verbosity_info()
# 设置日志记录级别为信息级别

logger = logging.get_logger(__name__)
# 获取当前模块的日志记录器对象

MAPPING = {
    "post_extract_proj": "feature_projection.projection",
    "encoder.pos_conv.0": "encoder.pos_conv_embed.conv",
    "self_attn.k_proj": "encoder.layers.*.attention.k_proj",
    "self_attn.v_proj": "encoder.layers.*.attention.v_proj",
    "self_attn.q_proj": "encoder.layers.*.attention.q_proj",
    "self_attn.out_proj": "encoder.layers.*.attention.out_proj",
    "self_attn.grep_linear": "encoder.layers.*.attention.gru_rel_pos_linear",
    "self_attn.relative_attention_bias": "encoder.layers.*.attention.rel_attn_embed",
    "self_attn.grep_a": "encoder.layers.*.attention.gru_rel_pos_const",
    "self_attn_layer_norm": "encoder.layers.*.layer_norm",
    "fc1": "encoder.layers.*.feed_forward.intermediate_dense",
    "fc2": "encoder.layers.*.feed_forward.output_dense",
    "final_layer_norm": "encoder.layers.*.final_layer_norm",
    "encoder.layer_norm": "encoder.layer_norm",
    "w2v_model.layer_norm": "feature_projection.layer_norm",
    "quantizer.weight_proj": "quantizer.weight_proj",
    "quantizer.vars": "quantizer.codevectors",
    "project_q": "project_q",
    "final_proj": "project_hid",
    "w2v_encoder.proj": "ctc_proj",
    "mask_emb": "masked_spec_embed",
}
# 映射表，将原始模型的参数路径映射到转换后模型的参数路径

TOP_LEVEL_KEYS = [
    "ctc_proj",
    "quantizer.weight_proj",
    "quantizer.codevectors",
    "project_q",
    "project_hid",
]
# 最顶层的关键字列表，这些关键字需要特殊处理

def set_recursively(hf_pointer, key, value, full_name, weight_type):
    # 递归设置函数，用于设置模型参数

    for attribute in key.split("."):
        # 遍历key中的属性列表
        hf_pointer = getattr(hf_pointer, attribute)
        # 获取hf_pointer对象中的对应属性

    if weight_type is not None:
        hf_shape = getattr(hf_pointer, weight_type).shape
        # 如果存在权重类型，则获取该类型的形状
    else:
        hf_shape = hf_pointer.shape
        # 否则获取hf_pointer的形状

    assert hf_shape == value.shape, (
        f"Shape of hf {key + '.' + weight_type if weight_type is not None else ''} is {hf_shape}, but should be"
        f" {value.shape} for {full_name}"
    )
    # 断言，确保hf_pointer的形状与value的形状相匹配，否则输出错误信息
    if weight_type == "weight":
        # 如果权重类型是 "weight"，则将数值赋给模型的权重数据
        hf_pointer.weight.data = value
    elif weight_type == "weight_g":
        # 如果权重类型是 "weight_g"，则将数值赋给模型的梯度权重数据
        hf_pointer.weight_g.data = value
    elif weight_type == "weight_v":
        # 如果权重类型是 "weight_v"，则将数值赋给模型的版本权重数据
        hf_pointer.weight_v.data = value
    elif weight_type == "bias":
        # 如果权重类型是 "bias"，则将数值赋给模型的偏置数据
        hf_pointer.bias.data = value
    else:
        # 如果没有特定的权重类型匹配，则直接将数值赋给模型指针的数据
        hf_pointer.data = value

    # 记录信息日志，指示哪些参数被初始化，并显示完整的名称路径
    logger.info(f"{key + '.' + weight_type if weight_type is not None else ''} was initialized from {full_name}.")
# 递归加载权重函数，从Fairseq模型加载到Hugging Face模型
def recursively_load_weights(fairseq_model, hf_model):
    # 未使用的权重列表
    unused_weights = []
    # 获取Fairseq模型的状态字典
    fairseq_dict = fairseq_model.state_dict()

    # 获取Hugging Face模型的特征提取器
    feature_extractor = hf_model.feature_extractor

    # 遍历Fairseq模型状态字典中的每个键值对
    for name, value in fairseq_dict.items():
        # 标记此权重是否被使用过，默认为未使用
        is_used = False

        # 如果名称中包含"conv_layers"
        if "conv_layers" in name:
            # 调用加载卷积层的函数
            load_conv_layer(
                name,
                value,
                feature_extractor,
                unused_weights,
                hf_model.config.feat_extract_norm == "group",
            )
            # 标记此权重已被使用
            is_used = True
        else:
            # 遍历映射表中的每个键值对
            for key, mapped_key in MAPPING.items():
                # 如果映射表中的键在名称中或者以"w2v_model."结尾的键在名称的首部
                if key in name or key.split("w2v_model.")[-1] == name.split(".")[0]:
                    # 标记此权重已被使用
                    is_used = True
                    # 如果映射键包含"*"，则替换为层索引
                    if "*" in mapped_key:
                        layer_index = name.split(key)[0].split(".")[-2]
                        mapped_key = mapped_key.replace("*", layer_index)
                    # 根据名称确定权重类型
                    if "weight_g" in name:
                        weight_type = "weight_g"
                    elif "weight_v" in name:
                        weight_type = "weight_v"
                    elif "bias" in name and "relative_attention_bias" not in name:
                        weight_type = "bias"
                    elif "weight" in name:
                        weight_type = "weight"
                    else:
                        weight_type = None

                    # 递归设置Hugging Face模型中的权重
                    set_recursively(hf_model, mapped_key, value, name, weight_type)
                continue
        # 如果未使用此权重，将其名称添加到未使用的权重列表中
        if not is_used:
            unused_weights.append(name)

    # 输出警告，显示未使用的权重列表
    logger.warning(f"Unused weights: {unused_weights}")


# 加载卷积层权重的函数
def load_conv_layer(full_name, value, feature_extractor, unused_weights, use_group_norm):
    # 获取卷积层的名称
    name = full_name.split("conv_layers.")[-1]
    # 分割名称为列表
    items = name.split(".")
    # 提取层索引和类型索引
    layer_id = int(items[0])
    type_id = int(items[1])

    # 如果类型索引为0
    if type_id == 0:
        # 如果名称中包含"bias"
        if "bias" in name:
            # 断言当前值的形状与特征提取器中对应卷积层的偏置数据形状相同
            assert value.shape == feature_extractor.conv_layers[layer_id].conv.bias.data.shape, (
                f"{full_name} has size {value.shape}, but"
                f" {feature_extractor.conv_layers[layer_id].conv.bias.data.shape} was found."
            )
            # 将值赋给特征提取器中对应卷积层的偏置数据
            feature_extractor.conv_layers[layer_id].conv.bias.data = value
            # 记录日志，显示卷积层的初始化
            logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
        # 如果名称中包含"weight"
        elif "weight" in name:
            # 断言当前值的形状与特征提取器中对应卷积层的权重数据形状相同
            assert value.shape == feature_extractor.conv_layers[layer_id].conv.weight.data.shape, (
                f"{full_name} has size {value.shape}, but"
                f" {feature_extractor.conv_layers[layer_id].conv.weight.data.shape} was found."
            )
            # 将值赋给特征提取器中对应卷积层的权重数据
            feature_extractor.conv_layers[layer_id].conv.weight.data = value
            # 记录日志，显示卷积层的初始化
            logger.info(f"Feat extract conv layer {layer_id} was initialized from {full_name}.")
    # 如果 type_id 为 2 并且不使用组归一化，或者 type_id 为 2、layer_id 为 0 并且使用组归一化，则执行以下操作
    elif (type_id == 2 and not use_group_norm) or (type_id == 2 and layer_id == 0 and use_group_norm):
        # 如果名称中包含 "bias"
        if "bias" in name:
            # 断言值的形状与特征提取器中对应卷积层的层归一化偏置数据的形状相同
            assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.bias.data.shape, (
                f"{full_name} has size {value.shape}, but {feature_extractor[layer_id].layer_norm.bias.data.shape} was"
                " found."
            )
            # 将值赋给特征提取器中对应卷积层的层归一化偏置数据
            feature_extractor.conv_layers[layer_id].layer_norm.bias.data = value
            # 记录日志，指示层归一化权重已从指定名称的初始化值初始化
            logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
        # 如果名称中包含 "weight"
        elif "weight" in name:
            # 断言值的形状与特征提取器中对应卷积层的层归一化权重数据的形状相同
            assert value.shape == feature_extractor.conv_layers[layer_id].layer_norm.weight.data.shape, (
                f"{full_name} has size {value.shape}, but"
                f" {feature_extractor[layer_id].layer_norm.weight.data.shape} was found."
            )
            # 将值赋给特征提取器中对应卷积层的层归一化权重数据
            feature_extractor.conv_layers[layer_id].layer_norm.weight.data = value
            # 记录日志，指示层归一化权重已从指定名称的初始化值初始化
            logger.info(f"Feat extract layer norm weight of layer {layer_id} was initialized from {full_name}.")
    # 如果不满足上述条件，则将未使用的权重名称添加到未使用的权重列表中
    else:
        unused_weights.append(full_name)
@torch.no_grad()
def convert_wavlm_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None):
    # 使用torch.no_grad()装饰器，禁用梯度计算，以节省内存和加快推理速度
    # 载入预训练的检查点文件
    checkpoint = torch.load(checkpoint_path)
    # 使用检查点中的配置信息创建WavLMConfigOrig对象
    cfg = WavLMConfigOrig(checkpoint["cfg"])
    # 使用WavLMOrig类和检查点中的模型状态字典创建模型
    model = WavLMOrig(cfg)
    model.load_state_dict(checkpoint["model"])
    # 将模型设置为评估模式，通常用于推理阶段
    model.eval()

    # 如果提供了配置文件路径，则使用预训练模型的配置创建WavLMConfig对象
    if config_path is not None:
        config = WavLMConfig.from_pretrained(config_path)
    else:
        # 否则创建一个空的配置对象
        config = WavLMConfig()

    # 创建一个新的WavLMModel对象
    hf_wavlm = WavLMModel(config)

    # 递归地加载模型的权重到hf_wavlm中
    recursively_load_weights(model, hf_wavlm)

    # 将转换后的PyTorch模型保存到指定的文件夹路径中
    hf_wavlm.save_pretrained(pytorch_dump_folder_path)


if __name__ == "__main__":
    # 创建参数解析器
    parser = argparse.ArgumentParser()
    # 添加命令行参数，用于指定输出的PyTorch模型路径
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    # 添加命令行参数，用于指定fairseq检查点文件的路径
    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
    # 添加命令行参数，用于指定要转换模型的hf配置文件的路径
    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
    # 解析命令行参数
    args = parser.parse_args()
    # 调用convert_wavlm_checkpoint函数，开始模型转换过程
    convert_wavlm_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path)

`.\models\wavlm\convert_wavlm_original_s3prl_checkpoint_to_pytorch.py`

# 导入需要的库和模块
import argparse  # 导入参数解析模块

import torch  # 导入 PyTorch 库

# 从 transformers 库中导入需要的类和函数
from transformers import (
    Wav2Vec2FeatureExtractor,  # 导入音频特征提取器类
    WavLMConfig,  # 导入 WavLM 模型的配置类
    WavLMForAudioFrameClassification,  # 导入用于音频帧分类的 WavLM 模型类
    WavLMForSequenceClassification,  # 导入用于序列分类的 WavLM 模型类
    WavLMForXVector,  # 导入用于 XVector 的 WavLM 模型类
    logging,  # 导入日志记录模块
)

# 设置日志记录的详细程度为信息级别
logging.set_verbosity_info()
# 获取当前文件的日志记录器对象
logger = logging.get_logger(__name__)


# 定义转换序列分类模型的函数
def convert_classification(base_model_name, hf_config, downstream_dict):
    # 从预训练模型名称和配置创建 WavLM 序列分类模型
    model = WavLMForSequenceClassification.from_pretrained(base_model_name, config=hf_config)
    # 设置模型投影器的权重为下游任务的投影器权重
    model.projector.weight.data = downstream_dict["projector.weight"]
    # 设置模型投影器的偏置为下游任务的投影器偏置
    model.projector.bias.data = downstream_dict["projector.bias"]
    # 设置模型分类器的权重为下游任务的线性层权重
    model.classifier.weight.data = downstream_dict["model.post_net.linear.weight"]
    # 设置模型分类器的偏置为下游任务的线性层偏置
    model.classifier.bias.data = downstream_dict["model.post_net.linear.bias"]
    return model  # 返回转换后的模型


# 定义转换音频帧分类模型的函数
def convert_diarization(base_model_name, hf_config, downstream_dict):
    # 从预训练模型名称和配置创建 WavLM 音频帧分类模型
    model = WavLMForAudioFrameClassification.from_pretrained(base_model_name, config=hf_config)
    # 设置模型分类器的权重为下游任务的线性层权重
    model.classifier.weight.data = downstream_dict["model.linear.weight"]
    # 设置模型分类器的偏置为下游任务的线性层偏置
    model.classifier.bias.data = downstream_dict["model.linear.bias"]
    return model  # 返回转换后的模型


# 定义转换 XVector 模型的函数
def convert_xvector(base_model_name, hf_config, downstream_dict):
    # 从预训练模型名称和配置创建 WavLM XVector 模型
    model = WavLMForXVector.from_pretrained(base_model_name, config=hf_config)
    # 设置模型投影器的权重为下游任务的投影器权重
    model.projector.weight.data = downstream_dict["connector.weight"]
    # 设置模型投影器的偏置为下游任务的投影器偏置
    model.projector.bias.data = downstream_dict["connector.bias"]
    # 遍历和设置每个 TDNN 层的卷积核权重和偏置
    for i, kernel_size in enumerate(hf_config.tdnn_kernel):
        model.tdnn[i].kernel.weight.data = downstream_dict[
            f"model.framelevel_feature_extractor.module.{i}.kernel.weight"
        ]
        model.tdnn[i].kernel.bias.data = downstream_dict[f"model.framelevel_feature_extractor.module.{i}.kernel.bias"]

    # 设置模型语音层特征提取器的第一个线性层权重和偏置
    model.feature_extractor.weight.data = downstream_dict["model.utterancelevel_feature_extractor.linear1.weight"]
    model.feature_extractor.bias.data = downstream_dict["model.utterancelevel_feature_extractor.linear1.bias"]
    # 设置模型语音层特征提取器的第二个线性层权重和偏置
    model.classifier.weight.data = downstream_dict["model.utterancelevel_feature_extractor.linear2.weight"]
    model.classifier.bias.data = downstream_dict["model.utterancelevel_feature_extractor.linear2.bias"]
    # 设置模型目标函数的权重
    model.objective.weight.data = downstream_dict["objective.W"]
    return model  # 返回转换后的模型


# 定义用于转换 S3PRL 检查点的函数，这个函数没有实现，只有文档字符串
@torch.no_grad()
def convert_s3prl_checkpoint(base_model_name, config_path, checkpoint_path, model_dump_path):
    """
    此函数用于从 S3PRL 模型检查点转换模型到其他格式，但是这里没有具体的实现代码。
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    # 加载模型检查点，指定在CPU上进行加载
    checkpoint = torch.load(checkpoint_path, map_location="cpu")

    # 从检查点中提取下游任务相关的信息
    downstream_dict = checkpoint["Downstream"]

    # 从预训练配置文件中加载 Wav2Vec2 模型的配置
    hf_config = WavLMConfig.from_pretrained(config_path)

    # 从预训练模型中加载 Wav2Vec2 特征提取器
    hf_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
        base_model_name, return_attention_mask=True, do_normalize=False
    )

    # 获取模型架构名称
    arch = hf_config.architectures[0]
    
    # 根据模型架构名称选择合适的转换函数转换模型
    if arch.endswith("ForSequenceClassification"):
        hf_model = convert_classification(base_model_name, hf_config, downstream_dict)
    elif arch.endswith("ForAudioFrameClassification"):
        hf_model = convert_diarization(base_model_name, hf_config, downstream_dict)
    elif arch.endswith("ForXVector"):
        hf_model = convert_xvector(base_model_name, hf_config, downstream_dict)
    else:
        # 抛出异常，表示不支持当前模型架构的权重转换
        raise NotImplementedError(f"S3PRL weights conversion is not supported for {arch}")

    # 如果配置要求使用加权层求和，加载模型的加权层参数
    if hf_config.use_weighted_layer_sum:
        hf_model.layer_weights.data = checkpoint["Featurizer"]["weights"]

    # 将特征提取器保存到指定路径
    hf_feature_extractor.save_pretrained(model_dump_path)
    
    # 将转换后的模型保存到指定路径
    hf_model.save_pretrained(model_dump_path)
# 如果当前脚本作为主程序运行（而不是被导入为模块），则执行以下代码块
if __name__ == "__main__":
    # 创建参数解析器对象
    parser = argparse.ArgumentParser()
    
    # 添加命令行参数，用于指定huggingface预训练基础模型的名称
    parser.add_argument(
        "--base_model_name", default=None, type=str, help="Name of the huggingface pretrained base model."
    )
    
    # 添加命令行参数，用于指定huggingface分类器配置文件的路径
    parser.add_argument("--config_path", default=None, type=str, help="Path to the huggingface classifier config.")
    
    # 添加命令行参数，用于指定s3prl检查点文件的路径
    parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to the s3prl checkpoint.")
    
    # 添加命令行参数，用于指定最终转换模型的保存路径
    parser.add_argument("--model_dump_path", default=None, type=str, help="Path to the final converted model.")
    
    # 解析命令行参数，并将它们保存在args对象中
    args = parser.parse_args()
    
    # 调用函数，将指定的参数传递给函数
    convert_s3prl_checkpoint(args.base_model_name, args.config_path, args.checkpoint_path, args.model_dump_path)

`.\models\wavlm\modeling_wavlm.py`

# coding=utf-8
# 版权声明
# 版权所有（c）2021年 Fairseq作者，Microsoft Research和HuggingFace Inc.团队。保留所有权利。
#
# 根据Apache许可证2.0版（“许可证”）许可;
# 除非符合许可证，否则不得使用此文件。
# 您可以在以下网址获取许可证副本:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则依据“许可证”分发的软件
# 以“原样”分发，无任何明示或暗示的担保或条件。
# 有关更多详细信息，请参阅许可证。

""" PyTorch WavLM模型。"""

import math
import warnings
from typing import Optional, Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...modeling_outputs import (
    BaseModelOutput,
    CausalLMOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
    Wav2Vec2BaseModelOutput,
    XVectorOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_peft_available,
    logging,
)
from .configuration_wavlm import WavLMConfig

logger = logging.get_logger(__name__)

# 隐藏状态的起始位置
_HIDDEN_STATES_START_POSITION = 2

# 用于文档的配置信息
_CONFIG_FOR_DOC = "WavLMConfig"

# 用于文档的检查点信息
_CHECKPOINT_FOR_DOC = "patrickvonplaten/wavlm-libri-clean-100h-base-plus"
_EXPECTED_OUTPUT_SHAPE = [1, 292, 768]

# CTC（Connectionist Temporal Classification）的预期输出文本和损失值
_CTC_EXPECTED_OUTPUT = "'mister quilter is the aposle of the middle classes and we are glad to welcome his gospel'"
_CTC_EXPECTED_LOSS = 12.51

# Frame类的检查点和预期输出
_FRAME_CLASS_CHECKPOINT = "microsoft/wavlm-base-plus-sd"
_FRAME_EXPECTED_OUTPUT = [0, 0]

# Speaker Verification（说话者验证）的检查点和预期输出
_XVECTOR_CHECKPOINT = "microsoft/wavlm-base-plus-sv"
_XVECTOR_EXPECTED_OUTPUT = 0.97

# WavLM预训练模型的存档列表
WAVLM_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/wavlm-base",
    "microsoft/wavlm-base-plus",
    "microsoft/wavlm-large",
    # 查看所有WavLM模型：https://huggingface.co/models?filter=wavlm
]

# 从transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices复制的函数
def _compute_mask_indices(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    attention_mask: Optional[torch.LongTensor] = None,
    min_masks: int = 0,
) -> np.ndarray:
    """
    计算给定形状的随机掩码跨度。用于实现《SpecAugment: 一种用于ASR的简单数据增强方法》。
    请注意，此方法未经优化以在TPU上运行，应作为训练期间的预处理步骤在CPU上运行。
    """
    pass  # 此处为函数体的占位符，未实际执行任何操作
    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    """
    # 解包 shape 参数
    batch_size, sequence_length = shape

    # 检查 mask_length 是否合法
    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    # 检查 mask_length 是否小于 sequence_length
    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
            f" and `sequence_length`: {sequence_length}`"
        )

    # epsilon 用于概率性取整
    epsilon = np.random.rand(1).item()

    def compute_num_masked_span(input_length):
        """Given input length, compute how many spans should be masked"""
        # 计算应该遮罩的 span 数量
        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
        # 确保遮罩的 span 数量不小于 min_masks
        num_masked_span = max(num_masked_span, min_masks)

        # 确保遮罩的 span 数量不超过 sequence_length
        if num_masked_span * mask_length > sequence_length:
            num_masked_span = sequence_length // mask_length

        # 确保遮罩的 span 数量不超过 input_length - (mask_length - 1)
        if input_length - (mask_length - 1) < num_masked_span:
            num_masked_span = max(input_length - (mask_length - 1), 0)

        return num_masked_span

    # 计算每个 batch 中的实际长度列表
    input_lengths = (
        attention_mask.sum(-1).detach().tolist()
        if attention_mask is not None
        else [sequence_length for _ in range(batch_size)]
    )

    # 创建一个全零的布尔掩码数组
    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
    spec_aug_mask_idxs = []

    # 计算最大允许的遮罩 span 数量
    max_num_masked_span = compute_num_masked_span(sequence_length)

    # 如果最大允许的遮罩 span 数量为 0，则直接返回全零的掩码数组
    if max_num_masked_span == 0:
        return spec_aug_mask
    # 遍历输入长度列表中的每个长度
    for input_length in input_lengths:
        # 计算当前输入长度下要生成的被遮盖区间数量
        num_masked_span = compute_num_masked_span(input_length)

        # 随机选择要遮盖的起始索引
        spec_aug_mask_idx = np.random.choice(
            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
        )

        # 选择第一个被抽样的索引作为填充向量的虚拟索引，确保所有批次具有相同的维度
        # 这是由于概率舍入而产生的
        if len(spec_aug_mask_idx) == 0:
            # 如果没有选择到任何索引，说明输入长度严格小于序列长度，此时最后一个标记应该是填充标记
            # 我们可以使用它作为虚拟遮盖标识符的索引
            dummy_mask_idx = sequence_length - 1
        else:
            dummy_mask_idx = spec_aug_mask_idx[0]

        # 将虚拟索引添加到遮盖索引数组中，以确保所有批次的数组长度相同
        spec_aug_mask_idx = np.concatenate(
            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
        )
        spec_aug_mask_idxs.append(spec_aug_mask_idx)

    # 将遮盖索引列表转换为 NumPy 数组
    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)

    # 将遮盖索引扩展为遮盖区间
    spec_aug_mask_idxs = np.broadcast_to(
        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
    )
    # 将数组形状重新调整为批次大小乘以最大遮盖区间数乘以遮盖长度
    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)

    # 对起始索引添加偏移量，以便索引现在表示一个区间
    offsets = np.arange(mask_length)[None, None, :]
    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
        batch_size, max_num_masked_span * mask_length
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # 确保索引不会超过序列长度
    if spec_aug_mask_idxs.max() > sequence_length - 1:
        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1

    # 使用散点方法将索引应用到遮盖向量中
    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)

    # 返回生成的遮盖向量
    return spec_aug_mask
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer with Wav2Vec2->WavLM
class WavLMNoLayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        # 根据给定层编号确定输入和输出的卷积维度
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        # 创建一个一维卷积层，根据配置设定卷积核大小、步长和偏置
        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        # 使用预定义的激活函数对卷积层的输出进行激活
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        # 将输入的隐藏状态应用到卷积层上
        hidden_states = self.conv(hidden_states)
        # 应用激活函数到卷积层的输出
        hidden_states = self.activation(hidden_states)
        return hidden_states


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer with Wav2Vec2->WavLM
class WavLMLayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        # 根据给定层编号确定输入和输出的卷积维度
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        # 创建一个一维卷积层，根据配置设定卷积核大小、步长和偏置
        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        # 创建一个 LayerNorm 层，对卷积层的输出进行归一化处理
        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
        # 使用预定义的激活函数对卷积层的输出进行激活
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        # 将输入的隐藏状态应用到卷积层上
        hidden_states = self.conv(hidden_states)

        # 将卷积层输出的维度转置，以便对 LayerNorm 进行处理
        hidden_states = hidden_states.transpose(-2, -1)
        # 应用 LayerNorm 对卷积层输出进行归一化处理
        hidden_states = self.layer_norm(hidden_states)
        # 再次转置回原始维度，并将处理后的结果返回
        hidden_states = hidden_states.transpose(-2, -1)

        # 应用激活函数到处理后的卷积层输出
        hidden_states = self.activation(hidden_states)
        return hidden_states


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer with Wav2Vec2->WavLM
class WavLMGroupNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        # 根据给定层编号确定输入和输出的卷积维度
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        # 创建一个一维卷积层，根据配置设定卷积核大小、步长和偏置
        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        # 使用预定义的激活函数对卷积层的输出进行激活
        self.activation = ACT2FN[config.feat_extract_activation]

        # 创建一个 GroupNorm 层，对卷积层的输出进行分组归一化处理
        self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)

    def forward(self, hidden_states):
        # 将输入的隐藏状态应用到卷积层上
        hidden_states = self.conv(hidden_states)
        # 应用 GroupNorm 对卷积层输出进行分组归一化处理
        hidden_states = self.layer_norm(hidden_states)
        # 应用激活函数到处理后的卷积层输出
        hidden_states = self.activation(hidden_states)
        return hidden_states
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding 复制而来，改名为 WavLMPositionalConvEmbedding
class WavLMPositionalConvEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 定义一个 1D 卷积层，用于位置编码
        self.conv = nn.Conv1d(
            config.hidden_size,
            config.hidden_size,
            kernel_size=config.num_conv_pos_embeddings,
            padding=config.num_conv_pos_embeddings // 2,
            groups=config.num_conv_pos_embedding_groups,
        )

        # 设置权重归一化函数
        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm

        # 如果使用了 deepspeed 的 zero3 加速，对卷积层进行特殊处理
        if is_deepspeed_zero3_enabled():
            import deepspeed

            # 在 zero3 加速模式下，使用 GatheredParameters 对象管理权重
            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
                self.conv = weight_norm(self.conv, name="weight", dim=2)
            # 注册外部参数以进行 zero3 加速管理
            deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
            deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
        else:
            # 普通情况下，对卷积层应用权重归一化
            self.conv = weight_norm(self.conv, name="weight", dim=2)

        # 创建一个用于同步填充的对象
        self.padding = WavLMSamePadLayer(config.num_conv_pos_embeddings)
        # 选择激活函数，根据配置中的 feat_extract_activation 选择对应的激活函数
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        # 将输入的 hidden_states 调整维度，转换为 Conv1d 的输入格式
        hidden_states = hidden_states.transpose(1, 2)

        # 应用卷积操作
        hidden_states = self.conv(hidden_states)
        # 对卷积结果进行同步填充
        hidden_states = self.padding(hidden_states)
        # 应用激活函数
        hidden_states = self.activation(hidden_states)

        # 调整输出维度，返回结果
        hidden_states = hidden_states.transpose(1, 2)
        return hidden_states


# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer 复制而来，改名为 WavLMSamePadLayer
class WavLMSamePadLayer(nn.Module):
    def __init__(self, num_conv_pos_embeddings):
        super().__init__()
        # 根据 num_conv_pos_embeddings 的奇偶性确定需要移除的填充数
        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0

    def forward(self, hidden_states):
        # 如果需要移除填充，则按照设定的数量截取隐藏状态
        if self.num_pad_remove > 0:
            hidden_states = hidden_states[:, :, : -self.num_pad_remove]
        return hidden_states


# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder 复制而来，改名为 WavLMFeatureEncoder
class WavLMFeatureEncoder(nn.Module):
    """Construct the features from raw audio waveform"""
    # 初始化方法，接受一个配置对象作为参数
    def __init__(self, config):
        # 调用父类（nn.Module）的初始化方法
        super().__init__()

        # 根据配置文件中的特征提取归一化方式选择不同的卷积层列表
        if config.feat_extract_norm == "group":
            # 如果归一化方式是"group"，则创建包含组归一化的第一个卷积层和其余的无归一化卷积层
            conv_layers = [WavLMGroupNormConvLayer(config, layer_id=0)] + [
                WavLMNoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
            ]
        elif config.feat_extract_norm == "layer":
            # 如果归一化方式是"layer"，则创建全部使用层归一化的卷积层列表
            conv_layers = [WavLMLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)]
        else:
            # 如果归一化方式既不是"group"也不是"layer"，则抛出值错误
            raise ValueError(
                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
            )

        # 将卷积层列表转换为 nn.ModuleList 类型，使其成为 nn.Module 的一部分
        self.conv_layers = nn.ModuleList(conv_layers)

        # 设置梯度检查点为关闭状态
        self.gradient_checkpointing = False

        # 设置需要梯度计算为 True
        self._requires_grad = True

    # 冻结模型参数的方法
    def _freeze_parameters(self):
        # 遍历所有模型参数，并设置其 requires_grad 属性为 False
        for param in self.parameters():
            param.requires_grad = False
        # 将模型的 _requires_grad 属性设置为 False，表示模型参数已冻结
        self._requires_grad = False

    # 前向传播方法，接受输入值 input_values 作为参数
    def forward(self, input_values):
        # 将输入值增加一个维度，用于后续卷积操作
        hidden_states = input_values[:, None]

        # 如果模型需要梯度计算并且处于训练模式，则设置 hidden_states 的 requires_grad 为 True
        if self._requires_grad and self.training:
            hidden_states.requires_grad = True

        # 遍历所有卷积层，并应用它们到 hidden_states 上
        for conv_layer in self.conv_layers:
            # 如果模型需要梯度计算并且开启了梯度检查点并且处于训练模式，则使用梯度检查点函数处理 hidden_states
            if self._requires_grad and self.gradient_checkpointing and self.training:
                hidden_states = self._gradient_checkpointing_func(
                    conv_layer.__call__,
                    hidden_states,
                )
            else:
                # 否则直接调用当前卷积层处理 hidden_states
                hidden_states = conv_layer(hidden_states)

        # 返回最终的 hidden_states，经过所有卷积层处理后的结果
        return hidden_states
class WavLMFeatureExtractor(WavLMFeatureEncoder):
    # 继承自WavLMFeatureEncoder的WavLMFeatureExtractor类的初始化方法
    def __init__(self, config):
        # 调用父类WavLMFeatureEncoder的初始化方法
        super().__init__(config)
        # 发出警告，提示该类已被弃用，并建议使用Transformers v5中的基类
        warnings.warn(
            f"The class `{self.__class__.__name__}` has been depreciated "
            "and will be removed in Transformers v5. "
            f"Use `{self.__class__.__bases__[0].__name__}` instead.",
            FutureWarning,
        )


# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection复制并修改为WavLM
class WavLMFeatureProjection(nn.Module):
    # WavLMFeatureProjection类，继承自nn.Module
    def __init__(self, config):
        # 初始化方法
        super().__init__()
        # 使用LayerNorm进行层归一化，eps参数为配置文件中的layer_norm_eps
        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
        # 使用Linear进行特征投影，将卷积维度投影到隐藏大小，config.hidden_size为配置文件中的隐藏大小
        self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
        # 使用Dropout进行特征投影的dropout，概率为config.feat_proj_dropout
        self.dropout = nn.Dropout(config.feat_proj_dropout)

    def forward(self, hidden_states):
        # 执行前向传播
        # 对隐藏状态进行LayerNorm归一化处理
        norm_hidden_states = self.layer_norm(hidden_states)
        # 对归一化后的隐藏状态进行投影
        hidden_states = self.projection(norm_hidden_states)
        # 对投影后的结果应用Dropout
        hidden_states = self.dropout(hidden_states)
        return hidden_states, norm_hidden_states


class WavLMAttention(nn.Module):
    """基于'Attention Is All You Need'论文的多头注意力机制"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        num_buckets: int = 320,
        max_distance: int = 800,
        has_relative_position_bias: bool = True,
    ):
        # 初始化方法
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5

        # 线性变换层，用于计算Q、K、V和输出的线性投影
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

        self.num_buckets = num_buckets
        self.max_distance = max_distance

        # GRU相对位置编码的常数项和线性变换
        self.gru_rel_pos_const = nn.Parameter(torch.ones(1, self.num_heads, 1, 1))
        self.gru_rel_pos_linear = nn.Linear(self.head_dim, 8)

        if has_relative_position_bias:
            # 如果启用相对位置偏置，则使用Embedding层
            self.rel_attn_embed = nn.Embedding(self.num_buckets, self.num_heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_bias: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        index=0,
        # 定义前向传播方法，接受隐藏状态、注意力掩码、位置偏置等参数
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Attention layer with relative attention"""
        # 获取输入张量的维度信息
        bsz, tgt_len, _ = hidden_states.size()

        # 如果位置偏置为None，则计算位置偏置
        if position_bias is None:
            # 计算位置偏置
            position_bias = self.compute_bias(tgt_len, tgt_len)
            # 扩展位置偏置以适应多头注意力的形状要求
            position_bias = (
                position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, tgt_len)
            )

        # 计算相对位置偏置:
        # 1) 重塑隐藏状态张量，以便将多头注意力的头部维度放在中间
        gated_hidden_states = hidden_states.view(hidden_states.shape[:-1] + (self.num_heads, -1))
        gated_hidden_states = gated_hidden_states.permute(0, 2, 1, 3)

        # 2) 投影隐藏状态以计算相对位置偏置
        relative_position_proj = self.gru_rel_pos_linear(gated_hidden_states)
        # 将投影后的张量重塑，并对最后一个维度求和
        relative_position_proj = relative_position_proj.view(gated_hidden_states.shape[:-1] + (2, 4)).sum(-1)

        # 3) 从投影后的隐藏状态计算位置偏置的门控值
        gate_a, gate_b = torch.sigmoid(relative_position_proj).chunk(2, dim=-1)
        gate_output = gate_a * (gate_b * self.gru_rel_pos_const - 1.0) + 2.0

        # 4) 将门控值应用于位置偏置，计算门控位置偏置
        gated_position_bias = gate_output.view(bsz * self.num_heads, -1, 1) * position_bias
        gated_position_bias = gated_position_bias.view((-1, tgt_len, tgt_len))

        # 调用多头自注意力函数进行注意力计算
        attn_output, attn_weights = self.torch_multi_head_self_attention(
            hidden_states, attention_mask, gated_position_bias, output_attentions
        )

        # 返回注意力计算结果、注意力权重和位置偏置
        return attn_output, attn_weights, position_bias

    def torch_multi_head_self_attention(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: Union[torch.LongTensor, torch.BoolTensor],
        gated_position_bias: torch.FloatTensor,
        output_attentions: bool,
    ) -> (torch.FloatTensor, torch.FloatTensor):
        """simple wrapper around torch's multi_head_attention_forward function"""
        # self-attention assumes q = k = v
        query = key = value = hidden_states.transpose(0, 1)
        # 根据注意力掩码创建键掩码，若没有注意力掩码则为None
        key_padding_mask = attention_mask.ne(1) if attention_mask is not None else None

        # disable bias and add_zero_attn
        bias_k = bias_v = None
        add_zero_attn = False

        # PyTorch 1.3.0 has F.multi_head_attention_forward defined
        # so no problem with backwards compatibility
        # 使用 F.multi_head_attention_forward 函数进行多头注意力计算
        attn_output, attn_weights = F.multi_head_attention_forward(
            query,
            key,
            value,
            self.embed_dim,
            self.num_heads,
            torch.empty([0]),
            # 将三个投影的偏置连接起来作为参数传入
            torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
            bias_k,
            bias_v,
            add_zero_attn,
            self.dropout,
            self.out_proj.weight,
            self.out_proj.bias,
            self.training,
            key_padding_mask,
            output_attentions,
            gated_position_bias,
            use_separate_proj_weight=True,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
        )

        # [Seq_Len, Batch Size, ...] -> [Batch Size, Seq_Len, ...]
        # 调整注意力输出的维度顺序
        attn_output = attn_output.transpose(0, 1)

        if attn_weights is not None:
            # IMPORTANT: Attention weights are averaged weights
            # here which should not be the case. This is an open issue
            # on PyTorch: https://github.com/pytorch/pytorch/issues/32590
            # 对注意力权重进行处理，这里的平均权重处理可能不是理想的情况
            attn_weights = attn_weights[:, None].broadcast_to(
                attn_weights.shape[:1] + (self.num_heads,) + attn_weights.shape[1:]
            )

        return attn_output, attn_weights

    def compute_bias(self, query_length: int, key_length: int) -> torch.FloatTensor:
        # 生成相对位置编码
        context_position = torch.arange(query_length, dtype=torch.long)[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long)[None, :]
        relative_position = memory_position - context_position
        # 使用 _relative_positions_bucket 方法将相对位置映射到桶中
        relative_position_bucket = self._relative_positions_bucket(relative_position)
        # 将映射后的相对位置桶转移到与相对位置嵌入张量相同的设备上
        relative_position_bucket = relative_position_bucket.to(self.rel_attn_embed.weight.device)
        # 获取相对位置嵌入的值并进行维度变换
        values = self.rel_attn_embed(relative_position_bucket)
        values = values.permute([2, 0, 1])
        return values
    # 定义一个方法，用于将相对位置转换成相对桶索引
    def _relative_positions_bucket(self, relative_positions: torch.FloatTensor) -> torch.FloatTensor:
        # 桶的数量，除以2后取整
        num_buckets = self.num_buckets // 2

        # 将相对位置是否大于0的结果转换成long类型，并乘以桶数量
        relative_buckets = (relative_positions > 0).to(torch.long) * num_buckets
        # 取相对位置的绝对值
        relative_positions = torch.abs(relative_positions)

        # 定义最大的精确桶数量
        max_exact = num_buckets // 2
        # 判断相对位置是否小于最大精确值
        is_small = relative_positions < max_exact

        # 如果相对位置较大，计算相对位置的大桶索引
        relative_positions_if_large = torch.log(relative_positions.float() / max_exact)
        relative_positions_if_large = relative_positions_if_large / math.log(self.max_distance / max_exact)
        relative_positions_if_large = relative_positions_if_large * (num_buckets - max_exact)
        relative_position_if_large = (max_exact + relative_positions_if_large).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        # 根据 is_small 条件选择相对位置或者大桶索引，加到 relative_buckets 中
        relative_buckets += torch.where(is_small, relative_positions, relative_position_if_large)
        # 返回相对桶索引
        return relative_buckets
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward复制而来，将Wav2Vec2替换为WavLM
class WavLMFeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.intermediate_dropout = nn.Dropout(config.activation_dropout)

        # 创建一个线性层，将输入大小为config.hidden_size映射到config.intermediate_size
        self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
        
        # 根据配置选择激活函数，如果配置中指定的是字符串，使用ACT2FN字典中对应的函数，否则直接使用配置中的函数
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

        # 创建一个线性层，将config.intermediate_size映射回config.hidden_size
        self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.output_dropout = nn.Dropout(config.hidden_dropout)

    def forward(self, hidden_states):
        # 进行中间线性层的映射和激活函数处理
        hidden_states = self.intermediate_dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.intermediate_dropout(hidden_states)

        # 进行最终线性层的映射和dropout处理
        hidden_states = self.output_dense(hidden_states)
        hidden_states = self.output_dropout(hidden_states)
        return hidden_states


class WavLMEncoderLayer(nn.Module):
    def __init__(self, config: WavLMConfig, has_relative_position_bias: bool = True):
        super().__init__()
        
        # 创建WavLMAttention层，初始化时设置了多种参数，包括注意力头数、dropout等
        self.attention = WavLMAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            num_buckets=config.num_buckets,
            max_distance=config.max_bucket_distance,
            has_relative_position_bias=has_relative_position_bias,
        )
        
        # 创建dropout层
        self.dropout = nn.Dropout(config.hidden_dropout)
        
        # 创建LayerNorm层，用于规范化隐藏状态
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        
        # 创建WavLMFeedForward层，用于处理隐藏状态
        self.feed_forward = WavLMFeedForward(config)
        
        # 创建最终的LayerNorm层，用于规范化输出的隐藏状态
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False, index=0):
        # 将注意力层之前的隐藏状态保存下来，用于后续的残差连接
        attn_residual = hidden_states
        
        # 使用注意力层处理隐藏状态，获取处理后的隐藏状态、注意力权重以及位置偏置
        hidden_states, attn_weights, position_bias = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            output_attentions=output_attentions,
            index=index,
        )
        
        # 对处理后的隐藏状态应用dropout
        hidden_states = self.dropout(hidden_states)
        
        # 添加残差连接
        hidden_states = attn_residual + hidden_states
        
        # 对添加了注意力之后的隐藏状态进行LayerNorm规范化
        hidden_states = self.layer_norm(hidden_states)

        # 使用前馈网络处理规范化后的隐藏状态
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        
        # 对前馈网络处理后的隐藏状态再次进行LayerNorm规范化
        hidden_states = self.final_layer_norm(hidden_states)

        # 准备输出结果，包括隐藏状态和位置偏置
        outputs = (hidden_states, position_bias)

        # 如果需要输出注意力权重，则在输出结果中添加注意力权重
        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class WavLMEncoderLayerStableLayerNorm(nn.Module):
    # 初始化函数，用于创建一个新的WavLM模型层
    def __init__(self, config: WavLMConfig, has_relative_position_bias: bool = True):
        # 调用父类初始化函数
        super().__init__()
        # 初始化注意力层，传入配置参数
        self.attention = WavLMAttention(
            embed_dim=config.hidden_size,                      # 隐藏层大小
            num_heads=config.num_attention_heads,              # 注意力头数
            dropout=config.attention_dropout,                  # 注意力层的dropout率
            num_buckets=config.num_buckets,                    # 桶的数量（用于相对位置编码）
            max_distance=config.max_bucket_distance,           # 最大桶距离（用于相对位置编码）
            has_relative_position_bias=has_relative_position_bias,  # 是否包含相对位置偏置
        )
        # 初始化dropout层
        self.dropout = nn.Dropout(config.hidden_dropout)
        # 初始化Layer Normalization层
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 初始化前馈神经网络层
        self.feed_forward = WavLMFeedForward(config)
        # 初始化最终的Layer Normalization层
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    # 前向传播函数，接受隐藏状态作为输入，执行模型的前向计算
    def forward(self, hidden_states, attention_mask=None, position_bias=None, output_attentions=False):
        # 保存注意力机制前的残差连接
        attn_residual = hidden_states
        # 应用Layer Normalization层
        hidden_states = self.layer_norm(hidden_states)
        # 调用注意力层的前向传播计算
        hidden_states, attn_weights, position_bias = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            output_attentions=output_attentions,
        )
        # 应用dropout层
        hidden_states = self.dropout(hidden_states)
        # 执行残差连接
        hidden_states = attn_residual + hidden_states
        # 应用最终的Layer Normalization层
        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))

        # 输出包括最终隐藏状态和位置偏置
        outputs = (hidden_states, position_bias)

        # 如果需要输出注意力权重，添加到输出中
        if output_attentions:
            outputs += (attn_weights,)

        # 返回所有输出
        return outputs
# 定义一个用于处理音频数据的编码器模型，继承自 nn.Module 类
class WavLMEncoder(nn.Module):
    # 初始化方法，接收一个配置参数 config
    def __init__(self, config):
        # 调用父类 nn.Module 的初始化方法
        super().__init__()
        # 将配置参数保存到实例变量中
        self.config = config
        # 初始化位置卷积嵌入层对象，用于处理位置信息的嵌入
        self.pos_conv_embed = WavLMPositionalConvEmbedding(config)
        # 初始化 LayerNorm 层，用于标准化隐藏状态向量
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        # 初始化 Dropout 层，用于在训练过程中进行随机失活
        self.dropout = nn.Dropout(config.hidden_dropout)
        # 使用 nn.ModuleList 初始化一个包含多个 WavLMEncoderLayer 的列表
        # 每个 WavLMEncoderLayer 对象都基于相同的 config 参数，并根据其在列表中的位置决定是否使用相对位置偏置
        self.layers = nn.ModuleList(
            [WavLMEncoderLayer(config, has_relative_position_bias=(i == 0)) for i in range(config.num_hidden_layers)]
        )
        # 初始化梯度检查点标记，默认为 False
        self.gradient_checkpointing = False

    # 前向传播方法，接收隐藏状态、注意力掩码等参数
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        ):
            all_hidden_states = () if output_hidden_states else None
            all_self_attentions = () if output_attentions else None
            
            # 如果存在 attention_mask，则将未填充的 token 对应的 hidden_states 置为 0
            if attention_mask is not None:
                hidden_states[~attention_mask] = 0.0
            
            # 计算位置嵌入并与 hidden_states 相加
            position_embeddings = self.pos_conv_embed(hidden_states)
            hidden_states = hidden_states + position_embeddings
            # Layer normalization
            hidden_states = self.layer_norm(hidden_states)
            # Dropout
            hidden_states = self.dropout(hidden_states)
            
            # 检查是否启用了 DeepSpeed Zero3
            deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
            position_bias = None
            
            # 遍历每个 Transformer 层
            for i, layer in enumerate(self.layers):
                if output_hidden_states:
                    all_hidden_states = all_hidden_states + (hidden_states,)
                
                # 添加 LayerDrop 功能，控制层的随机丢弃
                dropout_probability = torch.rand([])
                
                # 根据 LayerDrop 的概率决定是否跳过当前层
                skip_the_layer = self.training and i > 0 and (dropout_probability < self.config.layerdrop)
                if not skip_the_layer or deepspeed_zero3_is_enabled:
                    # 如果启用了梯度检查点且在训练阶段，则使用梯度检查点函数
                    if self.gradient_checkpointing and self.training:
                        layer_outputs = self._gradient_checkpointing_func(
                            layer.__call__,
                            hidden_states,
                            attention_mask,
                            position_bias,
                            output_attentions,
                        )
                    else:
                        # 否则直接调用 Transformer 层
                        layer_outputs = layer(
                            hidden_states,
                            attention_mask=attention_mask,
                            position_bias=position_bias,
                            output_attentions=output_attentions,
                            index=i,
                        )
    
                    # 更新 hidden_states 和 position_bias
                    hidden_states, position_bias = layer_outputs[:2]
    
                # 如果跳过了当前层，则设置 layer_outputs 为 None
                if skip_the_layer:
                    layer_outputs = (None, None)
    
                # 如果需要输出注意力矩阵，则将当前层的注意力矩阵添加到 all_self_attentions 中
                if output_attentions:
                    all_self_attentions = all_self_attentions + (layer_outputs[2],)
    
            # 如果需要输出隐藏状态，则将最终的 hidden_states 添加到 all_hidden_states 中
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
    
            # 根据 return_dict 的设置返回相应的结果
            if not return_dict:
                # 如果不需要返回字典形式的输出，则返回元组
                return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
            else:
                # 否则以 BaseModelOutput 形式返回结果
                return BaseModelOutput(
                    last_hidden_state=hidden_states,
                    hidden_states=all_hidden_states,
                    attentions=all_self_attentions,
                )
# 定义一个稳定的层归一化的编码器类，继承自 nn.Module
class WavLMEncoderStableLayerNorm(nn.Module):
    def __init__(self, config):
        super().__init__()  # 调用父类的初始化方法
        self.config = config  # 存储传入的配置信息

        # 初始化位置卷积嵌入层，使用给定的配置信息
        self.pos_conv_embed = WavLMPositionalConvEmbedding(config)

        # 初始化层归一化层，指定隐藏层大小和 epsilon 值
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # 初始化 dropout 层，设定丢弃概率
        self.dropout = nn.Dropout(config.hidden_dropout)

        # 使用列表推导式初始化编码器层列表，每层调用 WavLMEncoderLayerStableLayerNorm 类
        # 对于第一层（i == 0），设定相对位置偏置参数为 True
        self.layers = nn.ModuleList(
            [
                WavLMEncoderLayerStableLayerNorm(config, has_relative_position_bias=(i == 0))
                for i in range(config.num_hidden_layers)
            ]
        )

        # 初始化梯度检查点标志为 False
        self.gradient_checkpointing = False

    # 定义前向传播方法
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        # 参数列表包括隐藏状态、注意力掩码、是否输出注意力权重、是否输出隐藏状态、是否返回字典形式结果等
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if attention_mask is not None:
            # 确保填充的标记不参与注意力计算
            hidden_states[~attention_mask] = 0

        # 使用位置卷积嵌入层处理位置信息
        position_embeddings = self.pos_conv_embed(hidden_states)
        # 将位置嵌入的结果加到隐藏状态上
        hidden_states = hidden_states + position_embeddings
        # 对隐藏状态进行dropout
        hidden_states = self.dropout(hidden_states)

        # 检查是否启用了 DeepSpeed zero3
        deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
        position_bias = None

        # 迭代处理每个层
        for i, layer in enumerate(self.layers):
            if output_hidden_states:
                # 如果需要输出隐藏状态，将当前隐藏状态添加到所有隐藏状态元组中
                all_hidden_states = all_hidden_states + (hidden_states,)

            # 添加 LayerDrop（参见 https://arxiv.org/abs/1909.11556 进行描述）
            dropout_probability = torch.rand([])

            # 根据 LayerDrop 的概率决定是否跳过当前层
            skip_the_layer = self.training and i > 0 and (dropout_probability < self.config.layerdrop)
            if not skip_the_layer or deepspeed_zero3_is_enabled:
                # 在 DeepSpeed zero3 情况下，所有 GPU 必须同步运行
                # 如果启用了梯度检查点且处于训练阶段，使用梯度检查点函数处理当前层的调用
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        layer.__call__,
                        hidden_states,
                        attention_mask,
                        position_bias,
                        output_attentions,
                    )
                else:
                    # 否则直接调用当前层处理隐藏状态
                    layer_outputs = layer(
                        hidden_states,
                        attention_mask=attention_mask,
                        output_attentions=output_attentions,
                        position_bias=position_bias,
                    )
                # 更新隐藏状态和位置偏置
                hidden_states, position_bias = layer_outputs[:2]

            # 如果跳过当前层，设置层输出为 None
            if skip_the_layer:
                layer_outputs = (None, None)

            # 如果需要输出自注意力权重，将当前层的自注意力权重添加到所有自注意力元组中
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[2],)

        # 对最终的隐藏状态进行 LayerNorm 处理
        hidden_states = self.layer_norm(hidden_states)

        # 如果需要输出隐藏状态，将最终的隐藏状态添加到所有隐藏状态元组中
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        # 如果不返回字典形式的结果，则根据需求返回相应的元组
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        # 返回以 BaseModelOutput 形式封装的结果
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
        )
    """
    使用 Gumbel softmax 进行向量量化。参见[CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf)获取更多信息。
    """

    def __init__(self, config):
        super().__init__()
        self.num_groups = config.num_codevector_groups  # 设置编码向量组数
        self.num_vars = config.num_codevectors_per_group  # 每组编码向量的数量

        if config.codevector_dim % self.num_groups != 0:
            raise ValueError(
                f"`config.codevector_dim {config.codevector_dim} must be divisible"
                f" by `config.num_codevector_groups` {self.num_groups} "
                "for concatenation."
            )

        # 存储码本变量（码字）
        self.codevectors = nn.Parameter(
            torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)
        )
        self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars)  # 权重投影层

        # 可以在训练中进行衰减
        self.temperature = 2

    @staticmethod
    def _compute_perplexity(probs):
        """
        计算困惑度函数。
        Args:
            probs (torch.Tensor): 概率分布张量

        Returns:
            torch.Tensor: 计算得到的困惑度值
        """
        marginal_probs = probs.mean(dim=0)  # 计算边际概率
        perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum()  # 计算困惑度
        return perplexity
    def forward(self, hidden_states):
        # 获取输入张量的批大小、序列长度和隐藏单元大小
        batch_size, sequence_length, hidden_size = hidden_states.shape

        # 将隐藏状态投影到代码向量维度
        hidden_states = self.weight_proj(hidden_states)
        # 将张量形状重新视图为(batch_size * sequence_length * num_groups, -1)
        hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)

        if self.training:
            # 使用Gumbel Softmax采样代码向量的概率，以可区分的方式
            codevector_probs = nn.functional.gumbel_softmax(hidden_states.float(), tau=self.temperature, hard=True)
            codevector_probs = codevector_probs.type_as(hidden_states)

            # 计算困惑度
            codevector_soft_dist = torch.softmax(
                hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
            )
            perplexity = self._compute_perplexity(codevector_soft_dist)
        else:
            # 在非可区分的方式下取argmax
            # 计算硬代码向量分布（one hot）
            codevector_idx = hidden_states.argmax(dim=-1)
            codevector_probs = hidden_states.new_zeros(*hidden_states.shape).scatter_(
                -1, codevector_idx.view(-1, 1), 1.0
            )
            codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)

            # 计算困惑度
            perplexity = self._compute_perplexity(codevector_probs)

        # 将codevector_probs形状重新视图为(batch_size * sequence_length, -1)
        codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
        # 使用概率检索代码向量
        codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
        codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
        codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)

        # 返回最终的codevectors和困惑度
        return codevectors, perplexity
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Adapter with Wav2Vec2->WavLM
class WavLMAdapter(nn.Module):
    def __init__(self, config):
        super().__init__()

        # 如果输出的隐藏层大小与配置中的隐藏层大小不同，可能需要进行降维投影
        if config.output_hidden_size != config.hidden_size:
            # 创建一个线性投影层，将隐藏状态大小从隐藏层大小投影到输出隐藏层大小
            self.proj = nn.Linear(config.hidden_size, config.output_hidden_size)
            # 创建一个LayerNorm层，用于投影后的隐藏状态的归一化
            self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size)
        else:
            self.proj = self.proj_layer_norm = None

        # 创建一系列适配器层，并存储在模块列表中
        self.layers = nn.ModuleList(WavLMAdapterLayer(config) for _ in range(config.num_adapter_layers))
        # 设置层丢弃率
        self.layerdrop = config.layerdrop

    def forward(self, hidden_states):
        # 如果存在投影层和LayerNorm层，则对隐藏状态进行投影
        if self.proj is not None and self.proj_layer_norm is not None:
            hidden_states = self.proj(hidden_states)
            hidden_states = self.proj_layer_norm(hidden_states)

        # 转置隐藏状态的维度，将第1和第2维互换位置
        hidden_states = hidden_states.transpose(1, 2)

        # 对每个适配器层进行迭代计算
        for layer in self.layers:
            # 随机生成一个丢弃概率
            layerdrop_prob = np.random.random()
            # 如果处于评估模式或者随机生成的概率大于层丢弃率，则应用该适配器层
            if not self.training or (layerdrop_prob > self.layerdrop):
                hidden_states = layer(hidden_states)

        # 再次转置隐藏状态的维度，将第1和第2维互换位置
        hidden_states = hidden_states.transpose(1, 2)
        # 返回最终的隐藏状态
        return hidden_states


# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2AdapterLayer with Wav2Vec2->WavLM
class WavLMAdapterLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 创建一个一维卷积层，用于适配器
        self.conv = nn.Conv1d(
            config.output_hidden_size,        # 输入通道数为输出隐藏层大小
            2 * config.output_hidden_size,    # 输出通道数为2倍的输出隐藏层大小
            config.adapter_kernel_size,       # 卷积核大小由配置定义
            stride=config.adapter_stride,     # 卷积步长由配置定义
            padding=1,                        # 填充为1
        )

    def forward(self, hidden_states):
        # 将隐藏状态输入卷积层进行卷积操作
        hidden_states = self.conv(hidden_states)
        # 使用门控线性单元(Gated Linear Unit, GLU)激活函数进行非线性变换
        hidden_states = nn.functional.glu(hidden_states, dim=1)

        # 返回经过卷积和GLU激活函数处理后的隐藏状态
        return hidden_states


class WavLMPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    # 配置类为WavLMConfig
    config_class = WavLMConfig
    # 基础模型前缀为"wavlm"
    base_model_prefix = "wavlm"
    # 主输入名称为"input_values"
    main_input_name = "input_values"
    # 支持梯度检查点
    supports_gradient_checkpointing = True
    def _init_weights(self, module):
        """Initialize the weights"""
        # 如果模块是 WavLMGumbelVectorQuantizer 类型，使用特殊的初始化方法
        if isinstance(module, WavLMGumbelVectorQuantizer):
            # 初始化权重矩阵的权重数据为标准正态分布
            module.weight_proj.weight.data.normal_(mean=0.0, std=1)
            # 将偏置数据初始化为零
            module.weight_proj.bias.data.zero_()
            # 使用均匀分布初始化编码向量
            nn.init.uniform_(module.codevectors)
        # 如果模块是 WavLMPositionalConvEmbedding 类型，使用特定的正态分布初始化
        elif isinstance(module, WavLMPositionalConvEmbedding):
            # 使用正态分布初始化卷积核权重数据
            nn.init.normal_(
                module.conv.weight,
                mean=0,
                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
            )
            # 将卷积层的偏置初始化为常数0
            nn.init.constant_(module.conv.bias, 0)
        # 如果模块是 WavLMFeatureProjection 类型，使用均匀分布初始化投影权重和偏置
        elif isinstance(module, WavLMFeatureProjection):
            # 计算均匀分布的上下限
            k = math.sqrt(1 / module.projection.in_features)
            # 使用均匀分布初始化投影层的权重
            nn.init.uniform_(module.projection.weight, a=-k, b=k)
            # 使用均匀分布初始化投影层的偏置
            nn.init.uniform_(module.projection.bias, a=-k, b=k)
        # 如果模块是 nn.Linear 类型，使用正态分布初始化权重，同时将偏置初始化为零
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)

            if module.bias is not None:
                module.bias.data.zero_()
        # 如果模块是 nn.LayerNorm 或 nn.GroupNorm 类型，将偏置初始化为零，权重初始化为1
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        # 如果模块是 nn.Conv1d 类型，使用 Kaiming 正态分布初始化权重
        elif isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight)

            if module.bias is not None:
                # 计算均匀分布的上下限
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                # 使用均匀分布初始化卷积层的偏置
                nn.init.uniform_(module.bias, a=-k, b=k)

    def _get_feat_extract_output_lengths(
        self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
    ):
        """
        Computes the output length of the convolutional layers
        """
        # 如果未指定 add_adapter，则使用配置中的默认值
        add_adapter = self.config.add_adapter if add_adapter is None else add_adapter

        def _conv_out_length(input_length, kernel_size, stride):
            # 根据 PyTorch 文档计算一维卷积层的输出长度公式
            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

        # 根据配置中的卷积核大小和步长计算每个卷积层的输出长度
        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        # 如果需要添加适配器，根据配置中的适配器层数计算适配器的输出长度
        if add_adapter:
            for _ in range(self.config.num_adapter_layers):
                input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)

        return input_lengths

    def _get_feature_vector_attention_mask(
        self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
    ):
        """Compute attention mask for feature vectors"""
        # 此方法计算用于特征向量的注意力掩码，输入参数包括特征向量的长度和注意力掩码张量
        # 计算非填充部分的长度，相当于 attention_mask.sum(-1)，但不进行原地操作以便在推断模式下运行
        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]

        # 根据非填充长度获取特征提取器的输出长度，可以选择添加适配器
        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
        output_lengths = output_lengths.to(torch.long)

        # 获取批次大小
        batch_size = attention_mask.shape[0]

        # 创建一个全零的注意力掩码张量，形状为 (batch_size, feature_vector_length)，与输入的注意力掩码相同的数据类型和设备
        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )

        # 设置输出长度前的所有位置为 1，确保这些位置上的值被完全注意到
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1

        # 将注意力掩码进行翻转，累积求和，并再次翻转，最终转换为布尔类型
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()

        # 返回最终的注意力掩码张量
        return attention_mask
# WAVLM_START_DOCSTRING 变量，包含了关于 WavLM 模型的详细介绍和引用的论文信息
WAVLM_START_DOCSTRING = r"""
    WavLM was proposed in [WavLM: Unified Speech Representation Learning with Labeled and Unlabeled
    Data](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo
    Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian,
    Jian Wu, Michael Zeng, Xiangzhan Yu, Furu Wei.

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving etc.).

    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`WavLMConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# WAVLM_INPUTS_DOCSTRING 变量，此处还未添加具体的文档字符串内容
WAVLM_INPUTS_DOCSTRING = r"""
"""
    Args:
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            # 输入的原始语音波形的浮点值。可以通过加载 `.flac` 或 `.wav` 音频文件得到一个 `List[float]` 或 `numpy.ndarray` 类型的数组。
            # 使用 `AutoProcessor` 进行填充并转换为 `torch.FloatTensor` 类型的张量。详见 [`Wav2Vec2Processor.__call__`]。

        attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            # 遮罩，用于避免在填充标记索引上执行卷积和注意力操作。遮罩中的值选择在 `[0, 1]` 范围内：

            # - 1 表示**未遮罩**的标记，
            # - 0 表示**已遮罩**的标记。

            # [什么是注意力遮罩?](../glossary#attention-mask)

            # <Tip warning={true}>
            # 如果相应的处理器具有 `config.return_attention_mask == True`，则应传递 `attention_mask`。对于所有处理器的配置中，`config.return_attention_mask == False` 的模型，在进行批处理推断时应避免传递 `attention_mask` 以避免性能下降。对于这些模型，`input_values` 应仅填充为 0 并传递而不传递 `attention_mask`。请注意，这些模型根据 `input_values` 是否填充会得到略有不同的结果。
            # </Tip>

        output_attentions (`bool`, *optional*):
            # 是否返回所有注意力层的注意力张量。有关更多细节，请参阅返回的张量中的 `attentions`。

        output_hidden_states (`bool`, *optional*):
            # 是否返回所有层的隐藏状态。有关更多细节，请参阅返回的张量中的 `hidden_states`。

        return_dict (`bool`, *optional*):
            # 是否返回一个 [`~utils.ModelOutput`] 而不是一个普通的元组。
"""
@add_start_docstrings(
    "The bare WavLM Model transformer outputting raw hidden-states without any specific head on top.",
    WAVLM_START_DOCSTRING,
)
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model 复制而来，将 Wav2Vec2Model 改为 WavLMModel，wav2vec2 改为 wavlm，WAV_2_VEC_2 改为 WAVLM，WavLMBaseModelOutput 改为 Wav2Vec2BaseModelOutput
class WavLMModel(WavLMPreTrainedModel):
    def __init__(self, config: WavLMConfig):
        super().__init__(config)
        self.config = config
        self.feature_extractor = WavLMFeatureEncoder(config)  # 初始化特征提取器
        self.feature_projection = WavLMFeatureProjection(config)  # 初始化特征投影器

        # 如果配置中的 mask_time_prob 大于 0.0 或者 mask_feature_prob 大于 0.0，则模型需要掩码向量
        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())  # 初始化掩码特征向量

        # 根据配置选择稳定层归一化编码器或一般编码器
        if config.do_stable_layer_norm:
            self.encoder = WavLMEncoderStableLayerNorm(config)  # 初始化稳定层归一化编码器
        else:
            self.encoder = WavLMEncoder(config)  # 初始化一般编码器

        self.adapter = WavLMAdapter(config) if config.add_adapter else None  # 根据配置选择是否添加适配器

        # 初始化权重并应用最终处理
        self.post_init()

    def freeze_feature_extractor(self):
        """
        调用此函数将禁用特征编码器的梯度计算，使其在训练过程中不会更新其参数。
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        调用此函数将禁用特征编码器的梯度计算，使其在训练过程中不会更新其参数。
        """
        self.feature_extractor._freeze_parameters()

    def _mask_hidden_states(
        self,
        hidden_states: torch.FloatTensor,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
"""
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        """

        # `config.apply_spec_augment` can set masking to False
        # 检查配置中的 `apply_spec_augment` 是否为 True，如果不是，则直接返回隐藏状态
        if not getattr(self.config, "apply_spec_augment", True):
            return hidden_states

        # generate indices & apply SpecAugment along time axis
        batch_size, sequence_length, hidden_size = hidden_states.size()

        if mask_time_indices is not None:
            # apply SpecAugment along time axis with given mask_time_indices
            # 如果给定了 mask_time_indices，则使用这些索引应用 SpecAugment 到时间轴上的隐藏状态
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
        elif self.config.mask_time_prob > 0 and self.training:
            # 根据配置中的概率生成 mask_time_indices，并应用 SpecAugment 到时间轴上的隐藏状态
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=self.config.mask_time_min_masks,
            )
            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)

        if self.config.mask_feature_prob > 0 and self.training:
            # generate indices & apply SpecAugment along feature axis
            # 根据配置中的概率生成 mask_feature_indices，并应用 SpecAugment 到特征轴上的隐藏状态
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
                min_masks=self.config.mask_feature_min_masks,
            )
            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
            hidden_states[mask_feature_indices] = 0

        return hidden_states

    @add_start_docstrings_to_model_forward(WAVLM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Wav2Vec2BaseModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
        # 如果输出注意力值未指定，则使用配置中的默认值
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        # 如果输出隐藏状态未指定，则使用配置中的默认值
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        # 如果返回字典未指定，则使用配置中的默认值
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 提取输入特征向量
        extract_features = self.feature_extractor(input_values)
        # 调整特征向量的维度顺序
        extract_features = extract_features.transpose(1, 2)

        if attention_mask is not None:
            # 计算与特征向量对应的减少的注意力掩码
            attention_mask = self._get_feature_vector_attention_mask(
                extract_features.shape[1], attention_mask, add_adapter=False
            )

        # 对特征向量进行特征投影
        hidden_states, extract_features = self.feature_projection(extract_features)
        # 根据给定的时间索引和注意力掩码屏蔽隐藏状态
        hidden_states = self._mask_hidden_states(
            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
        )

        # 使用编码器处理隐藏状态和注意力掩码
        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 获取编码器的最后隐藏状态
        hidden_states = encoder_outputs[0]

        # 如果存在适配器模块，应用适配器
        if self.adapter is not None:
            hidden_states = self.adapter(hidden_states)

        # 如果不要求返回字典形式的输出，返回一个元组
        if not return_dict:
            return (hidden_states, extract_features) + encoder_outputs[1:]

        # 否则，返回一个 Wav2Vec2BaseModelOutput 对象
        return Wav2Vec2BaseModelOutput(
            last_hidden_state=hidden_states,
            extract_features=extract_features,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
@add_start_docstrings(
    """WavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
    WAVLM_START_DOCSTRING,
)
# 使用装饰器 `add_start_docstrings` 添加模型的文档字符串，描述了该模型的用途和特性
# 从 `transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC` 复制而来，修改为 `WavLMForCTC`，并进行了相应的符号和名称替换
class WavLMForCTC(WavLMPreTrainedModel):
    def __init__(self, config, target_lang: Optional[str] = None):
        # 调用父类的初始化方法，传入配置信息
        super().__init__(config)

        # 初始化 WavLM 模型
        self.wavlm = WavLMModel(config)
        # 添加一个 dropout 层
        self.dropout = nn.Dropout(config.final_dropout)

        # 设置目标语言属性
        self.target_lang = target_lang

        # 检查配置中是否定义了词汇表大小，如果没有则引发错误
        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that "
                "does not define the vocabulary size of the language model head. Please "
                "instantiate the model as follows: `WavLMForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
                "or define `vocab_size` of your model's configuration."
            )
        
        # 根据配置决定输出隐藏层大小
        output_hidden_size = (
            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
        )
        # 添加一个线性层作为语言模型的输出层
        self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)

        # 初始化权重并进行最终处理
        self.post_init()

    def tie_weights(self):
        """
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        """
        
        # 覆盖 `PreTrainedModel.tie_weights` 方法，以便在传递 `target_lang=...` 给 `from_pretrained(...)` 时能正确加载适配器权重

        # 注意，通常 `tie_weights` 用于绑定输入和输出嵌入权重。在这里重新用于正确加载 WavLM 的适配器层，以避免为 `PreTrainedModel` 引入新的 API。
        # 虽然有些许 hacky，但是 WavLM 永远不必绑定输入和输出嵌入，因此在这里重新用这个函数是可以接受的。

        # 获取目标语言
        target_lang = self.target_lang

        # 如果 `target_lang` 不为 `None`，且 `config.adapter_attn_dim` 未定义，则引发错误
        if target_lang is not None and getattr(self.config, "adapter_attn_dim", None) is None:
            raise ValueError(f"Cannot pass `target_lang`: {target_lang} if `config.adapter_attn_dim` is not defined.")
        # 如果 `target_lang` 为 `None`，但 `config.adapter_attn_dim` 已定义，则记录信息提示默认设置为 'eng'
        elif target_lang is None and getattr(self.config, "adapter_attn_dim", None) is not None:
            logger.info("By default `target_lang` is set to 'eng'.")
        # 如果 `target_lang` 不为 `None`，则加载适配器
        elif target_lang is not None:
            self.load_adapter(target_lang, force_load=True)
    # 调用此函数将禁用特征编码器的梯度计算，使其参数在训练期间不会更新。
    def freeze_feature_extractor(self):
        # 发出警告信息，提醒方法 `freeze_feature_extractor` 将在 Transformers v5 中删除，
        # 建议使用等效的 `freeze_feature_encoder` 方法。
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        # 调用 `freeze_feature_encoder` 方法冻结特征编码器的参数。
        self.freeze_feature_encoder()

    # 调用此函数将禁用特征编码器的梯度计算，使其参数在训练期间不会更新。
    def freeze_feature_encoder(self):
        # 调用特征编码器内部的方法 `_freeze_parameters`，冻结其参数。
        self.wavlm.feature_extractor._freeze_parameters()

    # 调用此函数将禁用基础模型的梯度计算，使其参数在训练期间不会更新，仅更新分类头部。
    def freeze_base_model(self):
        # 遍历语音语言模型 `wavlm` 的所有参数，并将其 `requires_grad` 属性设为 False。
        for param in self.wavlm.parameters():
            param.requires_grad = False

    # 重写了 `forward` 方法，并应用了两个装饰器 `add_start_docstrings_to_model_forward` 和 `add_code_sample_docstrings`。
    # 这些装饰器用于向 `forward` 方法添加文档字符串，提供了模型输入、输出和示例代码的描述。
    @add_start_docstrings_to_model_forward(WAVLM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=CausalLMOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_CTC_EXPECTED_OUTPUT,
        expected_loss=_CTC_EXPECTED_LOSS,
    )
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, CausalLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        """
        # 设置返回字典，如果未提供，则使用配置中的返回字典设置
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 调用wavlm模型，传入输入值和额外的参数，并获取输出
        outputs = self.wavlm(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 获取模型输出的隐藏状态，并应用dropout进行正则化
        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states)

        # 将隐藏状态输入到语言模型头部以获取预测的logits
        logits = self.lm_head(hidden_states)

        # 初始化损失为None
        loss = None
        if labels is not None:
            # 检查标签是否超出词汇表大小，如果是则引发值错误
            if labels.max() >= self.config.vocab_size:
                raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")

            # 根据注意力掩码获取输入长度
            attention_mask = (
                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
            )
            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)

            # 假设填充的标记用-100表示未被关注时
            # 创建标签掩码以指示有效的标签位置和计算目标长度
            labels_mask = labels >= 0
            target_lengths = labels_mask.sum(-1)
            flattened_targets = labels.masked_select(labels_mask)

            # 对logits进行log_softmax处理，并进行维度变换
            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)

            # 使用ctc_loss计算损失，确保不启用fp16计算
            with torch.backends.cudnn.flags(enabled=False):
                loss = nn.functional.ctc_loss(
                    log_probs,
                    flattened_targets,
                    input_lengths,
                    target_lengths,
                    blank=self.config.pad_token_id,
                    reduction=self.config.ctc_loss_reduction,
                    zero_infinity=self.config.ctc_zero_infinity,
                )

        # 如果不需要返回字典，则构建输出元组
        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        # 返回CausalLMOutput对象，封装损失、logits、隐藏状态和注意力张量
        return CausalLMOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
# 定义一个带有顶部序列分类头部的 WavLM 模型，用于类似 SUPERB 关键词检测任务的应用
@add_start_docstrings(
    """
    WavLM Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    """,
    WAVLM_START_DOCSTRING,
)
class WavLMForSequenceClassification(WavLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # 如果配置允许使用适配器且配置为真，则引发值错误，因为序列分类不支持 WavLM 适配器
        if hasattr(config, "add_adapter") and config.add_adapter:
            raise ValueError(
                "Sequence classification does not support the use of WavLM adapters (config.add_adapter=True)"
            )
        
        # 初始化 WavLM 模型
        self.wavlm = WavLMModel(config)
        
        # 计算层数，包括变换器层和输入嵌入
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        
        # 如果配置使用加权层求和，则初始化层权重
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        
        # 用于投影的线性层，将隐藏大小投影到分类器投影大小
        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
        
        # 分类器线性层，将分类器投影大小映射到类别数
        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)

        # 初始化权重并应用最终处理
        self.post_init()

    # 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.freeze_feature_extractor 复制而来
    def freeze_feature_extractor(self):
        """
        调用此函数将禁用特征编码器的梯度计算，使其参数在训练期间不会更新。
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    # 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.freeze_feature_encoder 复制而来
    def freeze_feature_encoder(self):
        """
        调用此函数将禁用特征编码器的梯度计算，使其参数在训练期间不会更新。
        """
        self.wavlm.feature_extractor._freeze_parameters()

    # 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.freeze_base_model 复制而来
    def freeze_base_model(self):
        """
        调用此函数将禁用基础模型的梯度计算，使其参数在训练期间不会更新。只有分类头部将会更新。
        """
        for param in self.wavlm.parameters():
            param.requires_grad = False

    @add_start_docstrings_to_model_forward(WAVLM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
    )
    # 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward复制过来，替换Wav2Vec2为WavLM，wav2vec2为wavlm
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """

        # 确定是否返回字典格式的输出，若未指定则使用配置中的默认设置
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 根据配置决定是否输出隐藏状态
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

        # 调用wavlm模型进行正向传播
        outputs = self.wavlm(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 如果配置中指定使用加权层求和，则对隐藏状态进行加权求和操作
        if self.config.use_weighted_layer_sum:
            # 获取隐藏状态
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            # 在指定维度上堆叠隐藏状态
            hidden_states = torch.stack(hidden_states, dim=1)
            # 计算加权层的softmax权重
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            # 对隐藏状态进行加权求和操作
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            # 否则直接使用第一个输出作为隐藏状态
            hidden_states = outputs[0]

        # 将加权求和后的隐藏状态投影到目标维度
        hidden_states = self.projector(hidden_states)
        
        # 如果没有指定attention_mask，则将隐藏状态进行均值池化操作
        if attention_mask is None:
            pooled_output = hidden_states.mean(dim=1)
        else:
            # 否则，根据给定的attention_mask计算padding_mask
            padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
            # 将非padding位置的隐藏状态设置为0
            hidden_states[~padding_mask] = 0.0
            # 对padding后的隐藏状态进行求和并除以padding_mask的求和得到均值池化的结果
            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)

        # 将池化后的输出传入分类器得到logits
        logits = self.classifier(pooled_output)

        # 初始化损失为None
        loss = None
        # 如果给定了labels，则计算交叉熵损失
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        # 如果不返回字典格式的输出，则按顺序返回logits和隐藏状态列表
        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        # 如果返回字典格式的输出，则使用SequenceClassifierOutput对象包装结果并返回
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# 用于在顶部添加模型文档字符串，描述该模型是在音频帧分类任务上带有分类头的WavLM模型
@add_start_docstrings(
    """
    WavLM Model with a frame classification head on top for tasks like Speaker Diarization.
    """,
    WAVLM_START_DOCSTRING,
)
# 从transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification复制而来，将Wav2Vec2->WavLM，wav2vec2->wavlm，WAV_2_VEC_2->WAVLM
class WavLMForAudioFrameClassification(WavLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        # 如果配置中有add_adapter属性且为True，则引发值错误，因为音频帧分类不支持使用WavLM适配器
        if hasattr(config, "add_adapter") and config.add_adapter:
            raise ValueError(
                "Audio frame classification does not support the use of WavLM adapters (config.add_adapter=True)"
            )
        
        # 初始化WavLM模型
        self.wavlm = WavLMModel(config)
        
        # 计算层数，包括变压器层和输入嵌入层
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        
        # 如果配置中使用加权层求和，则初始化层权重
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        
        # 分类器层，将隐藏状态大小映射到类标签数量
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.num_labels = config.num_labels

        # 初始化模型权重
        self.init_weights()

    # Deprecated警告，已弃用freeze_feature_extractor方法，请使用freeze_feature_encoder代替
    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    # 冻结特征编码器，禁止特征编码器参数的梯度计算，使其在训练过程中不会更新
    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.wavlm.feature_extractor._freeze_parameters()

    # 冻结基础模型，禁止基础模型参数的梯度计算，使其在训练过程中不会更新，仅分类头会更新
    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.wavlm.parameters():
            param.requires_grad = False

    # 为模型前向传播方法添加模型输入的文档字符串，引用WAVLM_INPUTS_DOCSTRING，并提供代码示例文档字符串
    @add_start_docstrings_to_model_forward(WAVLM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_FRAME_CLASS_CHECKPOINT,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
        expected_output=_FRAME_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        ):
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """

        # 确定是否使用返回字典，如果未指定则使用配置中的设置
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 根据配置决定是否输出隐藏状态
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

        # 调用语言模型的前向传播，获取模型的输出结果
        outputs = self.wavlm(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 如果配置指定使用加权层求和，则处理隐藏状态
        if self.config.use_weighted_layer_sum:
            # 获取模型输出中的隐藏状态
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            # 将隐藏状态堆叠在一起
            hidden_states = torch.stack(hidden_states, dim=1)
            # 计算加权层的权重并进行softmax归一化
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            # 对隐藏状态进行加权求和
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            # 否则直接使用模型输出的第一个隐藏状态
            hidden_states = outputs[0]

        # 使用分类器对隐藏状态进行分类预测
        logits = self.classifier(hidden_states)

        # 初始化损失为None
        loss = None
        # 如果提供了标签，则计算损失
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # 计算交叉熵损失
            loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))

        # 如果不要求返回字典，则返回分类器的输出和隐藏状态
        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return output

        # 否则返回一个TokenClassifierOutput对象，包括损失、预测的logits、隐藏状态和注意力权重
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
# 从 transformers.models.wav2vec2.modeling_wav2vec2.AMSoftmaxLoss 复制而来，定义了一个 AMSoftmaxLoss 类
class AMSoftmaxLoss(nn.Module):
    def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
        super(AMSoftmaxLoss, self).__init__()
        self.scale = scale  # 缩放参数，用于调整余弦相似度的范围
        self.margin = margin  # 间隔参数，用于调整余弦相似度与边界的距离
        self.num_labels = num_labels  # 标签类别数量
        self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)  # 损失函数使用的权重参数
        self.loss = nn.CrossEntropyLoss()  # 使用交叉熵作为损失函数

    def forward(self, hidden_states, labels):
        labels = labels.flatten()  # 将标签展平为一维张量
        weight = nn.functional.normalize(self.weight, dim=0)  # 对权重进行 L2 归一化
        hidden_states = nn.functional.normalize(hidden_states, dim=1)  # 对隐藏状态进行 L2 归一化
        cos_theta = torch.mm(hidden_states, weight)  # 计算余弦相似度
        psi = cos_theta - self.margin  # 计算带有间隔的余弦相似度

        onehot = nn.functional.one_hot(labels, self.num_labels)  # 将标签转换为独热编码
        logits = self.scale * torch.where(onehot.bool(), psi, cos_theta)  # 根据标签和间隔调整后的余弦相似度计算最终的 logits
        loss = self.loss(logits, labels)  # 计算损失值

        return loss


# 从 transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer 复制而来，定义了一个 TDNNLayer 类
class TDNNLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id]  # 输入维度
        self.out_conv_dim = config.tdnn_dim[layer_id]  # 输出维度
        self.kernel_size = config.tdnn_kernel[layer_id]  # 卷积核大小
        self.dilation = config.tdnn_dilation[layer_id]  # 膨胀率

        self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)  # 线性层作为卷积核
        self.activation = nn.ReLU()  # ReLU 激活函数

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        if is_peft_available():  # 检查是否可用 peft 库
            from peft.tuners.lora import LoraLayer  # 导入 LoraLayer

            if isinstance(self.kernel, LoraLayer):  # 如果 kernel 是 LoraLayer 类型
                warnings.warn(
                    "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
                    "You should exclude TDNNLayer from LoRA's target modules.",
                )

        # 为了向后兼容性，保留 nn.Linear，但调用 F.conv1d 以提高速度
        hidden_states = hidden_states.transpose(1, 2)  # 转置隐藏状态的维度
        weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)  # 调整权重的维度
        hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation)  # 使用 conv1d 进行卷积操作
        hidden_states = hidden_states.transpose(1, 2)  # 再次转置隐藏状态的维度

        hidden_states = self.activation(hidden_states)  # 应用 ReLU 激活函数
        return hidden_states


@add_start_docstrings(
    """
    WavLM Model with an XVector feature extraction head on top for tasks like Speaker Verification.
    """,
    WAVLM_START_DOCSTRING,
)
# 从 transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector 复制而来，定义了一个 WavLMForXVector 类，用于 XVector 特征提取
class WavLMForXVector(WavLMPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)  # 调用父类的初始化方法，传递配置参数给父类

        self.wavlm = WavLMModel(config)  # 创建一个语音语言模型对象
        num_layers = config.num_hidden_layers + 1  # 计算层的数量：变换器层 + 输入嵌入层
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)  # 如果配置启用了加权层求和，则创建一个权重参数
        self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])  # 创建一个线性层投影器

        tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))]  # 创建一系列TDNN层
        self.tdnn = nn.ModuleList(tdnn_layers)  # 将TDNN层存储在模块列表中

        self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)  # 创建特征提取器的线性层
        self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)  # 创建分类器的线性层

        self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)  # 创建AMSoftmax损失函数对象

        self.init_weights()  # 初始化模型权重

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()  # 警告已弃用此方法，建议使用等效的 `freeze_feature_encoder` 方法

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.wavlm.feature_extractor._freeze_parameters()  # 冻结特征编码器的参数，禁用其在训练期间的梯度计算

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.wavlm.parameters():
            param.requires_grad = False  # 禁用基础模型的梯度计算，使其参数在训练期间不会更新。仅更新分类头部的参数。

    def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the TDNN layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return (input_length - kernel_size) // stride + 1  # 计算1D卷积层的输出长度公式

        for kernel_size in self.config.tdnn_kernel:
            input_lengths = _conv_out_length(input_lengths, kernel_size, 1)  # 遍历TDNN内核大小，计算输入长度的输出长度

        return input_lengths

    @add_start_docstrings_to_model_forward(WAVLM_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_XVECTOR_CHECKPOINT,
        output_type=XVectorOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="audio",
        expected_output=_XVECTOR_EXPECTED_OUTPUT,
    )
    # 定义一个方法 `forward`，用于模型前向传播
    def forward(
        # 输入值，可以是一个 PyTorch 张量，可选参数
        self,
        input_values: Optional[torch.Tensor],
        # 注意力掩码，用于指定模型注意力分布，可选参数
        attention_mask: Optional[torch.Tensor] = None,
        # 是否输出注意力分布，可选参数，默认为 None
        output_attentions: Optional[bool] = None,
        # 是否输出隐藏状态，可选参数，默认为 None
        output_hidden_states: Optional[bool] = None,
        # 是否返回一个字典作为输出，可选参数，默认为 None
        return_dict: Optional[bool] = None,
        # 标签数据，可选参数，用于某些任务的监督学习
        labels: Optional[torch.Tensor] = None,
        ) -> Union[Tuple, XVectorOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """

        # 根据返回参数设置是否使用返回字典
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # 根据配置决定是否输出隐藏层状态
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

        # 调用 WAVLM 模型进行语音识别任务的计算
        outputs = self.wavlm(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # 如果配置中使用加权层求和，则对隐藏状态进行加权求和操作
        if self.config.use_weighted_layer_sum:
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        # 对隐藏状态进行投影
        hidden_states = self.projector(hidden_states)

        # 通过一系列 TDNN 层处理隐藏状态特征
        for tdnn_layer in self.tdnn:
            hidden_states = tdnn_layer(hidden_states)

        # 统计池化操作
        if attention_mask is None:
            # 如果没有注意力掩码，则对隐藏状态在第一维上进行均值和标准差计算
            mean_features = hidden_states.mean(dim=1)
            std_features = hidden_states.std(dim=1)
        else:
            # 根据注意力掩码计算特征提取器输出的长度
            feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
            tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
            mean_features = []
            std_features = []
            for i, length in enumerate(tdnn_output_lengths):
                # 对每个序列进行统计池化操作
                mean_features.append(hidden_states[i, :length].mean(dim=0))
                std_features.append(hidden_states[i, :length].std(dim=0))
            mean_features = torch.stack(mean_features)
            std_features = torch.stack(std_features)
        statistic_pooling = torch.cat([mean_features, std_features], dim=-1)

        # 通过特征提取器得到最终的输出特征向量
        output_embeddings = self.feature_extractor(statistic_pooling)
        # 使用分类器进行最终的分类预测
        logits = self.classifier(output_embeddings)

        # 计算损失
        loss = None
        if labels is not None:
            loss = self.objective(logits, labels)

        # 根据返回参数决定返回值的组织方式
        if not return_dict:
            # 如果不使用返回字典，则返回元组形式的结果
            output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        # 使用自定义的输出类构造返回结果
        return XVectorOutput(
            loss=loss,
            logits=logits,
            embeddings=output_embeddings,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

`.\models\wavlm\init.py`

# 引入所需的类型检查模块
from typing import TYPE_CHECKING

# 引入必要的依赖异常和延迟加载模块
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

# 定义模块导入结构
_import_structure = {"configuration_wavlm": ["WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "WavLMConfig"]}

# 检查是否存在 torch 库，如果不存在则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果存在 torch 库，则添加以下模型相关的导入结构
    _import_structure["modeling_wavlm"] = [
        "WAVLM_PRETRAINED_MODEL_ARCHIVE_LIST",
        "WavLMForAudioFrameClassification",
        "WavLMForCTC",
        "WavLMForSequenceClassification",
        "WavLMForXVector",
        "WavLMModel",
        "WavLMPreTrainedModel",
    ]

# 如果类型检查开启
if TYPE_CHECKING:
    # 从配置文件中导入所需的配置映射和配置类
    from .configuration_wavlm import WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP, WavLMConfig

    # 再次检查是否存在 torch 库，如果不存在则捕获异常
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 从模型文件中导入所需的模型类
        from .modeling_wavlm import (
            WAVLM_PRETRAINED_MODEL_ARCHIVE_LIST,
            WavLMForAudioFrameClassification,
            WavLMForCTC,
            WavLMForSequenceClassification,
            WavLMForXVector,
            WavLMModel,
            WavLMPreTrainedModel,
        )

# 如果不是类型检查模式
else:
    import sys

    # 使用延迟加载模块，定义当前模块为 LazyModule
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)

`.\models\whisper\configuration_whisper.py`

# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
# 上述为版权声明和编码声明

# 导入必要的模块和类
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union

# 导入预训练配置类和相关的配置
from ...configuration_utils import PretrainedConfig
from ...onnx import OnnxConfig, OnnxSeq2SeqConfigWithPast
from ...utils import logging

# 如果是类型检查模式，则导入额外的类
if TYPE_CHECKING:
    from ...feature_extraction_utils import FeatureExtractionMixin
    from ...tokenization_utils_base import PreTrainedTokenizerBase
    from ...utils import TensorType

# 获取日志记录器
logger = logging.get_logger(__name__)

# 预训练模型配置文件映射字典
WHISPER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "openai/whisper-base": "https://huggingface.co/openai/whisper-base/resolve/main/config.json",
}

# fmt: off
# 定义非语音标记的列表
NON_SPEECH_TOKENS = [
    1, 2, 7, 8, 9, 10, 14, 25,
    26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
    63, 90, 91, 92, 93, 357, 366, 438, 532, 685,
    705, 796, 930, 1058, 1220, 1267, 1279, 1303, 1343, 1377,
    1391, 1635, 1782, 1875, 2162, 2361, 2488, 3467, 4008, 4211,
    4600, 4808, 5299, 5855, 6329, 7203, 9609, 9959, 10563, 10786,
    11420, 11709, 11907, 13163, 13697, 13700, 14808, 15306, 16410, 16791,
    17992, 19203, 19510, 20724, 22305, 22935, 27007, 30109, 30420, 33409,
    34949, 40283, 40493, 40549, 47282, 49146, 50257, 50359, 50360, 50361
]
# 定义多模态的非语音标记的列表
NON_SPEECH_TOKENS_MULTI = [
    1, 2, 7, 8, 9, 10, 14, 25,
    26, 27, 28, 29, 31, 58, 59, 60, 61, 62,
    63, 90, 91, 92, 93, 359, 503, 522, 542, 873,
    893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627,
    3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647,
    7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793,
    14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675,
    22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865,
    42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362
]
# fmt: on

class WhisperConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`WhisperModel`]. It is used to instantiate a
    Whisper model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Whisper
    [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    rest of this class docstring for more information.
    """
    # 定义模型类型为 "whisper"
    model_type = "whisper"
    
    # 推理阶段忽略的关键字列表
    keys_to_ignore_at_inference = ["past_key_values"]
    
    # 属性映射字典，将 PretrainedConfig 中的属性映射到本地属性
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    # 初始化函数，设置模型的各种配置参数
    def __init__(
        self,
        vocab_size=51865,  # 词汇表大小，默认为 51865
        num_mel_bins=80,  # MEL bins 数量，默认为 80
        encoder_layers=4,  # 编码器层数，默认为 4
        encoder_attention_heads=6,  # 编码器注意力头数，默认为 6
        decoder_layers=4,  # 解码器层数，默认为 4
        decoder_attention_heads=6,  # 解码器注意力头数，默认为 6
        decoder_ffn_dim=1536,  # 解码器 FFN 维度，默认为 1536
        encoder_ffn_dim=1536,  # 编码器 FFN 维度，默认为 1536
        encoder_layerdrop=0.0,  # 编码器层丢弃率，默认为 0.0
        decoder_layerdrop=0.0,  # 解码器层丢弃率，默认为 0.0
        decoder_start_token_id=50257,  # 解码器起始 token ID，默认为 50257
        use_cache=True,  # 是否使用缓存，默认为 True
        is_encoder_decoder=True,  # 是否是编码器-解码器模型，默认为 True
        activation_function="gelu",  # 激活函数类型，默认为 "gelu"
        d_model=384,  # 模型维度，默认为 384
        dropout=0.0,  # 全局 dropout 率，默认为 0.0
        attention_dropout=0.0,  # 注意力 dropout 率，默认为 0.0
        activation_dropout=0.0,  # 激活函数 dropout 率，默认为 0.0
        init_std=0.02,  # 参数初始化标准差，默认为 0.02
        scale_embedding=False,  # 是否对嵌入进行缩放，默认为 False
        max_source_positions=1500,  # 最大源序列长度，默认为 1500
        max_target_positions=448,  # 最大目标序列长度，默认为 448
        pad_token_id=50256,  # 填充 token ID，默认为 50256
        bos_token_id=50256,  # 起始 token ID，默认为 50256
        eos_token_id=50256,  # 结束 token ID，默认为 50256
        suppress_tokens=None,  # 要抑制的特定 token 列表，默认为 None
        begin_suppress_tokens=[220, 50256],  # 开始抑制的 token 列表，默认为 [220, 50256]
        use_weighted_layer_sum=False,  # 是否使用加权层求和，默认为 False
        classifier_proj_size=256,  # 分类器投影大小，默认为 256
        apply_spec_augment=False,  # 是否应用语音增强，默认为 False
        mask_time_prob=0.05,  # 时间掩码概率，默认为 0.05
        mask_time_length=10,  # 时间掩码长度，默认为 10
        mask_time_min_masks=2,  # 时间掩码最小数量，默认为 2
        mask_feature_prob=0.0,  # 特征掩码概率，默认为 0.0
        mask_feature_length=10,  # 特征掩码长度，默认为 10
        mask_feature_min_masks=0,  # 特征掩码最小数量，默认为 0
        median_filter_width=7,  # 中值滤波器宽度，默认为 7
        **kwargs,  # 其他关键字参数
        ):
        # 初始化模型参数
        self.vocab_size = vocab_size  # 词汇表大小
        self.num_mel_bins = num_mel_bins  # 梅尔频谱的频道数
        self.d_model = d_model  # 模型的维度大小
        self.encoder_layers = encoder_layers  # 编码器层数
        self.encoder_attention_heads = encoder_attention_heads  # 编码器注意力头数
        self.decoder_layers = decoder_layers  # 解码器层数
        self.decoder_attention_heads = decoder_attention_heads  # 解码器注意力头数
        self.decoder_ffn_dim = decoder_ffn_dim  # 解码器前馈网络的维度
        self.encoder_ffn_dim = encoder_ffn_dim  # 编码器前馈网络的维度
        self.dropout = dropout  # 总体dropout概率
        self.attention_dropout = attention_dropout  # 注意力机制中的dropout概率
        self.activation_dropout = activation_dropout  # 激活函数中的dropout概率
        self.activation_function = activation_function  # 激活函数类型
        self.init_std = init_std  # 参数初始化的标准差
        self.encoder_layerdrop = encoder_layerdrop  # 编码器层的LayerDrop比例
        self.decoder_layerdrop = decoder_layerdrop  # 解码器层的LayerDrop比例
        self.use_cache = use_cache  # 是否使用缓存
        self.num_hidden_layers = encoder_layers  # 隐藏层的数量（与编码器层数相同）
        self.scale_embedding = scale_embedding  # 若为True，则嵌入的缩放因子为sqrt(d_model)
        self.max_source_positions = max_source_positions  # 源序列的最大位置编码数
        self.max_target_positions = max_target_positions  # 目标序列的最大位置编码数

        # 音频分类特定参数，其他情况下可忽略
        self.classifier_proj_size = classifier_proj_size  # 分类器投影的维度
        self.use_weighted_layer_sum = use_weighted_layer_sum  # 是否使用加权层求和

        # SpecAugment的微调配置参数：https://arxiv.org/abs/1904.08779
        self.apply_spec_augment = apply_spec_augment  # 是否应用SpecAugment
        self.mask_time_prob = mask_time_prob  # 时间掩码的概率
        self.mask_time_length = mask_time_length  # 时间掩码的长度
        self.mask_time_min_masks = mask_time_min_masks  # 时间掩码的最小数量
        self.mask_feature_prob = mask_feature_prob  # 特征掩码的概率
        self.mask_feature_length = mask_feature_length  # 特征掩码的长度
        self.mask_feature_min_masks = mask_feature_min_masks  # 特征掩码的最小数量

        self.median_filter_width = median_filter_width  # 中值滤波器的宽度

        # 调用父类的初始化方法
        super().__init__(
            pad_token_id=pad_token_id,  # 填充token的ID
            bos_token_id=bos_token_id,  # 开始token的ID
            eos_token_id=eos_token_id,  # 结束token的ID
            is_encoder_decoder=is_encoder_decoder,  # 是否为编码-解码模型
            decoder_start_token_id=decoder_start_token_id,  # 解码器起始token的ID
            suppress_tokens=suppress_tokens,  # 需要抑制的token
            begin_suppress_tokens=begin_suppress_tokens,  # 开始抑制的token
            **kwargs,  # 其他关键字参数
        )
# 定义一个名为 WhisperOnnxConfig 的类，继承自 OnnxSeq2SeqConfigWithPast 类
class WhisperOnnxConfig(OnnxSeq2SeqConfigWithPast):

    # 定义一个属性方法 inputs，返回一个字典，描述模型的输入结构
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        # 创建一个有序字典 common_inputs，包含模型的常见输入
        common_inputs = OrderedDict(
            [
                ("input_features", {0: "batch", 1: "feature_size", 2: "encoder_sequence"}),
            ]
        )
        
        # 根据 self.use_past 的值决定是否添加 decoder_input_ids 到 common_inputs 中
        if self.use_past:
            common_inputs["decoder_input_ids"] = {0: "batch"}
        else:
            common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}

        # 根据 self.use_past 的值，调用 fill_with_past_key_values_ 方法填充 common_inputs
        if self.use_past:
            self.fill_with_past_key_values_(common_inputs, direction="inputs")

        # 返回描述模型输入的字典 common_inputs
        return common_inputs

    # 定义一个方法 generate_dummy_inputs，生成用于测试的虚拟输入数据
    def generate_dummy_inputs(
        self,
        preprocessor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"],
        batch_size: int = -1,
        seq_length: int = -1,
        is_pair: bool = False,
        framework: Optional["TensorType"] = None,
        sampling_rate: int = 22050,
        time_duration: float = 5.0,
        frequency: int = 220,
    ) -> Mapping[str, Any]:
        # 创建一个有序字典 dummy_inputs，用于存储虚拟输入数据
        dummy_inputs = OrderedDict()
        
        # 调用 OnnxConfig 类的 generate_dummy_inputs 方法生成 encoder_inputs
        encoder_inputs = OnnxConfig.generate_dummy_inputs(
            self,
            preprocessor=preprocessor.feature_extractor,
            batch_size=batch_size,
            framework=framework,
            sampling_rate=sampling_rate,
            time_duration=time_duration,
            frequency=frequency,
        )
        
        # 计算 encoder_inputs 的 encoder_sequence_length
        encoder_sequence_length = encoder_inputs["input_features"].shape[2]
        
        # 根据 self.use_past 的值更新 seq_length
        seq_length = encoder_sequence_length // 2 if self.use_past else seq_length
        
        # 调用父类的 generate_dummy_inputs 方法生成 decoder_inputs
        decoder_inputs = super().generate_dummy_inputs(
            preprocessor.tokenizer, batch_size, seq_length, is_pair, framework
        )

        # 将 encoder_inputs 的 input_features 移动到 dummy_inputs 中
        dummy_inputs["input_features"] = encoder_inputs.pop("input_features")
        
        # 将 decoder_inputs 的 decoder_input_ids 移动到 dummy_inputs 中
        dummy_inputs["decoder_input_ids"] = decoder_inputs.pop("decoder_input_ids")

        # 如果 decoder_inputs 中有 past_key_values，则将其移动到 dummy_inputs 中
        if "past_key_values" in decoder_inputs:
            dummy_inputs["past_key_values"] = decoder_inputs.pop("past_key_values")

        # 返回包含虚拟输入数据的 dummy_inputs 字典
        return dummy_inputs

    # 定义一个属性方法 atol_for_validation，返回用于验证的容差值
    @property
    def atol_for_validation(self) -> float:
        return 1e-3

`.\models\whisper\convert_openai_to_hf.py`

#!/usr/bin/env python
"""Converts a Whisper model in OpenAI format to Hugging Face format."""
# 版本和许可声明
# 版权 2022 年由 Hugging Face Inc. 团队和 OpenAI 团队保留所有权利。
#
# 根据 Apache 许可证 2.0 版本许可，除非符合许可证的要求，否则不得使用此文件。
# 您可以在以下网址获取许可证的副本：
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意，否则按“原样”分发软件，
# 不提供任何明示或暗示的担保或条件。
# 有关详细信息，请参阅许可证。

import argparse  # 导入命令行参数解析库
import io  # 导入用于处理字节流的库
import json  # 导入 JSON 格式处理库
import os  # 导入操作系统相关的功能库
import tempfile  # 导入临时文件和目录创建库
import urllib  # 导入处理 URL 的库
import warnings  # 导入警告处理库
from typing import Any, Optional, Tuple  # 引入类型提示支持

import torch  # 导入 PyTorch 深度学习库
from huggingface_hub.utils import insecure_hashlib  # 导入 Hugging Face Hub 的哈希函数支持
from torch import nn  # 导入 PyTorch 的神经网络模块
from tqdm import tqdm  # 导入进度条显示库

from transformers import (  # 导入 Hugging Face Transformers 库中的多个模块和类
    GenerationConfig,  # 生成配置类
    WhisperConfig,  # Whisper 模型配置类
    WhisperFeatureExtractor,  # Whisper 特征提取器类
    WhisperForConditionalGeneration,  # Whisper 条件生成模型类
    WhisperProcessor,  # Whisper 处理器类
    WhisperTokenizer,  # Whisper 分词器类
    WhisperTokenizerFast,  # 快速版 Whisper 分词器类
)
from transformers.models.whisper.tokenization_whisper import LANGUAGES, bytes_to_unicode  # 导入 Whisper 分词相关的常量和函数
from transformers.utils.import_utils import _is_package_available  # 导入 Hugging Face Transformers 的包是否可用函数

_MODELS = {  # 预定义 Whisper 模型的下载链接字典
    "tiny.en": "https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt",
    "tiny": "https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt",
    "base.en": "https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt",
    "base": "https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt",
    "small.en": "https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt",
    "small": "https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt",
    "medium.en": "https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt",
    "medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
    "large": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large.pt",
    "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
    "large-v3": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt",
}

_TOKENIZERS = {  # 预定义 Whisper 分词器的配置字典
    # 定义一个包含两个键值对的字典，用于存储语言模型的不同版本的 Token 文件的 URL
    "multilingual": "https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken",
    "english": "https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/gpt2.tiktoken",
}


def _get_generation_config(
    is_multilingual: bool,
    num_languages: int = 100,
    openai_version: Optional[str] = None,
) -> GenerationConfig:
    """
    Loads the appropriate generation config from HF repo based on provided parameters.

    Args:
        is_multilingual (bool): Flag indicating if multilingual model is used.
        num_languages (int, optional): Number of languages for the model (default is 100).
        openai_version (Optional[str], optional): Version of OpenAI model to load (default is None).

    Returns:
        GenerationConfig: Config object for generation.
    """
    if openai_version is not None:
        repo = f"openai/whisper-{openai_version}"
    elif not is_multilingual:
        repo = "openai/whisper-medium.en"
    elif num_languages < 100:
        repo = "openai/whisper-large-v2"
    else:
        repo = "openai/whisper-large-v3"

    gen_cfg = GenerationConfig.from_pretrained(repo)

    if openai_version is None:
        gen_cfg.alignment_heads = None
        warnings.warn(
            "Alignment heads have not been included in the generation config, since they are available "
            "only for the original OpenAI checkpoints."
            "If you want to use word-level timestamps with a custom version of Whisper,"
            "see https://github.com/openai/whisper/blob/main/notebooks/Multilingual_ASR.ipynb"
            "for the example of how to produce word-level timestamps manually."
        )

    return gen_cfg


def remove_ignore_keys_(state_dict):
    """
    Remove specific keys from the provided state_dict.

    Args:
        state_dict (dict): Dictionary containing the model's state.

    Returns:
        None
    """
    ignore_keys = ["layers", "blocks"]
    for k in ignore_keys:
        state_dict.pop(k, None)


WHISPER_MAPPING = {
    "blocks": "layers",
    "mlp.0": "fc1",
    "mlp.2": "fc2",
    "mlp_ln": "final_layer_norm",
    ".attn.query": ".self_attn.q_proj",
    ".attn.key": ".self_attn.k_proj",
    ".attn.value": ".self_attn.v_proj",
    ".attn_ln": ".self_attn_layer_norm",
    ".attn.out": ".self_attn.out_proj",
    ".cross_attn.query": ".encoder_attn.q_proj",
    ".cross_attn.key": ".encoder_attn.k_proj",
    ".cross_attn.value": ".encoder_attn.v_proj",
    ".cross_attn_ln": ".encoder_attn_layer_norm",
    ".cross_attn.out": ".encoder_attn.out_proj",
    "decoder.ln.": "decoder.layer_norm.",
    "encoder.ln.": "encoder.layer_norm.",
    "token_embedding": "embed_tokens",
    "encoder.positional_embedding": "encoder.embed_positions.weight",
    "decoder.positional_embedding": "decoder.embed_positions.weight",
    "ln_post": "layer_norm",
}


def rename_keys(s_dict):
    """
    Rename keys in the provided dictionary according to pre-defined mapping.

    Args:
        s_dict (dict): Dictionary whose keys need to be renamed.

    Returns:
        dict: Dictionary with renamed keys.
    """
    keys = list(s_dict.keys())
    for key in keys:
        new_key = key
        for k, v in WHISPER_MAPPING.items():
            if k in key:
                new_key = new_key.replace(k, v)

        print(f"{key} -> {new_key}")

        s_dict[new_key] = s_dict.pop(key)
    return s_dict


def make_linear_from_emb(emb):
    """
    Create a linear layer from an embedding layer.

    Args:
        emb (nn.Embedding): Embedding layer.

    Returns:
        nn.Linear: Linear layer initialized with the same weights as the embedding.
    """
    vocab_size, emb_size = emb.weight.shape
    lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
    lin_layer.weight.data = emb.weight.data
    return lin_layer


def _download(url: str, root: str) -> Any:
    """
    Download a file from a URL to a specified directory.

    Args:
        url (str): URL of the file to download.
        root (str): Directory where the file should be saved.

    Returns:
        Any: Not explicitly returned value.
    """
    os.makedirs(root, exist_ok=True)
    filename = os.path.basename(url)

    expected_sha256 = url.split("/")[-2]
    download_target = os.path.join(root, filename)

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")
    # 如果下载目标文件已存在
    if os.path.isfile(download_target):
        # 读取下载目标文件的全部内容
        model_bytes = open(download_target, "rb").read()
        # 使用不安全的哈希算法计算文件内容的 SHA256 值，并检查是否与预期的哈希值匹配
        if insecure_hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
            # 如果匹配，则将文件内容作为字节流加载为 Torch 模型并返回
            return torch.load(io.BytesIO(model_bytes))
        else:
            # 如果不匹配，则发出警告，提示哈希值不匹配，需要重新下载文件
            warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")

    # 使用 urllib 请求下载指定 URL 的文件，保存到 download_target
    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
        # 使用 tqdm 显示下载进度，设置总大小、显示宽度、单位等参数
        with tqdm(
            total=int(source.info().get("Content-Length")), ncols=80, unit="iB", unit_scale=True, unit_divisor=1024
        ) as loop:
            while True:
                # 从网络源读取数据块到缓冲区
                buffer = source.read(8192)
                # 如果缓冲区为空则退出循环
                if not buffer:
                    break

                # 将读取的数据块写入到输出文件
                output.write(buffer)
                # 更新 tqdm 进度条，增加已写入数据块的大小
                loop.update(len(buffer))

    # 重新读取下载后的目标文件内容
    model_bytes = open(download_target, "rb").read()
    # 再次使用不安全的哈希算法计算文件内容的 SHA256 值，并检查是否与预期的哈希值匹配
    if insecure_hashlib.sha256(model_bytes).hexdigest() != expected_sha256:
        # 如果不匹配，则抛出运行时错误，提示下载的模型文件哈希值不匹配，需要重新尝试加载模型
        raise RuntimeError(
            "Model has been downloaded but the SHA256 checksum does not match. Please retry loading the model."
        )

    # 将文件内容作为字节流加载为 Torch 模型并返回
    return torch.load(io.BytesIO(model_bytes))
# 从 OpenAI Whisper 模型的检查点转换为 Transformers 模型格式
def convert_openai_whisper_to_tfms(
    checkpoint_path, pytorch_dump_folder_path
) -> Tuple[WhisperForConditionalGeneration, bool, int]:
    # 检查检查点文件路径是否以 ".pt" 结尾，若不是则下载原始检查点
    if ".pt" not in checkpoint_path:
        # 获取 pytorch_dump_folder_path 的父目录作为下载位置，如果不存在则使用当前目录
        root = os.path.dirname(pytorch_dump_folder_path) or "."
        # 下载指定模型的原始检查点文件
        original_checkpoint = _download(_MODELS[checkpoint_path], root)
        # 获取 OpenAI 模型的版本号
        openai_version = checkpoint_path
    else:
        # 使用 torch.load 加载指定路径的 PyTorch 检查点文件到 CPU
        original_checkpoint = torch.load(checkpoint_path, map_location="cpu")
        openai_version = None

    # 从原始检查点中获取维度信息
    dimensions = original_checkpoint["dims"]
    # 获取模型的状态字典
    state_dict = original_checkpoint["model_state_dict"]
    # 获取解码器的 token embedding 权重
    proj_out_weights = state_dict["decoder.token_embedding.weight"]
    # 移除忽略的键值对
    remove_ignore_keys_(state_dict)
    # 重命名模型中的键名
    rename_keys(state_dict)
    # 标志是否绑定 token embedding
    tie_embeds = True
    # 获取解码器中第一个全连接层的维度
    ffn_dim = state_dict["decoder.layers.0.fc1.weight"].shape[0]

    # 通过判断 vocab 大小来设置 bos/eos/pad token 的 id
    endoftext_id = 50257 if dimensions["n_vocab"] > 51865 else 50256

    # 创建 WhisperConfig 对象，配置模型的参数
    config = WhisperConfig(
        vocab_size=dimensions["n_vocab"],
        encoder_ffn_dim=ffn_dim,
        decoder_ffn_dim=ffn_dim,
        num_mel_bins=dimensions["n_mels"],
        d_model=dimensions["n_audio_state"],
        max_target_positions=dimensions["n_text_ctx"],
        encoder_layers=dimensions["n_audio_layer"],
        encoder_attention_heads=dimensions["n_audio_head"],
        decoder_layers=dimensions["n_text_layer"],
        decoder_attention_heads=dimensions["n_text_head"],
        max_source_positions=dimensions["n_audio_ctx"],
        eos_token_id=endoftext_id,
        bos_token_id=endoftext_id,
        pad_token_id=endoftext_id,
        decoder_start_token_id=endoftext_id + 1,
    )

    # 创建 WhisperForConditionalGeneration 模型对象
    model = WhisperForConditionalGeneration(config)
    # 加载模型的状态字典，并检查是否有丢失的参数
    missing, unexpected = model.model.load_state_dict(state_dict, strict=False)
    # 如果有丢失的参数且不在允许的缺失列表中，则抛出 ValueError
    if len(missing) > 0 and not set(missing) <= {
        "encoder.embed_positions.weights",
        "decoder.embed_positions.weights",
    }:
        raise ValueError(
            "Only `encoder.embed_positions.weights` and `decoder.embed_positions.weights`  are allowed to be missing,"
            f" but all the following weights are missing {missing}"
        )

    # 如果 tie_embeds 为 True，则从 embed_tokens 创建线性投影层
    if tie_embeds:
        model.proj_out = make_linear_from_emb(model.model.decoder.embed_tokens)
    else:
        # 否则直接使用给定的 proj_out_weights 作为投影层的权重
        model.proj_out.weight.data = proj_out_weights

    # 根据模型检查点确定模型的生成配置，参考 Whisper 代码库的实现
    is_multilingual = model.config.vocab_size >= 51865
    num_languages = model.config.vocab_size - 51765 - int(is_multilingual)

    # 设置模型的生成配置
    model.generation_config = _get_generation_config(
        is_multilingual,
        num_languages,
        openai_version,
    )

    # 返回转换后的模型对象、是否多语言模型和语言数量
    return model, is_multilingual, num_languages


# 从 https://github.com/openai/tiktoken/issues/60#issuecomment-1499977960 适配而来
def _bpe(mergeable_ranks, token: bytes, max_rank=None) -> list[bytes]:
    # 将字节型 token 拆分为单独的字节部分
    parts = [bytes([b]) for b in token]
    # 返回拆分后的字节列表
    return parts
    # 进入无限循环，直到不再能够合并的情况
    while True:
        # 初始化最小索引和最小合并等级
        min_idx = None
        min_rank = None
        # 遍历 parts 列表中每对相邻元素的索引和元素组成的元组
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            # 获取当前两个元素可以合并的等级
            rank = mergeable_ranks.get(pair[0] + pair[1])
            # 如果可以合并的等级存在，并且比当前的最小等级小，则更新最小索引和最小等级
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx = i
                min_rank = rank
        # 如果找不到可以合并的等级，或者当前最小等级大于等于指定的最大等级，则跳出循环
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        # 确保最小索引不为空
        assert min_idx is not None
        # 合并 parts 列表中最小索引处的两个元素，并更新 parts 列表
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2 :]
    # 返回合并完成后的 parts 列表
    return parts
def convert_tiktoken_bpe_to_hf(tiktoken_url: str):
    # 载入指定 URL 的 TikToken BPE 排名数据
    bpe_ranks = load_tiktoken_bpe(tiktoken_url)
    # 创建字节到 Unicode 字符的映射
    byte_encoder = bytes_to_unicode()

    # 将字节表示的 Token 转换为字符串
    def token_bytes_to_string(b):
        return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")])

    # 初始化空列表和字典以保存词汇表和合并规则
    merges = []
    vocab = {}

    # 遍历 BPE 排名数据，将 Token 和其排名转换为字符串形式加入词汇表
    for token, rank in bpe_ranks.items():
        vocab[token_bytes_to_string(token)] = rank
        # 如果 Token 长度为 1，跳过后续步骤
        if len(token) == 1:
            continue
        # 通过 _bpe 函数获取合并后的 Token 对，并将其转换为字符串形式后加入合并规则列表
        merged = tuple(_bpe(bpe_ranks, token, max_rank=rank))
        if len(merged) == 2:  # 考虑空 Token
            merges.append(" ".join(map(token_bytes_to_string, merged)))

    # 返回词汇表和合并规则列表
    return vocab, merges


def convert_tiktoken_to_hf(
    multilingual: bool = True, num_languages: int = 100, time_precision=0.02
) -> WhisperTokenizer:
    # 根据多语言选项确定使用的 TikToken 文件路径
    tiktoken_tokenizer_path = _TOKENIZERS["multilingual" if multilingual else "english"]
    # 定义转录开始标记和控制标记列表
    start_of_transcript = ["<|endoftext|>", "<|startoftranscript|>"]
    control_tokens = [
        "<|translate|>",
        "<|transcribe|>",
        "<|startoflm|>",
        "<|startofprev|>",
        "<|nospeech|>",
        "<|notimestamps|>",
    ]
    # 定义非标准化的语言标记列表
    language_tokens = [f"<|{k}|>" for k in list(LANGUAGES)[:num_languages]]
    # 定义标准化的时间戳标记列表
    timestamp_tokens = [("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)]

    # 转换 TikToken 到 Hugging Face 格式的词汇表和合并规则
    vocab, merges = convert_tiktoken_bpe_to_hf(tiktoken_tokenizer_path)

    # 使用临时目录创建词汇表和合并规则文件，并初始化 WhisperTokenizer 对象
    with tempfile.TemporaryDirectory() as tmpdirname:
        vocab_file = f"{tmpdirname}/vocab.json"
        merge_file = f"{tmpdirname}/merges.txt"
        
        # 将词汇表写入 JSON 文件
        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        # 将合并规则写入文本文件
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens in merges:
                writer.write(bpe_tokens + "\n")

        # 初始化 WhisperTokenizer 对象，加载词汇表和合并规则文件
        hf_tokenizer = WhisperTokenizer(vocab_file, merge_file)

    # 向 WhisperTokenizer 对象添加特殊标记和时间戳标记
    hf_tokenizer.add_tokens(start_of_transcript + language_tokens + control_tokens, special_tokens=True)
    hf_tokenizer.add_tokens(timestamp_tokens, special_tokens=False)
    
    # 返回初始化后的 WhisperTokenizer 对象
    return hf_tokenizer


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # 必选参数
    parser.add_argument("--checkpoint_path", type=str, help="下载的检查点的路径")
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="输出 PyTorch 模型的路径.")
    parser.add_argument(
        "--convert_preprocessor",
        type=bool,
        default=False,
        help="是否将预处理器（分词器 + 特征提取器）与模型一起转换.",
    )
    args = parser.parse_args()

    # 转换 OpenAI Whisper 到 TensorFlow Model Specification
    model, is_multilingual, num_languages = convert_openai_whisper_to_tfms(
        args.checkpoint_path, args.pytorch_dump_folder_path
    )
    # 如果命令行参数中包含 convert_preprocessor 标志
    if args.convert_preprocessor:
        try:
            # 检查是否安装了 `tiktoken` 包
            if not _is_package_available("tiktoken"):
                # 如果未安装，抛出异常提醒用户安装 `tiktoken`
                raise """`tiktoken` is not installed, use `pip install tiktoken` to convert the tokenizer"""
        except Exception:
            # 捕获任何异常，不进行处理，继续执行后续代码
            pass
        else:
            # 如果没有抛出异常，导入 `load_tiktoken_bpe` 函数
            from tiktoken.load import load_tiktoken_bpe

            # 根据条件调用 convert_tiktoken_to_hf 函数生成 tokenizer
            tokenizer = convert_tiktoken_to_hf(is_multilingual, num_languages)
            # 创建 WhisperFeatureExtractor 实例，设置特征大小为模型配置的 num_mel_bins 值
            feature_extractor = WhisperFeatureExtractor(
                feature_size=model.config.num_mel_bins,
                # 其余默认参数与 openai/whisper 中硬编码的相同
            )
            # 使用 tokenizer 和 feature_extractor 创建 WhisperProcessor 实例
            processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
            # 将 processor 的预训练结果保存到指定的 pytorch_dump_folder_path 中
            processor.save_pretrained(args.pytorch_dump_folder_path)

            # 同时保存快速 tokenizer
            fast_tokenizer = WhisperTokenizerFast.from_pretrained(args.pytorch_dump_folder_path)
            fast_tokenizer.save_pretrained(args.pytorch_dump_folder_path, legacy_format=False)

    # 将模型的预训练结果保存到指定的 pytorch_dump_folder_path 中
    model.save_pretrained(args.pytorch_dump_folder_path)

`.\models\whisper\english_normalizer.py`

# 版权声明及引用信息
# Copyright 2022 The OpenAI team and The HuggingFace Team. All rights reserved.
# Most of the code is copy pasted from the original whisper repository
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 导入正则表达式和Unicode相关模块
import re
import unicodedata
from fractions import Fraction
from typing import Iterator, List, Match, Optional, Union

# 导入第三方正则表达式模块
import regex

# 非ASCII字母，不通过"NFKD"规范化分隔的其他附加重音符号和特殊字符映射表
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "th",
    "ł": "l",
    "Ł": "L",
}

def remove_symbols_and_diacritics(s: str, keep=""):
    """
    替换文本中的标记、符号和标点为空格，并且移除所有重音符号（类别为'Mn'）和一些手动映射的特殊字符
    """
    def replace_character(char):
        if char in keep:
            return char
        elif char in ADDITIONAL_DIACRITICS:
            return ADDITIONAL_DIACRITICS[char]
        elif unicodedata.category(char) == "Mn":
            return ""
        elif unicodedata.category(char)[0] in "MSP":
            return " "
        return char

    return "".join(replace_character(c) for c in unicodedata.normalize("NFKD", s))

def remove_symbols(s: str):
    """
    替换文本中的标记、符号和标点为空格，保留重音符号
    """
    return "".join(" " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKC", s))

class BasicTextNormalizer:
    """
    文本基础清理类，根据初始化参数移除重音符号和分隔字母
    """
    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        s = s.lower()
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # 移除括号内的单词
        s = re.sub(r"\(([^)]+?)\)", "", s)  # 移除括号内的内容
        s = self.clean(s).lower()

        if self.split_letters:
            s = " ".join(regex.findall(r"\X", s, regex.U))  # 按字母分隔文本

        s = re.sub(r"\s+", " ", s)  # 将连续的空白字符替换为单个空格

        return s

class EnglishNumberNormalizer:
    """
    英文数字标准化类，将文本中的英文数字转换为阿拉伯数字，并保留后缀，如`1960s`, `274th`, `32nd`等
    """
    def __init__(self):
        pass  # 此类不需要额外的初始化

    def __call__(self, s: str):
        """
        对输入的字符串进行处理，替换文本中的英文数字为阿拉伯数字，并保留后缀
        """
        s = s.lower()
        s = re.sub(r"\s+", " ", s)  # 将连续的空白字符替换为单个空格
        return s
    # spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
    # spell out `one` and `ones`
    # interpret successive single-digit numbers as nominal: `one oh one` -> `101`
    """
    This class provides methods for preprocessing and postprocessing text transformations related to numbers and currencies.
    """

    def preprocess(self, s: str):
        # replace "<number> and a half" with "<number> point five"
        results = []

        # Split the input string by the phrase "and a half"
        segments = re.split(r"\band\s+a\s+half\b", s)
        for i, segment in enumerate(segments):
            if len(segment.strip()) == 0:
                continue
            if i == len(segments) - 1:
                results.append(segment)
            else:
                results.append(segment)
                # Check if the last word in the segment is a decimal or a multiplier
                last_word = segment.rsplit(maxsplit=2)[-1]
                if last_word in self.decimals or last_word in self.multipliers:
                    results.append("point five")
                else:
                    results.append("and a half")

        s = " ".join(results)

        # Put a space at number/letter boundary
        s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
        s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)

        # Remove spaces which could be a suffix like "1st", "2nd", etc.
        s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)

        return s

    def postprocess(self, s: str):
        def combine_cents(m: Match):
            try:
                currency = m.group(1)
                integer = m.group(2)
                cents = int(m.group(3))
                # Combine currency, integer part, and cents with correct formatting
                return f"{currency}{integer}.{cents:02d}"
            except ValueError:
                return m.string

        def extract_cents(m: Match):
            try:
                # Extract cents from the matched pattern and format as cents symbol
                return f"¢{int(m.group(1))}"
            except ValueError:
                return m.string

        # Apply currency postprocessing: "$2 and ¢7" -> "$2.07"
        s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
        s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)

        # Replace "1(s)" with "one(s)" for readability
        s = re.sub(r"\b1(s?)\b", r"one\1", s)

        return s

    def __call__(self, s: str):
        # Process input string `s` through preprocessing, word processing, and postprocessing steps
        s = self.preprocess(s)
        s = " ".join(word for word in self.process_words(s.split()) if word is not None)
        s = self.postprocess(s)

        return s
class EnglishSpellingNormalizer:
    """
    Applies British-American spelling mappings as listed in [1].

    [1] https://www.tysto.com/uk-us-spelling-list.html
    """

    def __init__(self, english_spelling_mapping):
        # 初始化时接收一个英语拼写映射的字典
        self.mapping = english_spelling_mapping

    def __call__(self, s: str):
        # 在调用实例时，根据映射替换输入字符串中的单词
        return " ".join(self.mapping.get(word, word) for word in s.split())


class EnglishTextNormalizer:
    def __init__(self, english_spelling_mapping):
        # 忽略的模式，用于匹配需要保留原始形式的词语
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
        # 替换规则字典，用于将特定模式替换为标准化的形式
        self.replacers = {
            # 常见缩略语
            r"\bwon't\b": "will not",
            r"\bcan't\b": "can not",
            r"\blet's\b": "let us",
            r"\bain't\b": "aint",
            r"\by'all\b": "you all",
            r"\bwanna\b": "want to",
            r"\bgotta\b": "got to",
            r"\bgonna\b": "going to",
            r"\bi'ma\b": "i am going to",
            r"\bimma\b": "i am going to",
            r"\bwoulda\b": "would have",
            r"\bcoulda\b": "could have",
            r"\bshoulda\b": "should have",
            r"\bma'am\b": "madam",
            # 头衔和称谓中的缩略语
            r"\bmr\b": "mister ",
            r"\bmrs\b": "missus ",
            r"\bst\b": "saint ",
            r"\bdr\b": "doctor ",
            r"\bprof\b": "professor ",
            r"\bcapt\b": "captain ",
            r"\bgov\b": "governor ",
            r"\bald\b": "alderman ",
            r"\bgen\b": "general ",
            r"\bsen\b": "senator ",
            r"\brep\b": "representative ",
            r"\bpres\b": "president ",
            r"\brev\b": "reverend ",
            r"\bhon\b": "honorable ",
            r"\basst\b": "assistant ",
            r"\bassoc\b": "associate ",
            r"\blt\b": "lieutenant ",
            r"\bcol\b": "colonel ",
            r"\bjr\b": "junior ",
            r"\bsr\b": "senior ",
            r"\besq\b": "esquire ",
            # 完成时态的缩略语
            r"'d been\b": " had been",
            r"'s been\b": " has been",
            r"'d gone\b": " had gone",
            r"'s gone\b": " has gone",
            r"'d done\b": " had done",  # "'s done" is ambiguous
            r"'s got\b": " has got",
            # 一般缩略语
            r"n't\b": " not",
            r"'re\b": " are",
            r"'s\b": " is",
            r"'d\b": " would",
            r"'ll\b": " will",
            r"'t\b": " not",
            r"'ve\b": " have",
            r"'m\b": " am",
        }
        # 初始化时创建一个用于标准化数字的对象
        self.standardize_numbers = EnglishNumberNormalizer()
        # 初始化时创建一个用于标准化拼写的对象，传入英语拼写映射
        self.standardize_spellings = EnglishSpellingNormalizer(english_spelling_mapping)
    # 定义一个特殊方法 __call__，接受一个字符串参数 s，并将其转换为小写形式
    def __call__(self, s: str):
        # 将字符串 s 转换为小写形式
        s = s.lower()

        # 使用正则表达式去除尖括号或方括号中的内容
        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        
        # 使用正则表达式去除圆括号中的内容
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        
        # 使用预定义的忽略模式列表去除特定模式的内容
        s = re.sub(self.ignore_patterns, "", s)
        
        # 将空格后的撇号标准化为撇号
        s = re.sub(r"\s+'", "'", s)  # standardize when there's a space before an apostrophe

        # 根据预定义的替换规则字典，依次替换字符串 s 中的模式
        for pattern, replacement in self.replacers.items():
            s = re.sub(pattern, replacement, s)

        # 去除数字之间的逗号
        s = re.sub(r"(\d),(\d)", r"\1\2", s)  # remove commas between digits
        
        # 去除句点后非数字字符的句点
        s = re.sub(r"\.([^0-9]|$)", r" \1", s)  # remove periods not followed by numbers
        
        # 调用外部函数 remove_symbols_and_diacritics 去除 s 中的特定符号和变音符号，保留一些特定符号用于数字
        s = remove_symbols_and_diacritics(s, keep=".%$¢€£")  # keep some symbols for numerics

        # 使用对象内定义的方法 standardize_numbers 对 s 中的数字进行标准化处理
        s = self.standardize_numbers(s)
        
        # 使用对象内定义的方法 standardize_spellings 对 s 中的拼写进行标准化处理
        s = self.standardize_spellings(s)

        # 去除非数字前后的前缀/后缀符号，如 . $ ¢ € £
        s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
        
        # 去除非数字前的百分号，将其前后加上空格
        s = re.sub(r"([^0-9])%", r"\1 ", s)

        # 将任意连续的空白字符替换为单个空格
        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space

        # 返回处理后的字符串 s
        return s

`.\models\whisper\feature_extraction_whisper.py`

# 设置文件编码为 UTF-8，确保支持非英文字符的正确解析
# 版权声明，使用 Apache 许可证 2.0 版本
#
# 根据 Apache 许可证 2.0 版本规定，除非符合许可证要求，否则禁止使用本文件中的代码
#
# 引入必要的库和模块
"""
Feature extractor class for Whisper
"""
# 引入类型提示相关模块
from typing import List, Optional, Union

# 引入 NumPy 库，并使用 np 别名
import numpy as np

# 引入 Hugging Face 提供的 is_torch_available 函数，用于检查是否安装了 Torch
from ... import is_torch_available

# 从 Hugging Face 的 audio_utils 模块中引入 mel_filter_bank、spectrogram 和 window_function 函数
from ...audio_utils import mel_filter_bank, spectrogram, window_function

# 从 feature_extraction_sequence_utils 模块中引入 SequenceFeatureExtractor 类
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor

# 从 feature_extraction_utils 模块中引入 BatchFeature 类
from ...feature_extraction_utils import BatchFeature

# 从 utils 模块中引入 TensorType 和 logging 函数
from ...utils import TensorType, logging

# 如果 Torch 可用，导入 Torch 库
if is_torch_available():
    import torch

# 从 logging 模块中获取 logger 对象，并命名为 __name__，用于日志记录
logger = logging.get_logger(__name__)

# 定义 WhisperFeatureExtractor 类，继承自 SequenceFeatureExtractor 类
class WhisperFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a Whisper feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
    Fourier Transform` which should match pytorch's `torch.stft` equivalent.

    Args:
        feature_size (`int`, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        hop_length (`int`, defaults to 160):
            Length of the overlaping windows for the STFT used to obtain the Mel Frequency coefficients.
        chunk_length (`int`, defaults to 30):
            The maximum number of chuncks of `sampling_rate` samples used to trim and pad longer or shorter audio
            sequences.
        n_fft (`int`, defaults to 400):
            Size of the Fourier transform.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
    """

    # 类变量 model_input_names，指定输入模型的名称为 "input_features"
    model_input_names = ["input_features"]

    # 初始化方法，用于创建 WhisperFeatureExtractor 实例
    def __init__(
        self,
        feature_size=80,
        sampling_rate=16000,
        hop_length=160,
        chunk_length=30,
        n_fft=400,
        padding_value=0.0,
        return_attention_mask=False,  # pad inputs to max length with silence token (zero) and no attention mask
        **kwargs,
    ):
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            return_attention_mask=return_attention_mask,
            **kwargs,
        )
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.chunk_length = chunk_length
        self.n_samples = chunk_length * sampling_rate
        self.nb_max_frames = self.n_samples // hop_length
        self.sampling_rate = sampling_rate
        self.mel_filters = mel_filter_bank(
            num_frequency_bins=1 + n_fft // 2,
            num_mel_filters=feature_size,
            min_frequency=0.0,
            max_frequency=8000.0,
            sampling_rate=sampling_rate,
            norm="slaney",
            mel_scale="slaney",
        )


    def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
        """
        Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch
        implementation with 1e-5 tolerance.
        """
        log_spec = spectrogram(
            waveform,
            window_function(self.n_fft, "hann"),
            frame_length=self.n_fft,
            hop_length=self.hop_length,
            power=2.0,
            mel_filters=self.mel_filters,
            log_mel="log10",
        )
        log_spec = log_spec[:, :-1]  # Remove the last frame to match expected shape
        log_spec = np.maximum(log_spec, log_spec.max() - 8.0)  # Clamp values to ensure numerical stability
        log_spec = (log_spec + 4.0) / 4.0  # Scale values to a range suitable for neural network input
        return log_spec


    def _torch_extract_fbank_features(self, waveform: np.array) -> np.ndarray:
        """
        Compute the log-mel spectrogram of the provided audio using the PyTorch STFT implementation.
        """
        waveform = torch.from_numpy(waveform).type(torch.float32)  # Convert waveform to a PyTorch tensor of float32 type

        window = torch.hann_window(self.n_fft)  # Create a Hann window tensor for STFT
        stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)  # Compute STFT of waveform

        magnitudes = stft[..., :-1].abs() ** 2  # Compute magnitude squared of STFT, excluding the last frame

        mel_filters = torch.from_numpy(self.mel_filters).type(torch.float32)  # Convert mel filters to PyTorch tensor
        mel_spec = mel_filters.T @ magnitudes  # Apply mel filters to the magnitude spectrogram

        log_spec = torch.clamp(mel_spec, min=1e-10).log10()  # Apply logarithm after clamping for numerical stability
        log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)  # Clamp values to ensure numerical stability
        log_spec = (log_spec + 4.0) / 4.0  # Scale values to a range suitable for neural network input
        return log_spec.numpy()  # Convert back to NumPy array for compatibility


    @staticmethod
    # Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm
    def zero_mean_unit_var_norm(
        input_values: List[np.ndarray], attention_mask: List[np.ndarray], padding_value: float = 0.0
    ) -> List[np.ndarray]:
        """
        Every array in the list is normalized to have zero mean and unit variance
        """
        # 如果提供了注意力掩码，则将其转换为 np.int32 类型的数组
        if attention_mask is not None:
            attention_mask = np.array(attention_mask, np.int32)
            # 初始化一个空列表来存储归一化后的输入值
            normed_input_values = []

            # 遍历输入值列表和对应的注意力掩码长度
            for vector, length in zip(input_values, attention_mask.sum(-1)):
                # 计算当前向量的均值和方差，并进行归一化处理
                normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
                # 如果当前向量长度小于归一化后的切片长度，则填充指定的 padding_value
                if length < normed_slice.shape[0]:
                    normed_slice[length:] = padding_value

                # 将归一化后的切片添加到结果列表中
                normed_input_values.append(normed_slice)
        else:
            # 如果没有提供注意力掩码，则对输入值列表中的每个数组进行归一化处理
            normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]

        # 返回归一化后的输入值列表
        return normed_input_values

    def __call__(
        self,
        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        truncation: bool = True,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_attention_mask: Optional[bool] = None,
        padding: Optional[str] = "max_length",
        max_length: Optional[int] = None,
        sampling_rate: Optional[int] = None,
        do_normalize: Optional[bool] = None,
        **kwargs,

posted @ 2024-07-01 10:53 绝不原创的飞龙阅读(182) 评论(0) 收藏举报

刷新页面返回顶部

龙哥盟

掠夺·扩张·投机·博弈

Transformers-源码解析-一百二十三-

Transformers 源码解析（一百二十三）

`.\models\wav2vec2_conformer\init.py`

`.\models\wav2vec2_phoneme\tokenization_wav2vec2_phoneme.py`

`.\models\wav2vec2_phoneme\init.py`

`.\models\wav2vec2_with_lm\processing_wav2vec2_with_lm.py`

`.\models\wav2vec2_with_lm\init.py`

`.\models\wavlm\configuration_wavlm.py`

`.\models\wavlm\convert_wavlm_original_pytorch_checkpoint_to_pytorch.py`

`.\models\wavlm\convert_wavlm_original_s3prl_checkpoint_to_pytorch.py`

`.\models\wavlm\modeling_wavlm.py`

`.\models\wavlm\init.py`

`.\models\whisper\configuration_whisper.py`

`.\models\whisper\convert_openai_to_hf.py`

`.\models\whisper\english_normalizer.py`

`.\models\whisper\feature_extraction_whisper.py`

公告

龙哥盟

掠夺·扩张·投机·博弈

Transformers-源码解析-一百二十三-

Transformers 源码解析（一百二十三）

.\models\wav2vec2_conformer\__init__.py

.\models\wav2vec2_phoneme\tokenization_wav2vec2_phoneme.py

.\models\wav2vec2_phoneme\__init__.py

.\models\wav2vec2_with_lm\processing_wav2vec2_with_lm.py

.\models\wav2vec2_with_lm\__init__.py

.\models\wavlm\configuration_wavlm.py

.\models\wavlm\convert_wavlm_original_pytorch_checkpoint_to_pytorch.py

.\models\wavlm\convert_wavlm_original_s3prl_checkpoint_to_pytorch.py

.\models\wavlm\modeling_wavlm.py

.\models\wavlm\__init__.py

.\models\whisper\configuration_whisper.py

.\models\whisper\convert_openai_to_hf.py

.\models\whisper\english_normalizer.py

.\models\whisper\feature_extraction_whisper.py

公告

`.\models\wav2vec2_conformer\init.py`

`.\models\wav2vec2_phoneme\tokenization_wav2vec2_phoneme.py`

`.\models\wav2vec2_phoneme\init.py`

`.\models\wav2vec2_with_lm\processing_wav2vec2_with_lm.py`

`.\models\wav2vec2_with_lm\init.py`

`.\models\wavlm\configuration_wavlm.py`

`.\models\wavlm\convert_wavlm_original_pytorch_checkpoint_to_pytorch.py`

`.\models\wavlm\convert_wavlm_original_s3prl_checkpoint_to_pytorch.py`

`.\models\wavlm\modeling_wavlm.py`

`.\models\wavlm\init.py`

`.\models\whisper\configuration_whisper.py`

`.\models\whisper\convert_openai_to_hf.py`

`.\models\whisper\english_normalizer.py`

`.\models\whisper\feature_extraction_whisper.py`