.\models\deprecated\mmbt\__init__.py
# Copyright notice: copyright held by the HuggingFace Team, licensed under the Apache License, Version 2.0.
# You may use this file only in compliance with the License.
# The full license text is available at the URL given in the original header.
from typing import TYPE_CHECKING
# Import the optional-dependency exception and the lazy-module helper
from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
# Define the import structure: the submodules and the names they export
_import_structure = {"configuration_mmbt": ["MMBTConfig"]}
# Check whether Torch (PyTorch) is available; raise OptionalDependencyNotAvailable if it is not
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # Torch is available, so register the modeling module in the import structure
    _import_structure["modeling_mmbt"] = ["MMBTForClassification", "MMBTModel", "ModalEmbeddings"]
# During type checking, import everything eagerly so static analyzers can resolve the names
if TYPE_CHECKING:
    # Import the MMBTConfig class from the configuration module
    from .configuration_mmbt import MMBTConfig

    # Check Torch availability again; if it is missing, silently skip the modeling imports
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Import MMBTForClassification, MMBTModel, and ModalEmbeddings from the modeling module
        from .modeling_mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings
# At runtime (not type checking), install a lazy-loading proxy in place of this module
else:
    import sys

    # Replace the current module with a _LazyModule so submodules are imported on first access
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
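# A minimal sketch (not part of the original file) of what the lazy-module pattern buys: importing the
# package is cheap, and a heavy submodule such as modeling_mmbt (which pulls in torch) is only imported
# when one of its attributes is first accessed. Assumes a transformers version that still ships MMBT.
import transformers.models.deprecated.mmbt as mmbt

print(type(mmbt))  # a _LazyModule proxy, not a plain module
config_cls = mmbt.MMBTConfig  # this attribute access triggers the real import of configuration_mmbt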
.\models\deprecated\open_llama\configuration_open_llama.py
# coding=utf-8
# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Open-Llama model configuration"""
# Import the PretrainedConfig base class
from ....configuration_utils import PretrainedConfig
# Import the logging utility
from ....utils import logging

# Module-level logger
logger = logging.get_logger(__name__)

# Map each pretrained Open-Llama model name to the URL of its configuration file
OPEN_LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"s-JoL/Open-Llama-V1": "https://huggingface.co/s-JoL/Open-Llama-V1/blob/main/config.json",
}
# OpenLlamaConfig stores the configuration of an Open-Llama model; it inherits from PretrainedConfig
class OpenLlamaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`OpenLlamaModel`]. It is used to instantiate an
Open-Llama model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the
[s-JoL/Open-Llama-V1](https://huggingface.co/s-JoL/Open-Llama-V1).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 100000):
            Vocabulary size of the Open-Llama model. Defines the number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`OpenLlamaModel`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Typically set this to something
            large just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated normal initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie the input and output word embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
            is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum. See the following thread for more information on
            how these scaling strategies behave:
            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is
            an experimental feature, subject to breaking API changes in future versions.

    Example:

    ```python
    >>> from transformers import OpenLlamaModel, OpenLlamaConfig

    >>> # Initializing a Open-Llama open_llama-7b style configuration
    >>> configuration = OpenLlamaConfig()

    >>> # Initializing a model from the open_llama-7b style configuration
    >>> model = OpenLlamaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    # Model type identifier used by the auto classes
    model_type = "open-llama"

    # Constructor taking the individual configuration parameters
    def __init__(
        self,
        vocab_size=100000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        use_memory_efficient_attention=True,
        hidden_dropout_prob=0.1,
        attention_dropout_prob=0.1,
        use_stable_embedding=True,
        shared_input_output_embedding=True,
        rope_scaling=None,
        **kwargs,
    ):
        # Store the configuration values on the instance
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        # Honor the historical (misspelled) "use_memorry_efficient_attention" kwarg if a saved config or a
        # caller still passes it; otherwise fall back to the properly spelled argument
        self.use_memory_efficient_attention = kwargs.pop(
            "use_memorry_efficient_attention", use_memory_efficient_attention
        )
        # Dropout probabilities for the hidden states and the attention probabilities
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_dropout_prob = attention_dropout_prob
        # Whether to use the stable embedding variant (a LayerNorm after the token embedding)
        self.use_stable_embedding = use_stable_embedding
        # Whether the input and output embeddings are shared
        self.shared_input_output_embedding = shared_input_output_embedding
        # Optional RoPE scaling configuration
        self.rope_scaling = rope_scaling
        # Validate the `rope_scaling` argument
        self._rope_scaling_validation()

        # Call the parent constructor with the special token ids and any remaining keyword arguments
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
    # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.
        """
        # Nothing to validate when `rope_scaling` is not set
        if self.rope_scaling is None:
            return

        # `rope_scaling` must be a dict with exactly the two fields `type` and `factor`
        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
            raise ValueError(
                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
                f"got {self.rope_scaling}"
            )
        # Read the `type` and `factor` fields from the rope_scaling dict
        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)
        # The `type` field must be either "linear" or "dynamic"
        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
            raise ValueError(
                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
            )
        # The `factor` field must be a float strictly greater than 1
        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
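# A quick sketch (not part of the original file) of the validation above in action; it assumes a
# transformers version that still ships these deprecated models under this import path.
from transformers.models.deprecated.open_llama.configuration_open_llama import OpenLlamaConfig

# A well-formed rope_scaling dict passes validation
cfg = OpenLlamaConfig(rope_scaling={"type": "linear", "factor": 2.0})

# A factor of 1.0 (not > 1) raises ValueError, as does an unknown type
try:
    OpenLlamaConfig(rope_scaling={"type": "linear", "factor": 1.0})
except ValueError as err:
    print(err)  # `rope_scaling`'s factor field must be a float > 1, got 1.0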
.\models\deprecated\open_llama\modeling_open_llama.py
# coding=utf-8
# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and on the GPT-NeoX and OPT implementations in this
# library. It has been modified from its original forms to accommodate minor architectural differences
# compared to the model that the Meta AI team trained.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
""" PyTorch Open-Llama model."""
import math
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from ....activations import ACT2FN  # Activation function registry
from ....modeling_attn_mask_utils import _prepare_4d_causal_attention_mask  # Attention-mask helper
from ....modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from ....modeling_utils import PreTrainedModel  # Base class for all pretrained models
from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
from .configuration_open_llama import OpenLlamaConfig  # Configuration class for Open-Llama

logger = logging.get_logger(__name__)  # Module-level logger
try:
    from xformers import ops as xops  # Optional dependency used for memory-efficient attention
except ImportError:
    xops = None

_CONFIG_FOR_DOC = "OpenLlamaConfig"  # Configuration name referenced in the docstrings


# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->OpenLlama
class OpenLlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        OpenLlamaRMSNorm is equivalent to T5LayerNorm.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))  # Learnable scale parameter
        self.variance_epsilon = eps  # Small constant guarding against division by zero

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype  # Remember the input dtype
        hidden_states = hidden_states.to(torch.float32)  # Compute the statistics in float32
        variance = hidden_states.pow(2).mean(-1, keepdim=True)  # Mean of squares over the hidden dimension
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)  # Normalize
        return self.weight * hidden_states.to(input_dtype)  # Scale and cast back to the input dtype
# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->OpenLlama
class OpenLlamaRotaryEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        self.dim = dim  # Rotary dimension (the per-head dimension)
        self.max_position_embeddings = max_position_embeddings  # Longest sequence covered by the cache
        self.base = base  # Base of the geometric progression of frequencies
        # Inverse frequencies, one per pair of rotary dimensions, moved to the requested device
        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
        # Register the inverse frequencies as a non-persistent buffer
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build the cosine and sine caches here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        # Remember the longest sequence length the cache covers
        self.max_seq_len_cached = seq_len
        # Positions 0..seq_len-1, cast to the same dtype as the inverse frequencies
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
        # Outer product: one row of angles per position
        freqs = torch.outer(t, self.inv_freq)
        # Duplicate the angles along the last dimension so they cover the full head dimension
        emb = torch.cat((freqs, freqs), dim=-1)
        # Cache the cosines and sines as non-persistent buffers in the requested dtype
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        # Grow the cache if the requested sequence length exceeds what has been cached
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        # Return the cached cosines and sines, cast to the dtype of the input
        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )
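# A small sketch (not part of the original file) of the cache shapes produced above: for head_dim=8 and
# four positions, the returned cos/sin tensors each have shape (seq_len, head_dim).
rope_demo = OpenLlamaRotaryEmbedding(dim=8, max_position_embeddings=16)
dummy = torch.zeros(1, 1, 4, 8)  # [bs, heads, seq_len, head_dim]
cos_demo, sin_demo = rope_demo(dummy, seq_len=4)
print(cos_demo.shape, sin_demo.shape)  # torch.Size([4, 8]) torch.Size([4, 8])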
# Copied from transformers.models.falcon.modeling_falcon.FalconLinearScalingRotaryEmbedding with Falcon->OpenLlama
class OpenLlamaLinearScalingRotaryEmbedding(OpenLlamaRotaryEmbedding):
    """OpenLlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        # Positions 0..seq_len-1, divided by the scaling factor so long sequences reuse the trained range
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
        t = t / self.scaling_factor
        # Outer product giving a (seq_len, dim/2) matrix of angles
        freqs = torch.outer(t, self.inv_freq)
        # Different from the paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        # Cache the cosines and sines as non-persistent buffers in the requested dtype
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
# Copied from transformers.models.falcon.modeling_falcon.FalconDynamicNTKScalingRotaryEmbedding with Falcon->OpenLlama
class OpenLlamaDynamicNTKScalingRotaryEmbedding(OpenLlamaRotaryEmbedding):
    """OpenLlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
        self.scaling_factor = scaling_factor
        super().__init__(dim, max_position_embeddings, base, device)

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len

        # When the sequence outgrows max_position_embeddings, recompute the base (and hence inv_freq)
        if seq_len > self.max_position_embeddings:
            base = self.base * (
                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
            ) ** (self.dim / (self.dim - 2))
            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
            self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Positions 0..seq_len-1, cast to the same dtype as the inverse frequencies
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
        # Outer product giving a (seq_len, dim/2) matrix of angles
        freqs = torch.outer(t, self.inv_freq)
        # Different from the paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        # Cache the cosines and sines as non-persistent buffers in the requested dtype
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
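# A worked sketch (not part of the original file) of the dynamic-NTK base rescaling above: with
# base=10000, dim=8, max_position_embeddings=2048, scaling_factor=1.0 and seq_len=4096, the new base is
# 10000 * (4096 / 2048) ** (8 / 6) = 10000 * 2 ** (4/3), which stretches all rotary wavelengths.
base_d, dim_d, max_pos_d, factor_d, seq_len_d = 10000.0, 8, 2048, 1.0, 4096
new_base = base_d * ((factor_d * seq_len_d / max_pos_d) - (factor_d - 1)) ** (dim_d / (dim_d - 2))
print(round(new_base))  # ~25198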
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    # Split x into its first and second halves along the last dimension, then swap them, negating the second half
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)
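# A tiny check (not part of the original file): rotate_half maps (x1, x2) to (-x2, x1), the 90-degree
# rotation used by the rotary position embedding below.
print(rotate_half(torch.tensor([1.0, 2.0, 3.0, 4.0])))  # tensor([-3., -4.,  1.,  2.])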
# Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
"""Applies Rotary Position Embedding to the query and key tensors.
Args:
q (`torch.Tensor`): The query tensor.
k (`torch.Tensor`): The key tensor.
cos (`torch.Tensor`): The cosine part of the rotary embedding.
sin (`torch.Tensor`): The sine part of the rotary embedding.
position_ids (`torch.Tensor`):
The position indices of the tokens corresponding to the query and key tensors. For example, this can be
used to pass offsetted position ids when working with a KV-cache.
unsqueeze_dim (`int`, *optional*, defaults to 1):
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
"""
# Unsqueezes the cosine embeddings along the specified dimension to match the shape of q and k
cos = cos[position_ids].unsqueeze(unsqueeze_dim)
# Unsqueezes the sine embeddings along the specified dimension to match the shape of q and k
sin = sin[position_ids].unsqueeze(unsqueeze_dim)
# Applies rotary position embedding to the query tensor q
q_embed = (q * cos) + (rotate_half(q) * sin)
# Applies rotary position embedding to the key tensor k
k_embed = (k * cos) + (rotate_half(k) * sin)
return q_embed, k_embed
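# A short sketch (not part of the original file) applying the rotary embedding end to end with the
# helpers above; shapes follow the [bs, heads, seq_len, head_dim] convention used in this file.
bs, heads, seq_len, head_dim = 1, 2, 4, 8
q = torch.randn(bs, heads, seq_len, head_dim)
k = torch.randn(bs, heads, seq_len, head_dim)
rope = OpenLlamaRotaryEmbedding(dim=head_dim, max_position_embeddings=16)
cos, sin = rope(q, seq_len=seq_len)
position_ids = torch.arange(seq_len).unsqueeze(0)  # [bs, seq_len]
q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
print(q_rot.shape, k_rot.shape)  # both torch.Size([1, 2, 4, 8])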
class OpenLlamaMLP(nn.Module):
def __init__(
self,
hidden_size: int,
intermediate_size: int,
hidden_act: str,
dropout_prob: float,
):
"""Initialize the OpenLlamaMLP module.
Args:
hidden_size (int): The size of the input and output hidden layers.
intermediate_size (int): The size of the intermediate layer.
hidden_act (str): The activation function to be used in the hidden layers.
dropout_prob (float): The dropout probability for regularization.
"""
super().__init__()
# Linear transformation for the gating mechanism
self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
# Linear transformation for the downsampling projection
self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
# Linear transformation for the upsampling projection
self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
# Activation function for the hidden layers
self.act_fn = ACT2FN[hidden_act]
# Dropout layer for regularization
self.dropout = nn.Dropout(dropout_prob)
def forward(self, x):
# Applies gating projection, activation function, and upsampling projection to input x,
# then applies downsampling projection and returns the result after dropout
out = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
return self.dropout(out)
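# A minimal sketch (not part of the original file) of the gated-MLP ("SwiGLU"-style) computation above:
# down_proj(silu(gate_proj(x)) * up_proj(x)), followed by dropout. Dropout is set to 0 for determinism.
mlp = OpenLlamaMLP(hidden_size=16, intermediate_size=32, hidden_act="silu", dropout_prob=0.0)
y = mlp(torch.randn(2, 5, 16))
print(y.shape)  # torch.Size([2, 5, 16])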
class OpenLlamaAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
    # Constructor taking an OpenLlamaConfig instance
    def __init__(self, config: OpenLlamaConfig):
        super().__init__()
        self.config = config  # Keep a reference to the configuration
        self.hidden_size = config.hidden_size  # Model hidden size
        self.num_heads = config.num_attention_heads  # Number of attention heads
        self.head_dim = self.hidden_size // self.num_heads  # Dimension of each head
        self.max_position_embeddings = config.max_position_embeddings  # Maximum sequence length
        self.dropout_prob = config.attention_dropout_prob  # Attention dropout probability

        # The hidden size must be an exact multiple of the number of heads
        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )
        # Linear projections for the queries, keys, values, and the attention output
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
        # Set up the rotary position embedding
        self._init_rope()
    # Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->OpenLlama
    # Set up the RoPE (Rotary Position Embedding) module
    def _init_rope(self):
        # Without a rope_scaling config, use the plain rotary embedding
        if self.config.rope_scaling is None:
            self.rotary_emb = OpenLlamaRotaryEmbedding(
                self.head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            # Otherwise pick the scaled rotary embedding matching the configured strategy
            scaling_type = self.config.rope_scaling["type"]
            scaling_factor = self.config.rope_scaling["factor"]
            if scaling_type == "linear":
                self.rotary_emb = OpenLlamaLinearScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            elif scaling_type == "dynamic":
                self.rotary_emb = OpenLlamaDynamicNTKScalingRotaryEmbedding(
                    self.head_dim,
                    max_position_embeddings=self.max_position_embeddings,
                    scaling_factor=scaling_factor,
                    base=self.rope_theta,
                )
            else:
                # Reject unknown scaling strategies
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    # Reshape the tensor to (batch_size, num_heads, seq_len, head_dim)
    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    # Forward pass over the hidden states; the attention computation itself is elided in this excerpt
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ):
        ...
class OpenLlamaDecoderLayer(nn.Module):
def __init__(self, config: OpenLlamaConfig):
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = OpenLlamaAttention(config=config)
self.mlp = OpenLlamaMLP(
hidden_size=self.hidden_size,
intermediate_size=config.intermediate_size,
hidden_act=config.hidden_act,
dropout_prob=config.hidden_dropout_prob,
)
self.input_layernorm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_attention_layernorm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
"""
residual = hidden_states
# Layer normalization on the input states
hidden_states = self.input_layernorm(hidden_states)
# Self Attention mechanism
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
# Residual connection
hidden_states = residual + hidden_states
# Fully Connected Feedforward Network
residual = hidden_states
# Layer normalization after the attention mechanism
hidden_states = self.post_attention_layernorm(hidden_states)
# Multilayer Perceptron transformation
hidden_states = self.mlp(hidden_states)
# Residual connection
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
OPEN_LLAMA_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`OpenLlamaConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare Open-Llama Model outputting raw hidden-states without any specific head on top.",
OPEN_LLAMA_START_DOCSTRING,
)
class OpenLlamaPreTrainedModel(PreTrainedModel):
    # Configuration class associated with this model
    config_class = OpenLlamaConfig
    # Prefix used for the base model inside composite models
    base_model_prefix = "model"
    # This model supports gradient checkpointing
    supports_gradient_checkpointing = True
    # Modules that must not be split across devices
    _no_split_modules = ["OpenLlamaDecoderLayer"]

    def _init_weights(self, module):
        # Initialize the weights of the given module
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            if self.config.use_stable_embedding:
                torch.nn.init.xavier_normal_(module.weight.data)
            else:
                module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
OPEN_LLAMA_INPUTS_DOCSTRING = r"""
"""
@add_start_docstrings(
"The bare Open-Llama Model outputting raw hidden-states without any specific head on top.",
OPEN_LLAMA_START_DOCSTRING,
)
class OpenLlamaModel(OpenLlamaPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OpenLlamaDecoderLayer`]
Args:
config: OpenLlamaConfig
"""
    def __init__(self, config: OpenLlamaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        # Token embedding layer; with use_stable_embedding a LayerNorm is applied on top of it
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        if config.use_stable_embedding:
            self.embed_layer_norm = nn.LayerNorm(config.hidden_size)
        else:
            self.embed_layer_norm = None
        # Stack of decoder layers, one per configured hidden layer
        self.layers = nn.ModuleList([OpenLlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
        # Final RMS normalization
        self.norm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
@add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # The forward body is elided in this excerpt
        ...


# The class header below is reconstructed; this excerpt jumped straight into the constructor
class OpenLlamaForCausalLM(OpenLlamaPreTrainedModel):
    # Constructor taking a configuration object
    def __init__(self, config):
        # Call the parent constructor with the configuration
        super().__init__(config)
        # The underlying decoder-only transformer
        self.model = OpenLlamaModel(config)
        # Depending on the configuration, either share the input embeddings as the output head or create lm_head
        if config.shared_input_output_embedding:
            self.lm_head = None  # With shared input/output embeddings there is no separate lm_head
        else:
            # Otherwise, a bias-free linear layer mapping hidden_size to vocab_size
            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    # Return the model's input embedding layer
    def get_input_embeddings(self):
        return self.model.embed_tokens

    # Set the model's input embedding layer
    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    # Return the model's output embedding layer (lm_head)
    def get_output_embeddings(self):
        return self.lm_head

    # Set the model's output embedding layer (lm_head)
    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    # Set the model's decoder
    def set_decoder(self, decoder):
        self.model = decoder

    # Return the model's decoder
    def get_decoder(self):
        return self.model
    # Forward pass, decorated to attach the inputs docstring and the return-type docstring
    @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        # The forward body is elided in this excerpt
        ...

    # Prepare the inputs for generation from input_ids, past_key_values, attention_mask, and inputs_embeds
    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        # If past key values are provided, the prompt prefix has already been processed
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                # Remove exactly the prefix that is covered by the cache
                remove_prefix_length = past_length
            else:
                # Default to the old behavior: keep only the final ID
                remove_prefix_length = input_ids.shape[1] - 1

            # Keep only the part of input_ids that still needs processing
            input_ids = input_ids[:, remove_prefix_length:]

        # Fetch position ids from kwargs if provided
        position_ids = kwargs.get("position_ids", None)
        # If an attention mask is given but position ids are not, derive them
        if attention_mask is not None and position_ids is None:
            # Create position ids on the fly for batched generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                # With a cache present, keep only the position ids matching the trimmed input_ids
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # If `inputs_embeds` are passed, use them only in the first generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        # Assemble the remaining model inputs
        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
            }
        )
        return model_inputs
    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        # Reorder the cached key/value states to follow the selected beams
        for layer_past in past_key_values:
            reordered_past += (
                # For every layer, gather the past states along the batch dimension according to beam_idx
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past
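# A tiny sketch (not part of the original file) of what _reorder_cache does: gather the batch dimension
# of every cached tensor so each beam keeps the history of the beam it was forked from.
layer_past = (torch.arange(6).view(3, 1, 2, 1), torch.arange(6).view(3, 1, 2, 1))  # batch of 3
beam_idx = torch.tensor([2, 0, 0])  # beam 0 continues from batch row 2, beams 1 and 2 from row 0
reordered = OpenLlamaForCausalLM._reorder_cache((layer_past,), beam_idx)
print(reordered[0][0][:, 0, :, 0])  # rows now ordered as [row2, row0, row0]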
# The docstring added via the decorator describes the LLaMa transformer with a sequence-classification
# head (a linear layer) on top. OpenLlamaForSequenceClassification uses the last token for classification,
# as other causal models (e.g. GPT-2) do. Since it classifies on the last token, it needs to know that
# token's position: if a pad_token_id is configured, the last non-padding token in each row is used; if no
# pad_token_id is defined, the last value in each row of the batch is taken. The same fallback applies
# when inputs_embeds are passed instead of input_ids, because padding tokens cannot be inferred then.
class OpenLlamaForSequenceClassification(OpenLlamaPreTrainedModel):
    def __init__(self, config):
        # Call the parent constructor with the configuration
        super().__init__(config)
        # Number of classification labels
        self.num_labels = config.num_labels
        # The underlying Open-Llama model
        self.model = OpenLlamaModel(config)
        # Bias-free linear layer mapping the hidden states to the label scores
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    # Return the input embeddings
    def get_input_embeddings(self):
        return self.model.embed_tokens

    # Set the input embeddings
    def set_input_embeddings(self, value):
        self.model.embed_tokens = value
    # Forward pass, decorated to attach the inputs docstring; the body is elided in this excerpt
    @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        ...
.\models\deprecated\open_llama\__init__.py
# TYPE_CHECKING is used to make heavy imports visible only to static type checkers
from typing import TYPE_CHECKING

# Import the required dependency checks and the lazy-module helper
from ....utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_sentencepiece_available,
    is_tokenizers_available,
    is_torch_available,
)

# Define the module's import structure
_import_structure = {
"configuration_open_llama": ["OPEN_LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "OpenLlamaConfig"],
}
# Check whether sentencepiece is available; raise OptionalDependencyNotAvailable if not
try:
    if not is_sentencepiece_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If available, add tokenization_open_llama to the import structure
    _import_structure["tokenization_open_llama"] = ["LlamaTokenizer"]

# Check whether tokenizers is available; raise OptionalDependencyNotAvailable if not
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If available, add tokenization_open_llama_fast to the import structure
    _import_structure["tokenization_open_llama_fast"] = ["LlamaTokenizerFast"]

# Check whether torch is available; raise OptionalDependencyNotAvailable if not
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If available, add modeling_open_llama to the import structure
    _import_structure["modeling_open_llama"] = [
        "OpenLlamaForCausalLM",
        "OpenLlamaModel",
        "OpenLlamaPreTrainedModel",
        "OpenLlamaForSequenceClassification",
    ]
# During type checking, perform the imports eagerly
if TYPE_CHECKING:
    # Import the configuration map and the configuration class
    from .configuration_open_llama import OPEN_LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenLlamaConfig

    try:
        # Skip the slow tokenizer import when sentencepiece is unavailable
        if not is_sentencepiece_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Import the LlamaTokenizer class (Open-Llama reuses the Llama tokenizer)
        from transformers import LlamaTokenizer

    try:
        # Skip the fast tokenizer import when tokenizers is unavailable
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Import the LlamaTokenizerFast class
        from transformers import LlamaTokenizerFast

    try:
        # Skip the modeling imports when torch is unavailable
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # Import the classes from the modeling_open_llama module
        from .modeling_open_llama import (
            OpenLlamaForCausalLM,
            OpenLlamaForSequenceClassification,
            OpenLlamaModel,
            OpenLlamaPreTrainedModel,
        )

# At runtime, install a _LazyModule so submodules load on first attribute access
else:
    import sys

    # Use a _LazyModule for deferred loading of the submodules
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deprecated\retribert\configuration_retribert.py
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" RetriBERT model configuration"""
# Import the PretrainedConfig base class and the logging utility
from ....configuration_utils import PretrainedConfig
from ....utils import logging
# Module-level logger
logger = logging.get_logger(__name__)
# Map from pretrained model name to the URL of its configuration file
RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"yjernite/retribert-base-uncased": (
"https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/config.json"
),
}
# RetriBertConfig stores the configuration of a RetriBertModel; it inherits from PretrainedConfig
class RetriBertConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`RetriBertModel`]. It is used to instantiate a
RetriBertModel model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the RetriBERT
[yjernite/retribert-base-uncased](https://huggingface.co/yjernite/retribert-base-uncased) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
    # Model type identifier
    model_type = "retribert"

    # Constructor initializing the model's parameters
    def __init__(
        self,
        vocab_size=30522,  # Vocabulary size
        hidden_size=768,  # Dimension of the encoder layers and the pooler layer
        num_hidden_layers=8,  # Number of hidden layers in the Transformer encoder
        num_attention_heads=12,  # Number of attention heads per attention layer
        intermediate_size=3072,  # Dimension of the "intermediate" (feed-forward) layer
        hidden_act="gelu",  # Non-linear activation function in the encoder and pooler
        hidden_dropout_prob=0.1,  # Dropout for all fully connected layers in embeddings, encoder, and pooler
        attention_probs_dropout_prob=0.1,  # Dropout ratio for the attention probabilities
        max_position_embeddings=512,  # Maximum sequence length the model might be used with
        type_vocab_size=2,  # Vocabulary size of the token_type_ids passed to BertModel
        initializer_range=0.02,  # Std of the truncated normal initializer for all weight matrices
        layer_norm_eps=1e-12,  # Epsilon used by the layer normalization layers
        share_encoders=True,  # Whether to use the same Bert-type encoder for queries and documents
        projection_dim=128,  # Final dimension of the projected query and document representations
        pad_token_id=0,  # ID of the padding token
        **kwargs,  # Any further keyword arguments
    ):
        # Call the parent constructor, forwarding the padding token ID and the remaining kwargs
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        # Store the configuration values on the instance
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.share_encoders = share_encoders
        self.projection_dim = projection_dim
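# A short sketch (not part of the original file): instantiating the config with its defaults and inspecting
# the retrieval-specific fields; assumes a transformers version that still ships the deprecated RetriBERT.
from transformers.models.deprecated.retribert.configuration_retribert import RetriBertConfig

cfg = RetriBertConfig()
print(cfg.model_type, cfg.share_encoders, cfg.projection_dim)  # retribert True 128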
.\models\deprecated\retribert\modeling_retribert.py
# coding=utf-8
# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
RetriBERT model
"""
import math  # Math utilities
from typing import Optional  # Type hints

import torch  # PyTorch
import torch.utils.checkpoint as checkpoint  # Gradient checkpointing
from torch import nn  # Neural-network building blocks

from ....modeling_utils import PreTrainedModel  # Base class for pretrained models
from ....utils import add_start_docstrings, logging  # Docstring helper and logging
from ...bert.modeling_bert import BertModel  # The underlying BERT model
from .configuration_retribert import RetriBertConfig  # RetriBERT configuration class

logger = logging.get_logger(__name__)  # Module-level logger
RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [  # Pretrained RetriBERT checkpoint names
"yjernite/retribert-base-uncased",
# See all RetriBert models at https://huggingface.co/models?filter=retribert
]
# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
class RetriBertPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
    config_class = RetriBertConfig  # Associated configuration class
    load_tf_weights = None  # TensorFlow weight loading is not supported
    base_model_prefix = "retribert"  # Prefix for the base model
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
RETRIBERT_START_DOCSTRING = r"""

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`RetriBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


# The decorator attaches a docstring describing this as a Bert-based model that embeds queries or
# documents for document retrieval
@add_start_docstrings(
    """Bert Based model to embed queries or document for document retrieval.""",
    RETRIBERT_START_DOCSTRING,
)
class RetriBertModel(RetriBertPreTrainedModel):
    def __init__(self, config: RetriBertConfig) -> None:
        super().__init__(config)

        # Final dimension of the projected representations
        self.projection_dim = config.projection_dim
        # BERT encoder for the queries
        self.bert_query = BertModel(config)
        # Separate BERT encoder for the documents, unless the encoders are shared
        self.bert_doc = None if config.share_encoders else BertModel(config)
        # Dropout layer
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Projection layer for the queries
        self.project_query = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
        # Projection layer for the documents
        self.project_doc = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
        # Cross-entropy loss used for the in-batch retrieval objective
        self.ce_loss = nn.CrossEntropyLoss(reduction="mean")

        # Initialize weights and apply final processing
        self.post_init()
    def embed_sentences_checkpointed(
        self,
        input_ids,
        attention_mask,
        sent_encoder,
        checkpoint_batch_size=-1,
    ):
        # Reproduce the BERT forward pass, optionally with gradient checkpointing
        if checkpoint_batch_size < 0 or input_ids.shape[0] < checkpoint_batch_size:
            # No checkpointing needed (or the batch is small enough): run the encoder directly
            return sent_encoder(input_ids, attention_mask=attention_mask)[1]
        else:
            # Prepare the implicit variables the encoder would normally build itself
            device = input_ids.device
            input_shape = input_ids.size()
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
            head_mask = [None] * sent_encoder.config.num_hidden_layers
            extended_attention_mask: torch.Tensor = sent_encoder.get_extended_attention_mask(
                attention_mask, input_shape
            )

            # Define the function wrapped by the checkpoint
            def partial_encode(*inputs):
                encoder_outputs = sent_encoder.encoder(
                    inputs[0],
                    attention_mask=inputs[1],
                    head_mask=head_mask,
                )
                sequence_output = encoder_outputs[0]
                pooled_output = sent_encoder.pooler(sequence_output)
                return pooled_output

            # Run the embedding layer on all inputs at once
            embedding_output = sent_encoder.embeddings(
                input_ids=input_ids, position_ids=None, token_type_ids=token_type_ids, inputs_embeds=None
            )
            # Encode and pool one mini-batch at a time
            pooled_output_list = []
            for b in range(math.ceil(input_ids.shape[0] / checkpoint_batch_size)):
                b_embedding_output = embedding_output[b * checkpoint_batch_size : (b + 1) * checkpoint_batch_size]
                b_attention_mask = extended_attention_mask[b * checkpoint_batch_size : (b + 1) * checkpoint_batch_size]
                pooled_output = checkpoint.checkpoint(partial_encode, b_embedding_output, b_attention_mask)
                pooled_output_list.append(pooled_output)
            return torch.cat(pooled_output_list, dim=0)
# Embedding questions by processing input_ids using the specified BERT model (self.bert_query).
# If attention_mask is provided, it's used to mask certain tokens during embedding.
# Utilizes checkpointing if checkpoint_batch_size is specified.
def embed_questions(
self,
input_ids,
attention_mask=None,
checkpoint_batch_size=-1,
):
# Embedding sentences using the checkpointed embedding method with BERT for queries.
q_reps = self.embed_sentences_checkpointed(
input_ids,
attention_mask,
self.bert_query,
checkpoint_batch_size,
)
# Projecting the embedded query representations to a different space if needed.
return self.project_query(q_reps)
# Embedding answers by processing input_ids using either self.bert_query or self.bert_doc BERT models.
# Choice depends on the availability of self.bert_doc; defaults to self.bert_query if not available.
# Utilizes checkpointing if checkpoint_batch_size is specified.
def embed_answers(
self,
input_ids,
attention_mask=None,
checkpoint_batch_size=-1,
):
# Embedding sentences using the checkpointed embedding method with BERT for answers.
a_reps = self.embed_sentences_checkpointed(
input_ids,
attention_mask,
self.bert_query if self.bert_doc is None else self.bert_doc,
checkpoint_batch_size,
)
# Projecting the embedded document representations to a different space.
return self.project_doc(a_reps)
    # Forward pass processing both query and document input_ids with their respective masks, using
    # self.bert_query for queries and self.bert_doc (when present) for documents; supports checkpointing
    # via checkpoint_batch_size. The signature below is reconstructed from the docstring that follows.
    def forward(
        self,
        input_ids_query: torch.LongTensor,
        attention_mask_query: Optional[torch.FloatTensor],
        input_ids_doc: torch.LongTensor,
        attention_mask_doc: Optional[torch.FloatTensor],
        checkpoint_batch_size: int = -1,
    ) -> torch.FloatTensor:
r"""
Args:
input_ids_query (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary for the queries in a batch.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask_query (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
input_ids_doc (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary for the documents in a batch.
attention_mask_doc (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on documents padding token indices.
checkpoint_batch_size (`int`, *optional*, defaults to `-1`):
If greater than 0, uses gradient checkpointing to only compute sequence representation on
`checkpoint_batch_size` examples at a time on the GPU. All query representations are still compared to
all document representations in the batch.
        Return:
            `torch.FloatTensor`: The bidirectional cross-entropy loss obtained while trying to match each query to
            its corresponding document and each document to its corresponding query in the batch
        """
        # Device that the query inputs live on
        device = input_ids_query.device
        # Representations of the queries
        q_reps = self.embed_questions(input_ids_query, attention_mask_query, checkpoint_batch_size)
        # Representations of the documents
        a_reps = self.embed_answers(input_ids_doc, attention_mask_doc, checkpoint_batch_size)
        # Similarity scores between every query and every document in the batch
        compare_scores = torch.mm(q_reps, a_reps.t())
        # Query-to-document cross-entropy loss: the i-th query should match the i-th document
        loss_qa = self.ce_loss(compare_scores, torch.arange(compare_scores.shape[1]).to(device))
        # Document-to-query cross-entropy loss
        loss_aq = self.ce_loss(compare_scores.t(), torch.arange(compare_scores.shape[0]).to(device))
        # Final loss: the mean of the two directional losses
        loss = (loss_qa + loss_aq) / 2
        return loss
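# A standalone sketch (not part of the original file) of the bidirectional in-batch loss above: each row
# of the score matrix should put its mass on the diagonal, and the loss averages the two directions.
import torch
from torch import nn

q_reps = torch.randn(4, 128)  # 4 query embeddings
a_reps = torch.randn(4, 128)  # 4 document embeddings, aligned by index with the queries
scores = torch.mm(q_reps, a_reps.t())  # (4, 4) similarity matrix
ce = nn.CrossEntropyLoss(reduction="mean")
targets = torch.arange(4)
loss = (ce(scores, targets) + ce(scores.t(), targets)) / 2
print(loss.item())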
.\models\deprecated\retribert\tokenization_retribert.py
# coding=utf-8
# Copyright notice: the referenced code follows the Apache License, Version 2.0.
import collections  # Ordered data structures for the vocabulary
import os  # Filesystem helpers
import unicodedata  # Unicode character database queries
from typing import List, Optional, Tuple  # Type hints

# PreTrainedTokenizer is the tokenizer base class; the underscore helpers classify characters
from ....tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ....utils import logging  # Logging utility

# Module-level logger
logger = logging.get_logger(__name__)
# Name of the vocabulary file
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# Map from pretrained model name to the URL of its vocabulary file
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "yjernite/retribert-base-uncased": (
            "https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/vocab.txt"
        ),
    }
}

# Map from pretrained model name to its positional-embedding size
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "yjernite/retribert-base-uncased": 512,
}

# Initialization configuration for each pretrained model
PRETRAINED_INIT_CONFIGURATION = {
    "yjernite/retribert-base-uncased": {"do_lower_case": True},
}
# Copied from transformers.models.bert.tokenization_bert.load_vocab
# Load the given vocabulary file into an ordered dictionary
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab
# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
# Run basic whitespace cleaning and splitting on a piece of text
def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens
# RetriBertTokenizer, derived from PreTrainedTokenizer
class RetriBertTokenizer(PreTrainedTokenizer):
r"""
Constructs a RetriBERT tokenizer.
[`RetriBertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation splitting
and wordpiece.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
    to this superclass for more information regarding those methods.
"""
    # Names of the vocabulary files used by the pretrained models
    vocab_files_names = VOCAB_FILES_NAMES
    # Map of pretrained vocabulary files
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # Maximum input sizes of the pretrained models
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Initialization configuration of the pretrained models
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    # Names of the model inputs
    model_input_names = ["input_ids", "attention_mask"]

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.__init__
def __init__(
self,
vocab_file,
do_lower_case=True,
do_basic_tokenize=True,
never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
        # The vocabulary file must exist; otherwise raise a ValueError
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        # Load the vocabulary
        self.vocab = load_vocab(vocab_file)
        # Build an ordered id-to-token mapping
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        # Whether to run the basic tokenization step
        self.do_basic_tokenize = do_basic_tokenize
        # If basic tokenization is requested, set up a BasicTokenizer
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )
        # Set up the WordpieceTokenizer
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
        # Call the parent constructor
        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )
    @property
    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
    def do_lower_case(self):
        # Lower-casing flag of the basic tokenizer
        return self.basic_tokenizer.do_lower_case

    @property
    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size
    def vocab_size(self):
        # Size of the vocabulary
        return len(self.vocab)

    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab
    def get_vocab(self):
        # The vocabulary plus the encoder of added special tokens
        return dict(self.vocab, **self.added_tokens_encoder)
    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._tokenize
    def _tokenize(self, text, split_special_tokens=False):
        # Accumulator for the produced tokens
        split_tokens = []
        # When basic tokenization is enabled, split first and then run WordPiece on each piece
        if self.do_basic_tokenize:
            for token in self.basic_tokenizer.tokenize(
                text, never_split=self.all_special_tokens if not split_special_tokens else None
            ):
                # Tokens in the never-split set are kept whole
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                else:
                    # Otherwise run the WordpieceTokenizer on the token
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            # Without basic tokenization, run WordPiece on the raw text
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        # Return the list of sub-tokens
        return split_tokens
    # Convert a token to its vocabulary id
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    # Convert a vocabulary id back to its token
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)

    # Join a sequence of tokens into a single string, undoing the WordPiece "##" markers
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string

    # Build model inputs with the special tokens added
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + token_ids_1 + sep
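    # A small sketch (not part of the original file) of the layouts produced above, assuming the standard
    # BERT vocabulary ids cls=101 and sep=102, in doctest form:
    #
    #     >>> [101] + [7592, 2088] + [102]             # [CLS] X [SEP]
    #     [101, 7592, 2088, 102]
    #     >>> [101] + [7592] + [102] + [3000] + [102]  # [CLS] A [SEP] B [SEP]
    #     [101, 7592, 102, 3000, 102]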
    # Build the mask marking which positions hold special tokens
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
# Check if the token list already has special tokens added
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
# If token_ids_1 is provided, construct a mask for sequence pairs
if token_ids_1 is not None:
# Return a list indicating positions of special tokens and sequence tokens
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
# Otherwise, construct a mask for a single sequence
return [1] + ([0] * len(token_ids_0)) + [1]
# Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
# Define special tokens
sep = [self.sep_token_id]
cls = [self.cls_token_id]
# If token_ids_1 is None, return a mask for a single sequence
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# Otherwise, return a mask for sequence pairs
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
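    # A quick sketch (not part of the original file) of the resulting token-type mask: zeros cover
    # "[CLS] A [SEP]", ones cover "B [SEP]", in doctest form:
    #
    #     >>> len([101] + [7592, 2088] + [102]) * [0] + len([3000] + [102]) * [1]
    #     [0, 0, 0, 0, 1, 1]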
    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.save_vocabulary
    # Save the vocabulary to a file in the given directory; returns a tuple with the saved file path
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        index = 0
        # If save_directory is a directory, build the file path inside it; otherwise treat it as a file path
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        # Open the target file for writing with UTF-8 encoding
        with open(vocab_file, "w", encoding="utf-8") as writer:
            # Iterate over the vocabulary sorted by token index
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                # Warn if the indices are not consecutive
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                # Write one token per line
                writer.write(token + "\n")
                index += 1
        # Return the saved file path as a tuple
        return (vocab_file,)
# Class definition copied from transformers.models.bert.tokenization_bert.BasicTokenizer
class BasicTokenizer(object):
"""
Constructs a BasicTokenizer that runs basic tokenization (punctuation splitting, lower casing, etc.).
Args:
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
never_split (`Iterable`, *optional*):
Collection of tokens which will never be split during tokenization. Only has an effect when
`do_basic_tokenize=True`.
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters.
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
do_split_on_punc (`bool`, *optional*, defaults to `True`):
In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
the full context of the words, such as contractions.
"""
def __init__(
self,
do_lower_case=True,
never_split=None,
tokenize_chinese_chars=True,
strip_accents=None,
do_split_on_punc=True,
):
# If `never_split` is not provided, default to an empty list
if never_split is None:
never_split = []
# Initialize instance attributes
self.do_lower_case = do_lower_case  # whether to lowercase the input
self.never_split = set(never_split)  # tokens that must never be split, stored as a set
self.tokenize_chinese_chars = tokenize_chinese_chars  # whether to split around Chinese characters
self.strip_accents = strip_accents  # whether to strip accents
self.do_split_on_punc = do_split_on_punc  # whether to split on punctuation
# Basic tokenization of a piece of text. For sub-word tokenization, see WordpieceTokenizer.
#
# Args:
#     never_split (`List[str]`, *optional*): Kept for backward compatibility. Now implemented directly at the
#         base class level (see `PreTrainedTokenizer.tokenize`). List of tokens not to split.
def tokenize(self, text, never_split=None):
# If never_split is given, merge it with self.never_split into a new set
never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
# Clean the text, removing invalid characters
text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese models. It is now also applied to the
# English models, but it doesn't matter since the English models were not trained on any Chinese data and
# generally don't contain any Chinese data (the vocabulary does contain some Chinese characters because
# English Wikipedia contains some Chinese words).
if self.tokenize_chinese_chars:
# Add whitespace around Chinese characters
text = self._tokenize_chinese_chars(text)
# Normalize the Unicode text so that different encodings of the same character are not treated as distinct
unicode_normalized_text = unicodedata.normalize("NFC", text)
# Split on whitespace
orig_tokens = whitespace_tokenize(unicode_normalized_text)
split_tokens = []
# Iterate over each token
for token in orig_tokens:
# If the token is not in never_split, handle lowercasing and accent stripping
if token not in never_split:
if self.do_lower_case:
# Lowercase the token when lowercasing is enabled
token = token.lower()
# Strip accents unless explicitly disabled
if self.strip_accents is not False:
token = self._run_strip_accents(token)
elif self.strip_accents:
# Strip accents when explicitly requested
token = self._run_strip_accents(token)
# Add the punctuation-split pieces to the list
split_tokens.extend(self._run_split_on_punc(token, never_split))
# Re-join and whitespace-tokenize once more to flatten the split pieces into a single token list
output_tokens = whitespace_tokenize(" ".join(split_tokens))
# Return the final token list
return output_tokens
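# Illustrative usage of BasicTokenizer (a minimal sketch; output assumes the default settings):
#
#   >>> basic = BasicTokenizer(do_lower_case=True)
#   >>> basic.tokenize("Héllo, World!")
#   ['hello', ',', 'world', '!']
#
# With do_lower_case=True and strip_accents left unset, accents are stripped as part of
# lowercasing (the "é" becomes "e"), matching the original BERT behaviour.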
# Strip accent marks from a piece of text
def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
# Normalize the text to NFD so that accents become separate combining characters
text = unicodedata.normalize("NFD", text)
output = []
# Iterate over each character in the text
for char in text:
# Get the character's Unicode category
cat = unicodedata.category(char)
# Skip characters in the "Mark, Nonspacing" category (combining accents)
if cat == "Mn":
continue
# Otherwise keep the character
output.append(char)
# Join the remaining characters back into a string and return it
return "".join(output)
def _run_split_on_punc(self, text, never_split=None):
"""Splits punctuation on a piece of text."""
# If punctuation splitting is disabled, or the text is in the never-split set, return it unchanged
if not self.do_split_on_punc or (never_split is not None and text in never_split):
return [text]
# Convert the text to a list of characters
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
# A punctuation character becomes its own list item and starts a new word
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
# Otherwise append to the current item, or open a new one if a new word has started
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1
# Join each inner list into a string and return the list of split pieces
return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
# If the character is a CJK character, surround it with spaces before adding it; otherwise add it as-is
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
# Join the list into a string and return it
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# Check whether the code point falls within the CJK Unified Ideographs ranges
if (
(cp >= 0x4E00 and cp <= 0x9FFF)
or (cp >= 0x3400 and cp <= 0x4DBF) #
or (cp >= 0x20000 and cp <= 0x2A6DF) #
or (cp >= 0x2A700 and cp <= 0x2B73F) #
or (cp >= 0x2B740 and cp <= 0x2B81F) #
or (cp >= 0x2B820 and cp <= 0x2CEAF) #
or (cp >= 0xF900 and cp <= 0xFAFF)
or (cp >= 0x2F800 and cp <= 0x2FA1F) #
): #
return True
return False
def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
# Skip NUL (0) and replacement (0xFFFD) code points as well as control characters
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
# Replace any whitespace character with a single space; keep everything else
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
# Join the list into a string and return it
return "".join(output)
# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
# WordpieceTokenizer class, which runs the WordPiece tokenization algorithm
class WordpieceTokenizer(object):
"""Runs WordPiece tokenization."""
# Initialize with the vocabulary, the unknown token, and the per-word character limit
def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
self.vocab = vocab  # vocabulary containing all words and sub-words
self.unk_token = unk_token  # token used for words or sub-words that cannot be recognized
self.max_input_chars_per_word = max_input_chars_per_word  # maximum characters per word, 100 by default
# Apply WordPiece tokenization to a piece of text
def tokenize(self, text):
"""
Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
tokenization using the given vocabulary.
For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through *BasicTokenizer*.
Returns:
A list of wordpiece tokens.
"""
# Initialize the output token list
output_tokens = []
# Split the text on whitespace and process each token
for token in whitespace_tokenize(text):
chars = list(token)
# If the token exceeds the per-word character limit, emit the unknown token
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue
is_bad = False
start = 0
sub_tokens = []
# Greedy longest-match-first tokenization
while start < len(chars):
end = len(chars)
cur_substr = None
# Shrink the candidate substring from the right, prefixing "##" for non-initial pieces,
# until a match is found in the vocabulary
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
# If no matching substring was found, mark the token as bad
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end
# Emit the unknown token for bad tokens; otherwise emit the collected sub-tokens
if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
# Return the final list of wordpiece tokens
return output_tokens
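# Illustrative usage of WordpieceTokenizer (a minimal sketch with a hypothetical toy vocabulary):
#
#   >>> vocab = {"un": 0, "##aff": 1, "##able": 2, "runn": 3, "##ing": 4}
#   >>> wp = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
#   >>> wp.tokenize("unaffable running")
#   ['un', '##aff', '##able', 'runn', '##ing']
#   >>> wp.tokenize("xyzzy")
#   ['[UNK]']
#
# "xyzzy" collapses to the unknown token because no prefix of it (nor any "##"-prefixed
# continuation) appears in the vocabulary.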
.\models\deprecated\retribert\tokenization_retribert_fast.py
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
"""Tokenization classes for RetriBERT."""
import json
from typing import List, Optional, Tuple
from tokenizers import normalizers
# Import the fast pretrained tokenizer base class
from ....tokenization_utils_fast import PreTrainedTokenizerFast
from ....utils import logging
# Get the logger for the current module
logger = logging.get_logger(__name__)
# Names of the vocabulary and tokenizer files
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
# Mapping from pretrained models to their vocabulary and tokenizer files
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"yjernite/retribert-base-uncased": (
"https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/vocab.txt"
),
},
"tokenizer_file": {
"yjernite/retribert-base-uncased": (
"https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/tokenizer.json"
),
},
}
# Positional embedding sizes for the pretrained models
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"yjernite/retribert-base-uncased": 512,
}
# Initialization configuration for the pretrained models
PRETRAINED_INIT_CONFIGURATION = {
"yjernite/retribert-base-uncased": {"do_lower_case": True},
}
class RetriBertTokenizerFast(PreTrainedTokenizerFast):
r"""
构建一个“快速”RetriBERT 分词器(基于 HuggingFace 的 *tokenizers* 库)。
[`RetriBertTokenizerFast`] 与 [`BertTokenizerFast`] 相同,并且支持端到端的分词:标点符号拆分和 wordpiece。
此分词器继承自 [`PreTrainedTokenizerFast`],其中包含大多数主要方法。用户应参考此超类获取有关这些方法的更多信息。
"""
Args:
vocab_file (`str`):
File containing the vocabulary.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
unk_token (`str`, *optional*, defaults to `"[UNK]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
sep_token (`str`, *optional*, defaults to `"[SEP]"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
pad_token (`str`, *optional*, defaults to `"[PAD]"`):
The token used for padding, for example when batching sequences of different lengths.
cls_token (`str`, *optional*, defaults to `"[CLS]"`):
The classifier token which is used when doing sequence classification (classification of the whole sequence
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (`str`, *optional*, defaults to `"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
clean_text (`bool`, *optional*, defaults to `True`):
Whether or not to clean the text before tokenization by removing any control characters and replacing all
whitespaces by the classic one.
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
The prefix for subwords.
"""
# Class-level attributes that configure this BERT-style tokenizer
# Names of the vocabulary files associated with the model
vocab_files_names = VOCAB_FILES_NAMES
# Mapping from pretrained models to their vocabulary file paths
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Maximum input lengths for the pretrained models
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Initialization configuration for the pretrained models
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
# Slow tokenizer class associated with this model
slow_tokenizer_class = RetriBertTokenizer
# Names of the model inputs: input IDs and attention mask
model_input_names = ["input_ids", "attention_mask"]
# The following works the same as BertTokenizerFast.__init__ in the transformers library, though not shown in full
# Initialization method used to create a new instance
def __init__(
self,
vocab_file=None,  # path to the vocabulary file, None by default
tokenizer_file=None,  # path to the tokenizer file, None by default
do_lower_case=True,  # whether to lowercase the input, True by default
unk_token="[UNK]",  # string used for the unknown token, "[UNK]" by default
sep_token="[SEP]",  # string used for the separator token, "[SEP]" by default
pad_token="[PAD]",  # string used for the padding token, "[PAD]" by default
cls_token="[CLS]",  # string used for the classifier token, "[CLS]" by default
mask_token="[MASK]",  # string used for the mask token, "[MASK]" by default
tokenize_chinese_chars=True,  # whether to tokenize Chinese characters, True by default
strip_accents=None,  # whether to strip accents, None by default
**kwargs,  # additional keyword arguments
):
# Call the parent class initializer
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
# Get the normalizer state of the backend tokenizer
normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
# If the normalizer state does not match the current init arguments, rebuild the normalizer
if (
normalizer_state.get("lowercase", do_lower_case) != do_lower_case
or normalizer_state.get("strip_accents", strip_accents) != strip_accents
or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
):
normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
normalizer_state["lowercase"] = do_lower_case
normalizer_state["strip_accents"] = strip_accents
normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
# Record the lowercasing behaviour on the instance
self.do_lower_case = do_lower_case
# Build model inputs with special tokens from token_ids_0 and the optional token_ids_1, for sequence classification tasks
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
# Build the input sequence with special tokens for the model
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
# If a second sequence token_ids_1 is given, extend to the pair format
if token_ids_1 is not None:
output += token_ids_1 + [self.sep_token_id]
return output
# Create token type ID sequences from token_ids_0 and the optional token_ids_1
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of token IDs for the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional list of token IDs for the second sequence in sequence pairs.
Returns:
`List[int]`: List of token type IDs according to the given sequence(s).
"""
# Define the separator and classification tokens
sep = [self.sep_token_id]
cls = [self.cls_token_id]
# If there is no second sequence, return a mask with all zeros for the first sequence
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# Return a mask that identifies the token type IDs for both sequences
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the tokenizer's vocabulary to files in the specified directory.
Args:
save_directory (str):
Directory where vocabulary files will be saved.
filename_prefix (str, *optional*):
Optional prefix for the saved vocabulary filenames.
Returns:
Tuple[str]: Tuple containing the filenames of the saved vocabulary files.
"""
# Save the vocabulary files using the underlying tokenizer model
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
.\models\deprecated\retribert\__init__.py
# Import the required modules and functions
from typing import TYPE_CHECKING
# Import the optional-dependency exception and the lazy-loading module from the utils package
from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_tokenizers_available, is_torch_available
# Define the module's import structure
_import_structure = {
"configuration_retribert": ["RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RetriBertConfig"],
"tokenization_retribert": ["RetriBertTokenizer"],
}
# Check whether the tokenizers library is available; raise if not
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# If available, add the fast tokenization_retribert_fast module to the import structure
_import_structure["tokenization_retribert_fast"] = ["RetriBertTokenizerFast"]
# Check whether the torch library is available; raise if not
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# If available, add the modeling_retribert module to the import structure
_import_structure["modeling_retribert"] = [
"RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
"RetriBertModel",
"RetriBertPreTrainedModel",
]
# During type checking
if TYPE_CHECKING:
# Import the required classes and variables from the relevant modules
from .configuration_retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig
from .tokenization_retribert import RetriBertTokenizer
# Check again whether the tokenizers library is available
try:
if not is_tokenizers_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# If available, import RetriBertTokenizerFast from tokenization_retribert_fast
from .tokenization_retribert_fast import RetriBertTokenizerFast
# Check again whether the torch library is available
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# If available, import the relevant classes and variables from modeling_retribert
from .modeling_retribert import (
RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
RetriBertModel,
RetriBertPreTrainedModel,
)
# Outside of type checking
else:
# Import the sys module
import sys
# Replace the current module with the lazy-loading _LazyModule
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deprecated\tapex\tokenization_tapex.py
# File encoding: UTF-8
# Copyright notice and license terms
# This file is licensed under the Apache License, Version 2.0; you may not use it except in compliance with the License
# For the full license terms, see http://www.apache.org/licenses/LICENSE-2.0
# The software is distributed on an "AS IS" basis, without warranties or conditions of any kind
# See the License for more details
"""Tokenization classes for TAPEX."""
import json  # import the json module
import os  # import the os module
import random  # import the random module
from functools import lru_cache  # import the lru_cache decorator from functools
from typing import Dict, List, Optional, Tuple, Union  # import the typing helpers
import regex as re  # import the regex module under the alias re
from ....file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available  # file utilities and related helpers
from ....tokenization_utils import AddedToken, PreTrainedTokenizer  # tokenizer classes and helpers
from ....tokenization_utils_base import ENCODE_KWARGS_DOCSTRING, BatchEncoding, TextInput, TruncationStrategy  # tokenizer base classes and related functionality
from ....utils import logging  # logging module
# Import pandas if it is available
if is_pandas_available():
import pandas as pd
logger = logging.get_logger(__name__)  # get the logger instance for the current module
# Names of the vocabulary files
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
# Mapping from pretrained models to their vocabulary files
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"microsoft/tapex-base": "https://huggingface.co/microsoft/tapex-base/resolve/main/vocab.json",
},
"merges_file": {
"microsoft/tapex-base": "https://huggingface.co/microsoft/tapex-base/resolve/main/merges.txt",
},
}
# Positional embedding sizes for the pretrained models
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"microsoft/tapex-base": 512,
}
# Initialization configuration for the pretrained models
PRETRAINED_INIT_CONFIGURATION = {
"microsoft/tapex-base": {"do_lower_case": True},
}
class TapexTruncationStrategy(ExplicitEnum):
"""
[`~TapasTokenizer.__call__`] 的 `truncation` 参数的可能取值。在 IDE 中进行代码补全时非常有用。
"""
DROP_ROWS_TO_FIT = "drop_rows_to_fit"
@lru_cache()
def bytes_to_unicode():
"""
返回 utf-8 字节列表及其对应的 unicode 字符映射。我们特别避免映射到空格/控制字符,以免引起 BPE 编码错误。
可逆的 BPE 编码工作在 unicode 字符串上。这意味着如果要避免 UNK 标记,词汇表中需要大量的 unicode 字符。
当处理类似于 10B 令牌的数据集时,您大约需要 5K 个字符才能实现良好的覆盖率。这在常规的 32K BPE 词汇表中占据了相当大的比例。
为了避免这种情况,我们希望在 utf-8 字节和 unicode 字符串之间建立查找表。
"""
bs = (
list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
)
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
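# A quick sanity check on the mapping (a minimal sketch):
#
#   >>> b2u = bytes_to_unicode()
#   >>> len(b2u)
#   256
#   >>> b2u[ord("A")]
#   'A'
#   >>> b2u[0]
#   'Ā'
#
# Printable bytes map to themselves, while control/whitespace bytes such as 0 are shifted into
# the 256+ code point range so that no byte ever maps to a character that would break BPE.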
def get_pairs(word):
"""
返回单词中的符号对集合。单词被表示为符号元组(符号是可变长度的字符串)。
"""
# 初始化一个空集合,用于存储符号对
pairs = set()
# 获取单词的第一个符号作为前一个符号
prev_char = word[0]
# 遍历单词中除第一个符号外的所有符号
for char in word[1:]:
# 将前一个符号和当前符号作为一个符号对加入到集合中
pairs.add((prev_char, char))
# 更新前一个符号为当前符号,为下一次循环做准备
prev_char = char
# 返回存储了单词中所有符号对的集合
return pairs
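# Illustrative example of get_pairs (a minimal sketch; the set is shown sorted for readability):
#
#   >>> sorted(get_pairs(("h", "e", "l", "l", "o")))
#   [('e', 'l'), ('h', 'e'), ('l', 'l'), ('l', 'o')]
#
# The BPE loop repeatedly picks the pair with the lowest merge rank from this set.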
class IndexedRowTableLinearize:
"""
FORMAT: col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...
"""
def process_table(self, table_content: Dict):
"""
Given a table, TableLinearize aims at converting it into a flatten sequence with special symbols.
"""
# Assert that the table content contains both the "header" and "rows" keys
assert "header" in table_content and "rows" in table_content, self.PROMPT_MESSAGE
# Process the header, converting it into a formatted string
table_str = self.process_header(table_content["header"]) + " "
# Process each row of data
for i, row_example in enumerate(table_content["rows"]):
# NOTE: row indices start at 1, not 0
table_str += self.process_row(row_example, row_index=i + 1) + " "
# Strip leading/trailing whitespace and return the processed string
return table_str.strip()
def process_header(self, headers: List):
"""
Given a list of headers, TableLinearize aims at converting it into a flatten sequence with special symbols.
"""
# Return the formatted header string, in the form "col : col1 | col2 | col 3"
return "col : " + " | ".join(headers)
def process_row(self, row: List, row_index: int):
"""
Given a row, TableLinearize aims at converting it into a flatten sequence with special symbols.
"""
# Initialize an empty string for the row's string representation
row_str = ""
# Initialize a list to hold each cell value
row_cell_values = []
# Iterate over each cell value in the row
for cell_value in row:
# If the cell value is an integer, convert it to a string before appending
if isinstance(cell_value, int):
row_cell_values.append(str(cell_value))
else:
# Otherwise append the cell value as-is
row_cell_values.append(cell_value)
# Join the cell values with " | " and append to the row string
row_str += " | ".join(row_cell_values)
# Return the formatted row string, in the form "row 1 : val1 | val2 | val3"
return "row " + str(row_index) + " : " + row_str
class TapexTokenizer(PreTrainedTokenizer):
r"""
Construct a TAPEX tokenizer. Based on byte-level Byte-Pair-Encoding (BPE).
This tokenizer can be used to flatten one or more table(s) and concatenate them with one or more related sentences
to be used by TAPEX models. The format that the TAPEX tokenizer creates is the following:
sentence col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...
The tokenizer supports a single table + single query, a single table and multiple queries (in which case the table
will be duplicated for every query), a single query and multiple tables (in which case the query will be duplicated
for every table), and multiple tables and queries. In other words, you can provide a batch of tables + questions to
the tokenizer for instance to prepare them for the model.
Tokenization itself is based on the BPE algorithm. It is identical to the one used by BART, RoBERTa and GPT-2.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
"""
# The constructor below configures the vocabulary and the special-token settings
def __init__(
self,
vocab_file: str,
merges_file: str,
do_lower_case: bool = True,
errors: str = "replace",
bos_token: str = "<s>",
eos_token: str = "</s>",
sep_token: str = "</s>",
cls_token: str = "<s>",
unk_token: str = "<unk>",
pad_token: str = "<pad>",
mask_token: str = "<mask>",
add_prefix_space: bool = False,
max_cell_length: int = 15
):
"""
Args:
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`):
Path to the merges file.
do_lower_case (`bool`, *optional*, defaults to `True`):
Whether or not to lowercase the input when tokenizing.
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The beginning-of-sequence token that was used during pretraining. Can be used as a sequence
classifier token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the beginning
of the sequence. The token used is the `cls_token`.
</Tip>
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end-of-sequence token.
<Tip>
When building a sequence using special tokens, this is not the token that is used for the end of the
sequence. The token used is the `sep_token`.
</Tip>
sep_token (`str`, *optional*, defaults to `"</s>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
for sequence classification or for a text and a question for question answering. It is also used as the
last token of a sequence built with special tokens.
cls_token (`str`, *optional*, defaults to `"<s>"`):
The classifier token which is used when doing sequence classification (classification of the whole
sequence instead of per-token classification). It is the first token of the sequence when built with
special tokens.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary will be set to this token.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (The BART tokenizer detects the beginning of words by the preceding space.)
max_cell_length (`int`, *optional*, defaults to 15):
Maximum number of characters per cell when linearizing a table. Truncation takes place if this number
is exceeded.
"""
# Vocabulary file names, taken from the module-level constant
vocab_files_names = VOCAB_FILES_NAMES
# Pretrained vocabulary file map, taken from the module-level constant
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# Maximum model input sizes, taken from the pretrained positional embedding sizes
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
# Pretrained initialization configuration, taken from the module-level constant
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
# Names of the model inputs
model_input_names = ["input_ids", "attention_mask"]
# Initialization method used to create a new instance
def __init__(
self,
vocab_file,
merges_file,
do_lower_case=True,
errors="replace",
bos_token="<s>",
eos_token="</s>",
sep_token="</s>",
cls_token="<s>",
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
add_prefix_space=False,
max_cell_length=15,
**kwargs,
):
# If a special token is passed as a string, convert it to an AddedToken, keeping surrounding spaces intact
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
# Convert mask_token to an AddedToken that strips a space on the left but not on the right
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
# Open the vocabulary file with UTF-8 encoding and load it as the encoder dict
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
# Build the decoder dict by inverting key/value pairs
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors  # how to handle errors during decoding
# Build the byte-to-unicode encoder
self.byte_encoder = bytes_to_unicode()
# Build the unicode-to-byte decoder by inverting key/value pairs
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
# Open the merges file with UTF-8 encoding, read the BPE merge rules, and build the BPE rank dict
with open(merges_file, encoding="utf-8") as merges_handle:
bpe_merges = merges_handle.read().split("\n")[1:-1]
bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.add_prefix_space = add_prefix_space
self.do_lower_case = do_lower_case
# Compile the regex pattern used to recognize and split words, numbers, punctuation, and spaces
# re.IGNORECASE should be added so that BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
# Call the parent class initializer with the relevant arguments
super().__init__(
vocab_file=vocab_file,
merges_file=merges_file,
do_lower_case=do_lower_case,
errors=errors,
bos_token=bos_token,
eos_token=eos_token,
unk_token=unk_token,
sep_token=sep_token,
cls_token=cls_token,
pad_token=pad_token,
mask_token=mask_token,
add_prefix_space=add_prefix_space,
max_cell_length=max_cell_length,
**kwargs,
)
# Record the maximum cell length
self.max_cell_length = max_cell_length
# Initialize the table linearization helper
self.table_linearize = IndexedRowTableLinearize()
# Build inputs with special tokens
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A TAPEX sequence has the following format:
- single sequence: `<s> X </s>`
- pair of sequences: `<s> A </s></s> B </s>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
# If only one sequence is provided, add `<s>` (CLS) token, sequence tokens, and `</s>` (SEP) token
if token_ids_1 is None:
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
# For pairs of sequences, concatenate tokens with appropriate special tokens
cls = [self.cls_token_id]
sep = [self.sep_token_id]
return cls + token_ids_0 + sep + sep + token_ids_1 + sep
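# Quick worked example of the pair format above (a minimal sketch; 0 and 2 are the usual
# BART-style ids for "<s>" and "</s>", assumed here for illustration):
#
#   >>> tokenizer.build_inputs_with_special_tokens([5, 6], [7, 8])
#   [0, 5, 6, 2, 2, 7, 8, 2]
#
# i.e. `<s> A </s></s> B </s>`: the two sequences are joined by a doubled separator, as in
# BART and RoBERTa.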
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
) -> List[int]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
"""
# If the input tokens already have special tokens, delegate to the superclass method
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
)
# If only one sequence is provided, mark special tokens at the beginning and end
if token_ids_1 is None:
return [1] + ([0] * len(token_ids_0)) + [1]
# For pairs of sequences, mark special tokens at the beginning and end of each sequence
return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create token type IDs tensor from a list of token ids. This is used for sequence classification tasks where each
sequence pair gets a different token type ID (0 or 1).
Args:
token_ids_0 (`List[int]`):
List of IDs representing the first sequence.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs representing the second sequence in a pair.
Returns:
`List[int]`: A list of token type IDs where each ID corresponds to a token in the input sequences.
"""
) -> List[int]:
"""
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of zeros.
"""
# Define the separator and classifier token lists
sep = [self.sep_token_id]
cls = [self.cls_token_id]
# If there is no second sequence of token IDs, return a list of zeros covering the first sequence
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# Otherwise return zeros covering both sequences, with the separators accounted for (TAPEX does not use token types)
return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
"""
Args:
text (str): The input text to be tokenized.
is_split_into_words (bool): Whether the input text is already split into words.
**kwargs: Additional keyword arguments.
add_prefix_space (bool): Whether to add a prefix space to the text if necessary.
Returns:
tuple: A tuple containing the modified text and remaining keyword arguments.
"""
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
# If the text is pre-split into words or a prefix space is requested, and the first character is not
# whitespace, prepend a space to the text
if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
text = " " + text
# Return the modified text along with the remaining keyword arguments
return (text, kwargs)
@property
def vocab_size(self):
# Return the size of the vocabulary held by the encoder
return len(self.encoder)
def get_vocab(self):
# Return a dict combining the encoder with the added-tokens encoder
return dict(self.encoder, **self.added_tokens_encoder)
def bpe(self, token):
"""
Args:
token (str): The token to apply BPE encoding.
Returns:
str: The token after BPE encoding.
"""
# If the token is already cached, return the cached result directly
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
# If there are no pairs to merge, return the original token
if not pairs:
return token
while True:
# Find the bigram with the lowest rank in the BPE merge table
bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
# If that bigram is not in the merge table, stop merging
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
except ValueError:
new_word.extend(word[i:])
break
else:
new_word.extend(word[i:j])
i = j
if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
# If the merged word has length 1, stop merging
if len(word) == 1:
break
else:
pairs = get_pairs(word)
# Convert the tuple back to a string and cache the result
word = " ".join(word)
self.cache[token] = word
return word
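# Worked example of the merge loop (a minimal sketch with a hypothetical two-entry merge table):
#
#   with self.bpe_ranks = {("l", "o"): 0, ("lo", "w"): 1}, self.bpe("low") proceeds as
#   ("l", "o", "w") -> ("lo", "w") -> ("low",)
#
# Round 1 merges the best-ranked pair ("l", "o"); round 2 merges ("lo", "w"); the loop then
# stops because a single symbol remains, and the space-joined result "low" is cached and returned.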
def _tokenize(self, text):
"""
Tokenize a string using Byte-Pair Encoding (BPE).
Args:
text (str): The input text to tokenize.
Returns:
List[str]: List of tokens after tokenization.
"""
bpe_tokens = []
# Use the regex pattern to find every matching token in the text
for token in re.findall(self.pat, text):
# Map each byte of the token to a unicode string, avoiding control tokens of the BPE (spaces in our case)
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
)
# Apply BPE to the token, split the result, and extend the output list
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
# Return the list of BPE tokens
return bpe_tokens
# Convert a token (string) to its id using the vocabulary
def _convert_token_to_id(self, token):
return self.encoder.get(token, self.encoder.get(self.unk_token))
# Convert an id (integer) to its token (string) using the vocabulary
def _convert_id_to_token(self, index):
return self.decoder.get(index)
# Convert a sequence of tokens (list of strings) into a single string
def convert_tokens_to_string(self, tokens):
text = "".join(tokens)
# Use byte_decoder to turn the byte array back into a UTF-8 string
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
# Save the vocabulary to files in the given directory
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
# If the save directory does not exist, log an error and return
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
# Build the vocabulary file path and the merges file path
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
merge_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
)
# Write the encoder to the vocabulary file
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
index = 0
# Write the BPE merges to the merges file
with open(merge_file, "w", encoding="utf-8") as writer:
writer.write("#version: 0.2\n")
for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
if index != token_index:
# Warn if the BPE merge indices are not consecutive
logger.warning(
f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
" Please check that the tokenizer is not corrupted!"
)
index = token_index
writer.write(" ".join(bpe_tokens) + "\n")
index += 1
return vocab_file, merge_file
# Decorator that appends extra documentation for the encoding keyword arguments
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
# Define the special __call__ method so that the object can be called like a function
def __call__(
self,
# table can be a single pandas DataFrame or a list of DataFrames
table: Union["pd.DataFrame", List["pd.DataFrame"]] = None,
# query can be a single text input or a list of text inputs, optional
query: Optional[Union[TextInput, List[TextInput]]] = None,
# answer can be a single answer string or a list of answer strings
answer: Union[str, List[str]] = None,
# whether to add special tokens, True by default
add_special_tokens: bool = True,
# padding option: a bool, a string, or a PaddingStrategy enum value
padding: Union[bool, str, PaddingStrategy] = False,
# truncation option: a bool, a string, or a TruncationStrategy enum value
truncation: Union[bool, str, TruncationStrategy] = None,
# maximum length limit, optional
max_length: Optional[int] = None,
# stride of the sliding window, 0 by default
stride: int = 0,
# pad to a multiple of this value; None means no such padding
pad_to_multiple_of: Optional[int] = None,
# type of tensors to return, optional
return_tensors: Optional[Union[str, TensorType]] = None,
# whether to return token_type_ids
return_token_type_ids: Optional[bool] = None,
# whether to return attention_mask
return_attention_mask: Optional[bool] = None,
# whether to return overflowing tokens
return_overflowing_tokens: bool = False,
# whether to return the special tokens mask
return_special_tokens_mask: bool = False,
# whether to return the offsets mapping
return_offsets_mapping: bool = False,
# whether to return length information
return_length: bool = False,
# whether to be verbose, True by default
verbose: bool = True,
# any other optional arguments, collected in kwargs
**kwargs,
) -> BatchEncoding:
"""
Main method to tokenize and prepare for the model one or several table-sequence pair(s).
Args:
table (`pd.DataFrame`, `List[pd.DataFrame]`):
Table(s) containing tabular data.
query (`str` or `List[str]`, *optional*):
Sentence or batch of sentences related to one or more table(s) to be encoded. Note that the number of
sentences must match the number of tables.
answer (`str` or `List[str]`, *optional*):
Optionally, the corresponding answer to the questions as supervision.
"""
# If a table was passed, dispatch to the source-side call function
if table is not None:
return self.source_call_func(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
# If no table was passed but an answer was, dispatch to the target-side call function
elif answer is not None:
return self.target_call_func(
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
# If neither a table nor an answer was passed, raise a ValueError
else:
raise ValueError("You need to provide either a `table` or an `answer`.")
# Define source_call_func, which processes the table/query side and builds model inputs
def source_call_func(
self,
# table accepts one or more pandas DataFrames as the input table(s)
table: Union["pd.DataFrame", List["pd.DataFrame"]],
# optional query: a single text input or a list of text inputs
query: Optional[Union[TextInput, List[TextInput]]] = None,
# answer: a single string or a list of strings
answer: Union[str, List[str]] = None,
# whether to add special tokens to the model inputs, True by default
add_special_tokens: bool = True,
# whether (and how) to pad, False by default
padding: Union[bool, str, PaddingStrategy] = False,
# whether (and how) to truncate, None by default
truncation: Union[bool, str, TruncationStrategy] = None,
# maximum input length, None means unlimited
max_length: Optional[int] = None,
# stride of the sliding window, 0 by default
stride: int = 0,
# pad to a multiple of this value; None means no such padding
pad_to_multiple_of: Optional[int] = None,
# type of tensors to return, as a string or TensorType, None by default
return_tensors: Optional[Union[str, TensorType]] = None,
# whether to return token type IDs, None by default
return_token_type_ids: Optional[bool] = None,
# whether to return the attention mask, None by default
return_attention_mask: Optional[bool] = None,
# whether to return overflowing tokens, False by default
return_overflowing_tokens: bool = False,
# whether to return the special tokens mask, False by default
return_special_tokens_mask: bool = False,
# whether to return the offsets mapping, False by default
return_offsets_mapping: bool = False,
# whether to return sequence lengths, False by default
return_length: bool = False,
# whether to be verbose, True by default
verbose: bool = True,
# other optional arguments, collected in kwargs
**kwargs,
) -> BatchEncoding:
# Input type checking for clearer error
# Initialize flags for valid input types
valid_table = False
valid_query = False
# Check if the 'table' argument is a pandas DataFrame or a list/tuple of DataFrames
if isinstance(table, pd.DataFrame):
valid_table = True
elif isinstance(table, (list, tuple)) and isinstance(table[0], pd.DataFrame):
valid_table = True
# Check if the 'query' argument is None or a string, or a list/tuple of strings
if query is None or isinstance(query, str):
valid_query = True
elif isinstance(query, (list, tuple)):
if len(query) == 0 or isinstance(query[0], str):
valid_query = True
# Raise ValueError if 'table' or 'query' does not match expected types
if not valid_table:
raise ValueError(
"table input must be of type `pd.DataFrame` (single example) or `List[pd.DataFrame]` (batch of examples). "
)
if not valid_query:
raise ValueError("query input must be of type `str` (single example) or `List[str]` (batch of examples). ")
# Determine if batch processing is required based on the types of 'table' or 'query'
is_batched = isinstance(table, (list, tuple)) or isinstance(query, (list, tuple))
# If batch processing is required, call 'batch_encode_plus' method
if is_batched:
return self.batch_encode_plus(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
else:
# If not batched, call 'encode_plus' method
return self.encode_plus(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def batch_encode_plus(
self,
table: Union["pd.DataFrame", List["pd.DataFrame"]],
query: Optional[List[TextInput]] = None,
answer: List[str] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str] = None,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
<Tip warning={true}>
This method is deprecated, `__call__` should be used instead.
</Tip>
"""
# Resolve the padding and truncation strategies and the max length, handling backward-compatible arguments
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
# Call the internal `_batch_encode_plus` method to perform the batch encoding
return self._batch_encode_plus(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
# Define `_batch_encode_plus`, which batch-encodes the input table data and returns a BatchEncoding object
def _batch_encode_plus(
self,
table: Union["pd.DataFrame", List["pd.DataFrame"]],
query: Optional[List[TextInput]] = None,
answer: Optional[List[str]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
# If offset mappings are requested, raise NotImplementedError, since Python tokenizers do not support them
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast."
)
if isinstance(table, pd.DataFrame) and isinstance(query, (list, tuple)):
# Single table, multiple queries: duplicate the table for every query
table = [table] * len(query)
if isinstance(table, (list, tuple)) and isinstance(query, str):
# Multiple tables, single query: duplicate the query for every table
query = [query] * len(table)
# Call the internal `_batch_prepare_for_model` method to prepare the model inputs
batch_outputs = self._batch_prepare_for_model(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=return_tensors,
verbose=verbose,
)
# Wrap the prepared batch outputs in a BatchEncoding object and return it
return BatchEncoding(batch_outputs)
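# Illustrative broadcasting behaviour (a minimal sketch; `tokenizer` and `table` as above):
#
#   >>> queries = ["how many rows are there?", "which city hosted in 2012?"]
#   >>> enc = tokenizer(table=table, query=queries)  # one table, two queries
#
# Internally the single DataFrame is duplicated (`table = [table] * len(query)`) before
# `_batch_prepare_for_model` runs; the symmetric case (several tables with a single query
# string) duplicates the query instead.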
# Decorator that appends extra documentation for the encoding keyword arguments
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
# Define `_batch_prepare_for_model`, which prepares a batch of table/query/answer triples as model inputs
def _batch_prepare_for_model(
self,
table: Union["pd.DataFrame", List["pd.DataFrame"]],
# table holds the input data: a single pandas DataFrame or a list of DataFrames
query: Optional[Union[TextInput, List[TextInput]]] = None,
# query is optional: a single text input or a list of text inputs used as the question(s)
answer: Optional[Union[str, List[str]]] = None,
# answer is optional: a single answer string or a list of answer strings used as targets
add_special_tokens: bool = True,
# add_special_tokens indicates whether to add special tokens
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
# padding_strategy specifies the padding strategy, no padding by default
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
# truncation_strategy specifies the truncation strategy, no truncation by default
max_length: Optional[int] = None,
# max_length optionally bounds the sequence length
stride: int = 0,
# stride sets the step of the sliding window
pad_to_multiple_of: Optional[int] = None,
# pad_to_multiple_of optionally pads sequences to a multiple of this value
return_tensors: Optional[str] = None,
# return_tensors optionally selects the tensor type to return
return_token_type_ids: Optional[bool] = None,
# return_token_type_ids indicates whether to return token type IDs
return_attention_mask: Optional[bool] = None,
# return_attention_mask indicates whether to return the attention mask
return_overflowing_tokens: bool = False,
# return_overflowing_tokens indicates whether to return overflowing tokens
return_special_tokens_mask: bool = False,
# return_special_tokens_mask indicates whether to return the special tokens mask
return_length: bool = False,
# return_length indicates whether to return length information
verbose: bool = True,
# verbose indicates whether to print detailed information
) -> BatchEncoding:
"""
This method adds special tokens, truncates sequences if overflowing while taking into account the special
tokens and manages a moving window (with user defined stride) for overflowing tokens.
"""
batch_outputs = {}  # initialize an empty dict for the batch outputs
if answer is None:  # if no answers are provided, initialize a list of None of the same length as the tables
answer = [None] * len(table)
for _table, _query, _answer in zip(table, query, answer):
text = self.prepare_table_query(
_table, _query, _answer, truncation_strategy=truncation_strategy, max_length=max_length
)
if self.do_lower_case:  # lowercase the text if requested
text = text.lower()
tokens = self.tokenize(text)  # tokenize the text into a list of tokens
outputs = self.prepare_for_model(
ids=self.convert_tokens_to_ids(tokens),  # convert the tokens to their token IDs
add_special_tokens=add_special_tokens,  # whether to add special tokens
padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterwards
truncation=truncation_strategy.value,  # truncation strategy
max_length=max_length,  # maximum length
stride=stride,  # sliding-window stride
pad_to_multiple_of=None,  # we pad in batch afterwards
return_attention_mask=False,  # we pad in batch afterwards
return_token_type_ids=return_token_type_ids,  # return token type IDs
return_overflowing_tokens=return_overflowing_tokens,  # return overflowing tokens
return_special_tokens_mask=return_special_tokens_mask,  # return the special tokens mask
return_length=return_length,  # return lengths
return_tensors=None,  # we convert the whole batch to tensors at the end
prepend_batch_axis=False,  # do not add a batch axis to the outputs
verbose=verbose,  # verbosity
)
for key, value in outputs.items():  # collect every output value into the batch outputs dict
if key not in batch_outputs:  # initialize an empty list for keys seen for the first time
batch_outputs[key] = []
batch_outputs[key].append(value)  # append the value to the list for this key
batch_outputs = self.pad(  # pad the collected batch outputs
batch_outputs,
padding=padding_strategy.value,  # padding strategy
max_length=max_length,  # maximum length
pad_to_multiple_of=pad_to_multiple_of,  # pad to a multiple of this value
return_attention_mask=return_attention_mask,  # return the attention mask
)
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)  # convert the padded outputs to a BatchEncoding
return batch_outputs  # return the batch encoding object
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING)
def encode(
self,
table: "pd.DataFrame",
query: Optional[TextInput] = None,
answer: Optional[str] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = None,
max_length: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> List[int]:
"""
Prepare a table, a string and possible answer for the model. This method does not return token type IDs,
attention masks, etc. which are necessary for the model to work correctly. Use this method if you want to build
your processing on your own, otherwise refer to `__call__`.
"""
# Call `encode_plus` to encode the given table, query, and answer, and return the encoded result
encoded_inputs = self.encode_plus(
table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
return_tensors=return_tensors,
**kwargs,
)
# Return the `input_ids` of the encoded inputs, i.e. the tokenized table data
return encoded_inputs["input_ids"]
@add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
def encode_plus(
self,
table: "pd.DataFrame",
query: Optional[TextInput] = None,
answer: Optional[str] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str] = None,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
# Resolve the padding and truncation strategies and their related arguments
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
# Call the internal `_encode_plus` method to do the actual encoding and return a `BatchEncoding`
return self._encode_plus(
table=table,
query=query,
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
# Define the private `_encode_plus` method, which encodes a table, query, and answer into model inputs
def _encode_plus(
self,
table: "pd.DataFrame",
query: Optional[TextInput] = None,
answer: Optional[str] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
# Raise NotImplementedError if offset mappings are requested
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
# Prepare the table, query, and answer, producing text according to the truncation and max-length strategies
text = self.prepare_table_query(
table, query, answer, truncation_strategy=truncation_strategy, max_length=max_length
)
# Lowercase the text if requested
if self.do_lower_case:
text = text.lower()
# Tokenize the text
tokens = self.tokenize(text)
# Prepare the model inputs: convert tokens to IDs, add special tokens, pad and truncate, etc.
return self.prepare_for_model(
ids=self.convert_tokens_to_ids(tokens),
add_special_tokens=add_special_tokens,
padding=padding_strategy.value,
truncation=truncation_strategy.value,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
prepend_batch_axis=True,
return_attention_mask=return_attention_mask,
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
verbose=verbose,
)
def target_call_func(
self,
answer: Union[str, List[str]],
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
The method tokenizes and prepares the answer label for the model.
Args:
answer (`str` or `List[str]`):
Corresponding answer supervision to the queries for training the model.
"""
# Check whether `answer` is a batched input (a list or tuple)
is_batched = isinstance(answer, (list, tuple))
# If `answer` is batched, call the batch encoding method `target_batch_encode_plus`
if is_batched:
return self.target_batch_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
# Otherwise call the single-example encoding method `target_encode_plus`
else:
return self.target_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def target_batch_encode_plus(
self,
answer: List[str],
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str] = None,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
Prepare answer strings for the model.
Args:
answer `List[str]`:
Corresponding answer supervision to the queries for training the model.
"""
# Resolve the padding and truncation strategies and related arguments, keeping backward compatibility
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
# Call the internal method to perform the encoding and return the batch encoding result
return self._target_batch_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _target_batch_encode_plus(
self,
answer: List[str],
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
    ) -> BatchEncoding:
"""
Internal method to perform batch encoding of answers.
Args:
answer `List[str]`:
List of answer strings to encode.
add_special_tokens `bool`:
Whether to add special tokens.
padding_strategy `PaddingStrategy`:
Strategy for padding sequences.
truncation_strategy `TruncationStrategy`:
Strategy for truncating sequences.
max_length `Optional[int]`:
Maximum length of the sequences.
stride `int`:
Stride for tokenization.
pad_to_multiple_of `Optional[int]`:
Pad to a multiple of this value.
return_tensors `Optional[Union[str, TensorType]]`:
Optionally return tensors.
return_token_type_ids `Optional[bool]`:
Whether to return token type IDs.
return_attention_mask `Optional[bool]`:
Whether to return attention masks.
return_overflowing_tokens `bool`:
Whether to return overflowing tokens.
return_special_tokens_mask `bool`:
Whether to return special tokens mask.
return_offsets_mapping `bool`:
Whether to return offsets mapping.
return_length `bool`:
Whether to return sequence lengths.
verbose `bool`:
Whether to print verbose information.
**kwargs:
Additional keyword arguments.
Returns:
`BatchEncoding`: Batch encoding containing encoded answers.
"""
        # Encode each answer individually; padding and tensor conversion happen
        # once the whole batch has been collected
        batch_outputs = {}
        for text in answer:
            # Lower-case the text if the tokenizer is configured to do so
            if self.do_lower_case:
                text = text.lower()

            # Tokenize the text
            tokens = self.tokenize(text)
            # Convert the tokens to ids and prepare the model inputs; padding and
            # tensor conversion are deferred until the whole batch is collected
            outputs = self.prepare_for_model(
                ids=self.convert_tokens_to_ids(tokens),
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterwards
                truncation=truncation_strategy.value,
                max_length=max_length,
                stride=stride,
                pad_to_multiple_of=None,  # we pad in batch afterwards
                return_attention_mask=False,  # we pad in batch afterwards
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_length=return_length,
                return_tensors=None,  # the whole batch is converted to tensors at the end
                prepend_batch_axis=False,
                verbose=verbose,
            )
            # Accumulate this example's outputs into the batch dictionary
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
        # Pad the collected batch outputs
        batch_outputs = self.pad(
            batch_outputs,
            padding=padding_strategy.value,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        # Wrap the padded outputs in a `BatchEncoding`, converting to tensors if requested
        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)

        return batch_outputs
def target_encode(
self,
answer: str,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = None,
max_length: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs,
) -> List[int]:
"""
Prepare the answer string for the model. This method does not return token type IDs, attention masks, etc.
which are necessary for the model to work correctly. Use this method if you want to build your processing on
your own, otherwise refer to `__call__`.
Args:
answer `str`:
Corresponding answer supervision to the queries for training the model
"""
        # Encode the answer string; only the input ids are returned to the caller
encoded_outputs = self.target_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
return_tensors=return_tensors,
**kwargs,
)
        # Return the list of encoded input ids
        return encoded_outputs["input_ids"]
def target_encode_plus(
self,
answer: str,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str] = None,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
) -> BatchEncoding:
"""
        Prepare an answer string for the model.
Args:
answer `str`:
Corresponding answer supervision to the queries for training the model.
"""
        # Resolve the padding and truncation strategies, handling backward compatibility
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
        # Delegate to the internal method `_target_encode_plus`
return self._target_encode_plus(
answer=answer,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _target_encode_plus(
self,
answer: str,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs,
    ) -> BatchEncoding:
        # Offset mappings are not supported by Python (slow) tokenizers
if return_offsets_mapping:
raise NotImplementedError(
"return_offset_mapping is not available when using Python tokenizers. "
"To use this feature, change your tokenizer to one deriving from "
"transformers.PreTrainedTokenizerFast. "
"More information on available tokenizers at "
"https://github.com/huggingface/transformers/pull/2674"
)
        text = answer

        # Lower-case the text if the tokenizer is configured to do so
        if self.do_lower_case:
            text = text.lower()

        # Tokenize the answer and prepare the model inputs
        tokens = self.tokenize(text)
        return self.prepare_for_model(
            ids=self.convert_tokens_to_ids(tokens),
            add_special_tokens=add_special_tokens,
            padding=padding_strategy.value,
            truncation=truncation_strategy.value,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            prepend_batch_axis=True,
            return_attention_mask=return_attention_mask,
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            verbose=verbose,
        )
    def prepare_table_query(
        self, table, query, answer=None, truncation_strategy=None, max_length=None
    ):
"""
This method can be used to linearize a table and add a corresponding query.
Optionally, it also handles truncation of the table (cells).
An answer can be provided for more precise truncation.
"""
        if not table.empty:
            # step 1: create a table dictionary with the header and the row values
            table_content = {"header": list(table.columns), "rows": [list(row.values) for i, row in table.iterrows()]}

            # step 2: modify the table internally
            # always truncate table cells based on self.max_cell_length
            # optionally truncate rows if truncation_strategy is set to it
            self.truncate_table_cells(table_content, query, answer)
            if truncation_strategy == TapexTruncationStrategy.DROP_ROWS_TO_FIT:
                self.truncate_table_rows(table_content, query, answer, max_length=max_length)

            # step 3: linearize the table
            linear_table = self.table_linearize.process_table(table_content)
else:
linear_table = ""
        if linear_table == "":
            logger.warning(
                "You provided an empty table, or all cells contain too many tokens (e.g., >= 1024 tokens). "
                + f"Please carefully check the corresponding table with the query : {query}."
            )
        if query == "":
            logger.warning("You provided nothing to query with respect to the table.")
        # step 4: concatenate the query with the linearized table
separator = " " if query and linear_table else ""
joint_input = (query + separator + linear_table) if query else linear_table
return joint_input
    def truncate_table_cells(self, table_content: Dict, question: str, answer: List):
        # TODO (Qian): is it possible to revert the original cell if it is in the final answer?
        # Truncate table cells, recording the mapping from original to truncated values
        cell_mapping = {}
for row in table_content["rows"]:
for i, cell in enumerate(row):
truncate_cell = self.truncate_cell(cell)
if truncate_cell is not None:
cell_mapping[cell] = truncate_cell
row[i] = truncate_cell
        # Modify the answer list: if an answer maps to a truncated cell, update it to the truncated value
if answer is not None:
for i, case in enumerate(answer):
if case in cell_mapping.keys():
answer[i] = cell_mapping[case]
    def truncate_cell(self, cell_value):
        # do not process numeric cells: return ints and floats unchanged
        if isinstance(cell_value, int) or isinstance(cell_value, float):
            return cell_value

        # For non-blank cells, tokenize and truncate to self.max_cell_length tokens
        if cell_value.strip() != "":
try_tokens = self.tokenize(cell_value)
if len(try_tokens) >= self.max_cell_length:
retain_tokens = try_tokens[: self.max_cell_length]
retain_cell_value = self.convert_tokens_to_string(retain_tokens)
return retain_cell_value
else:
return None
else:
return cell_value
def truncate_table_rows(
self, table_content: Dict, question: str, answer: Optional[Union[str, List[str]]] = None, max_length=None
):
"""
Args:
table_content:
{"header": xxx, "rows": xxx, "id" (Optionally): xxx}
question:
natural language sentence
answer:
if for training, is the supervision; otherwise will be empty
"""
        # Estimate the fraction of rows to delete and the remaining token budget
        delete_ratio, remain_token_len = self.estimate_delete_ratio(table_content, question, max_length)
        # Randomly delete rows unrelated to the question/answer
        self.delete_unrelated_rows(table_content, question, answer, delete_ratio)
        # Guarantee that the result length stays below max_length
        maximum_keep_rows = 0
        for ind, row_example in enumerate(table_content["rows"]):
            # Linearize the row and count its tokens
            value_string = self.table_linearize.process_row(row_example, ind + 1)
            value_token_len = len(self.tokenize(value_string))
            # Stop once a row no longer fits in the remaining budget
            if value_token_len > remain_token_len:
                break
            # Update the remaining budget and the number of rows to keep
            remain_token_len -= value_token_len
            maximum_keep_rows += 1
        # Drop the rows that exceed the budget
        del table_content["rows"][maximum_keep_rows:]
    def estimate_delete_ratio(self, table_content: Dict, question: str, max_length=None):
        if "header" not in table_content or "rows" not in table_content:
            raise ValueError("The table content should contain both 'header' and 'rows' keys.")
        # Count the question tokens (including special tokens)
        question_tokens = self.tokenize(question, add_special_tokens=True)
        # Linearize the header and count its tokens (excluding special tokens)
        header_string = self.table_linearize.process_header(table_content["header"])
        header_tokens = self.tokenize(header_string, add_special_tokens=False)
        # Token budget left for the row values
        used_token_len = len(question_tokens) + len(header_tokens)
        remain_token_len = max_length - used_token_len

        # Roughly estimate the total token count of all rows
        value_string = ""
        for _, row_example in enumerate(table_content["rows"]):
            value_string += self.table_linearize.process_row(row_example, 100) + " "
        value_token_len = len(self.tokenize(value_string))

        if value_token_len < remain_token_len:
            # All rows fit: nothing needs to be deleted
            return 0.0, remain_token_len
        else:
            # Otherwise return an approximate delete ratio
            return 1.0 - remain_token_len / value_token_len, remain_token_len
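    # Worked example of the ratio above: with a budget of 300 remaining tokens and
    # rows that linearize to roughly 1200 tokens, the method returns
    # (1.0 - 300 / 1200, 300) = (0.75, 300), i.e. about three quarters of the rows
    # should be dropped before the linearized table can fit.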
    def delete_unrelated_rows(self, table_content: Dict, question: str, answer: List, delete_ratio: float):
        """
        The argument answer is used only during training.
        """
        truncated_unrelated_indices = []
        related_indices = []
        # Build the answer set (lower-cased); empty when no answer is given
        if answer is None or len(answer) == 0:
            answer_set = set()
        else:
            answer_set = {ans_ex.lower() for ans_ex in answer}

        # If a question is given, add its words to the answer set
        if question is not None:
            answer_set.update(question.split())

        # Question words, with trailing punctuation stripped
        question_set = set(question.strip("?!.,").split(" "))
        row_max_len = len(table_content["rows"])
        for _row_idx, row in enumerate(table_content["rows"]):
            lower_row = {str(cell).lower() for cell in row}
            # A row is unrelated if it shares no word with either the answer set or the question set
            if len(lower_row & answer_set) == 0 and len(lower_row & question_set) == 0:
                truncated_unrelated_indices.append(_row_idx)
            else:
                # Keep related rows together with their two neighbours on each side
                related_indices.extend([_row_idx - 2, _row_idx - 1, _row_idx, _row_idx + 1, _row_idx + 2])

        # Remove the indices of related rows from the unrelated candidates
        truncated_unrelated_indices = [
            _row_idx for _row_idx in truncated_unrelated_indices if _row_idx not in related_indices
        ]
        # Number of rows to drop: bounded by the unrelated candidates and by delete_ratio
        drop_items = min(len(truncated_unrelated_indices), int(len(table_content["rows"]) * delete_ratio))
        # Randomly choose which unrelated rows to drop (note: sampled with replacement)
        drop_row_indices = random.choices(truncated_unrelated_indices, k=drop_items)

        # Iterate over the row indices in reverse order and delete the chosen rows
        for _row_idx in reversed(range(row_max_len)):
            if _row_idx in drop_row_indices:
                del table_content["rows"][_row_idx]

        # Log a warning when rows were dropped from an identified table
        if "id" in table_content and len(drop_row_indices) > 0:
            logger.warning("Delete {} rows in table {}".format(len(drop_row_indices), table_content["id"]))
.\models\deprecated\tapex\__init__.py
# Type-checking helper
from typing import TYPE_CHECKING

# Lazy-loading helper
from ....utils import _LazyModule

# Import structure: the tokenization_tapex module exposes the TapexTokenizer class
_import_structure = {"tokenization_tapex": ["TapexTokenizer"]}

# During type checking, import TapexTokenizer directly so type checkers can see it
if TYPE_CHECKING:
    from .tokenization_tapex import TapexTokenizer
# At runtime, replace this module with a _LazyModule so the actual contents are
# only imported when they are first accessed
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
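# To make the lazy-loading pattern above concrete, here is a minimal, self-contained
# sketch of the idea behind `_LazyModule` (a toy stand-in, not transformers' actual
# implementation): attribute access triggers the real import and caches the result.
import importlib
import types


class MiniLazyModule(types.ModuleType):
    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each exported name to the submodule that defines it
        self._class_to_module = {
            cls: mod for mod, classes in import_structure.items() for cls in classes
        }

    def __getattr__(self, attr):
        # Import the submodule only on first access, then cache the attribute
        module = importlib.import_module("." + self._class_to_module[attr], self.__name__)
        value = getattr(module, attr)
        setattr(self, attr, value)
        return value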
.\models\deprecated\trajectory_transformer\configuration_trajectory_transformer.py
# coding=utf-8
# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use
# this file except in compliance with the License. You may obtain a copy of the
# License at http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
""" TrajectoryTransformer model configuration"""

from ....configuration_utils import PretrainedConfig
from ....utils import logging

# Module-level logger
logger = logging.get_logger(__name__)
# Map from pretrained model names to the URLs of their configuration files
TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "CarlCochet/trajectory-transformer-halfcheetah-medium-v2": (
        "https://huggingface.co/CarlCochet/trajectory-transformer-halfcheetah-medium-v2/resolve/main/config.json"
    ),
    # See all TrajectoryTransformer models at https://huggingface.co/models?filter=trajectory_transformer
}
class TrajectoryTransformerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`TrajectoryTransformerModel`]. It is used to
    instantiate a TrajectoryTransformer model according to the specified arguments, defining the model architecture.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the
    TrajectoryTransformer
    [CarlCochet/trajectory-transformer-halfcheetah-medium-v2](https://huggingface.co/CarlCochet/trajectory-transformer-halfcheetah-medium-v2)
    architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    ```
    >>> from transformers import TrajectoryTransformerConfig, TrajectoryTransformerModel

    >>> # Initializing a TrajectoryTransformer CarlCochet/trajectory-transformer-halfcheetah-medium-v2 style configuration
    >>> configuration = TrajectoryTransformerConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = TrajectoryTransformerModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "trajectory_transformer"
    # Keys to ignore during inference
    keys_to_ignore_at_inference = ["past_key_values"]
    # Map from standard attribute names to the names actually used by the model
    attribute_map = {
        "hidden_size": "n_embd",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }
    def __init__(
        self,
        vocab_size=100,  # vocabulary size
        action_weight=5,  # weight of the action in the loss
        reward_weight=1,  # weight of the reward in the loss
        value_weight=1,  # weight of the value in the loss
        block_size=249,  # context block size
        action_dim=6,  # dimensionality of actions
        observation_dim=17,  # dimensionality of observations
        transition_dim=25,  # dimensionality of a full transition
        n_layer=4,  # number of layers
        n_head=4,  # number of attention heads
        n_embd=128,  # embedding dimension
        embd_pdrop=0.1,  # embedding dropout rate
        attn_pdrop=0.1,  # attention dropout rate
        resid_pdrop=0.1,  # residual dropout rate
        learning_rate=0.0006,  # learning rate
        max_position_embeddings=512,  # maximum number of position embeddings
        initializer_range=0.02,  # initializer range
        layer_norm_eps=1e-12,  # layer-norm epsilon
        kaiming_initializer_range=1,  # Kaiming initializer range
        use_cache=True,  # whether to use the key/value cache
        pad_token_id=1,  # padding token id
        bos_token_id=50256,  # beginning-of-sequence token id
        eos_token_id=50256,  # end-of-sequence token id
        **kwargs,  # forwarded to the parent constructor
    ):
        # Store all configuration attributes on the instance
        self.vocab_size = vocab_size
        self.action_weight = action_weight
        self.reward_weight = reward_weight
        self.value_weight = value_weight
        self.max_position_embeddings = max_position_embeddings
        self.block_size = block_size
        self.action_dim = action_dim
        self.observation_dim = observation_dim
        self.transition_dim = transition_dim
        self.learning_rate = learning_rate
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_embd = n_embd
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.resid_pdrop = resid_pdrop
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.kaiming_initializer_range = kaiming_initializer_range
        self.use_cache = use_cache
        # Pass the special token ids and any remaining kwargs to the parent constructor
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )
.\models\deprecated\trajectory_transformer\convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py
# coding=utf-8
# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TrajectoryTransformer pytorch checkpoint conversion"""
import torch
import trajectory.utils as utils
from transformers import TrajectoryTransformerModel
class Parser(utils.Parser):
    dataset: str = "halfcheetah-medium-expert-v2"  # default dataset name
    config: str = "config.offline"  # default configuration module
def convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch(logbase, dataset, loadpath, epoch, device):
    """Converting Sequential blocks to ModuleList"""

    # Load the original GPT model at the requested epoch on the requested device
    gpt, gpt_epoch = utils.load_model(logbase, dataset, loadpath, epoch=epoch, device=device)
    # Instantiate a TrajectoryTransformerModel with the same configuration
    trajectory_transformer = TrajectoryTransformerModel(gpt.config)

    # Copy the token embedding, dropout, final layer norm and head weights
    trajectory_transformer.tok_emb.load_state_dict(gpt.tok_emb.state_dict())
    # The position embedding is a plain parameter and can be assigned directly
    trajectory_transformer.pos_emb = gpt.pos_emb
    trajectory_transformer.drop.load_state_dict(gpt.drop.state_dict())
    trajectory_transformer.ln_f.load_state_dict(gpt.ln_f.state_dict())
    trajectory_transformer.head.load_state_dict(gpt.head.state_dict())

    # Copy the weights block by block
    for i, block in enumerate(gpt.blocks):
        # Layer norms of block i
        trajectory_transformer.blocks[i].ln1.load_state_dict(gpt.blocks[i].ln1.state_dict())
        trajectory_transformer.blocks[i].ln2.load_state_dict(gpt.blocks[i].ln2.state_dict())
        # Attention layer of block i
        trajectory_transformer.blocks[i].attn.load_state_dict(gpt.blocks[i].attn.state_dict())

        # The original MLP is an nn.Sequential; map its indexed layers onto named modules
        trajectory_transformer.blocks[i].l1.load_state_dict(gpt.blocks[i].mlp[0].state_dict())
        trajectory_transformer.blocks[i].act.load_state_dict(gpt.blocks[i].mlp[1].state_dict())
        trajectory_transformer.blocks[i].l2.load_state_dict(gpt.blocks[i].mlp[2].state_dict())
        trajectory_transformer.blocks[i].drop.load_state_dict(gpt.blocks[i].mlp[3].state_dict())

    # Save the converted state dict as pytorch_model.bin
    torch.save(trajectory_transformer.state_dict(), "pytorch_model.bin")
if __name__ == "__main__":
"""
To run this script you will need to install the original repository to run the original model. You can find it
here: https://github.com/jannerm/trajectory-transformer From this repository code you can also download the
original pytorch checkpoints.
Run with the command:
```
>>> python convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py --dataset <dataset_name>
... --gpt_loadpath <path_to_original_pytorch_checkpoint>
```
"""
    # Parse the command-line arguments, using the "plan" defaults
    args = Parser().parse_args("plan")

    # Convert the original PyTorch checkpoint, reading the log base path, dataset,
    # GPT checkpoint path, epoch and device from the parsed arguments
    convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch(
        args.logbase, args.dataset, args.gpt_loadpath, args.gpt_epoch, args.device
    )
.\models\deprecated\trajectory_transformer\modeling_trajectory_transformer.py
# coding=utf-8
# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch TrajectoryTransformer model."""
import math
import os
from dataclasses import dataclass
from typing import Optional, Tuple, Union
import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import functional as F
from ....modeling_utils import PreTrainedModel
from ....utils import (
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_trajectory_transformer import TrajectoryTransformerConfig
# Module-level logger
logger = logging.get_logger(__name__)

# Checkpoint and configuration references used in the docstrings
_CHECKPOINT_FOR_DOC = "CarlCochet/trajectory-transformer-halfcheetah-medium-v2"
_CONFIG_FOR_DOC = "TrajectoryTransformerConfig"

# List of pretrained model archives
TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "CarlCochet/trajectory-transformer-halfcheetah-medium-v2",
    # See all TrajectoryTransformer models at https://huggingface.co/models?filter=trajectory_transformer
]
# Load TensorFlow checkpoint weights into a PyTorch model
def load_tf_weights_in_trajectory_transformer(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    # Resolve the absolute path of the TensorFlow checkpoint
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load the variable names and weights from the TF checkpoint
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)
    for name, array in zip(names, arrays):
        # Split the variable name on "/" into its scope parts
        name = name.split("/")

        # Skip optimizer state and bookkeeping variables, which are not needed
        # for the pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue

        # Walk the module tree, starting from the model itself
        pointer = model
        for m_name in name:
            # Names of the form "xxx_0" are split into the scope name and its index
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            # Map TF scope names to the corresponding PyTorch attributes
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    # No matching attribute: log and skip this variable
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            # If an index was split off, descend into the indexed submodule
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]

        # Embedding variables map to the module's weight attribute
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            # TF kernels are transposed relative to PyTorch linear weights
            array = np.transpose(array)

        # Verify that the checkpoint array matches the parameter shape
        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise

        logger.info(f"Initialize PyTorch weight {name}")
        # Copy the numpy array into the parameter tensor
        pointer.data = torch.from_numpy(array)

    # Return the updated model
    return model
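# A quick illustration of the scope-name splitting used by the loader above.
# `_demo_scope_name_split` is a hypothetical helper added here for exposition;
# the variable names it prints are made up:
def _demo_scope_name_split():
    import re

    for m_name in ["blocks_3", "kernel", "word_embeddings"]:
        if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
            print(re.split(r"_(\d+)", m_name))  # -> ['blocks', '3', ''] for "blocks_3"
        else:
            print([m_name])  # -> ['kernel'], then ['word_embeddings']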
# Dataclass describing the outputs of the trajectory transformer model
@dataclass
class TrajectoryTransformerOutput(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`Tuple[Tuple[torch.Tensor]]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of length `config.n_layers`, containing tuples of tensors of shape `(batch_size, num_heads,
            sequence_length, embed_size_per_head)`). Contains pre-computed hidden-states (key and values in the
            attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. GPT2Attentions weights after the attention softmax, used to compute the weighted average
            in the self-attention heads.
    """
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
class TrajectoryTransformerPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = TrajectoryTransformerConfig
    load_tf_weights = load_tf_weights_in_trajectory_transformer
    base_model_prefix = "trajectory_transformer"
    main_input_name = "trajectories"
    supports_gradient_checkpointing = True
    def _init_weights(self, module):
        """Initialize the weights of a single module."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Initialize linear/embedding weights from N(0, initializer_range)
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            # Zero the bias of linear layers, if present
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            # LayerNorm: zero bias, ones weight
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, EinLinear):
            # EinLinear: Kaiming-uniform weights for each sub-model
            for i in range(module.n_models):
                nn.init.kaiming_uniform_(module.weight[i], a=math.sqrt(5) / self.config.kaiming_initializer_range)
                if module.bias is not None:
                    # Bias bound derived from the fan-in of the corresponding weight
                    fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight[i])
                    bound = (1 / math.sqrt(fan_in)) * self.config.initializer_range
                    nn.init.uniform_(module.bias[i], -bound, bound)
# Docstring describing the model class and its parameters
TRAJECTORY_TRANSFORMER_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`TrajectoryTransformerConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Docstring describing the model's inputs
TRAJECTORY_TRANSFORMER_INPUTS_DOCSTRING = r"""
Args:
trajectories (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Batch of trajectories, where a trajectory is a sequence of states, actions and rewards.
past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`, *optional*):
Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
`past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
their past given to this model should not be passed as `input_ids` as they have already been computed.
targets (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Desired targets used to compute the loss.
attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class EinLinear(nn.Module):
    def __init__(self, n_models, in_features, out_features, bias):
        super().__init__()
        self.n_models = n_models  # number of independent sub-models
        self.out_features = out_features
        self.in_features = in_features
        # One weight matrix per sub-model
        self.weight = nn.Parameter(torch.Tensor(n_models, out_features, in_features))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(n_models, out_features))
        else:
            # No bias: register an empty parameter slot
            self.register_parameter("bias", None)
    def reset_parameters(self):
        for i in range(self.n_models):
            # Kaiming-uniform initialization of each sub-model's weight
            nn.init.kaiming_uniform_(self.weight[i], a=math.sqrt(5))
            if self.bias is not None:
                # Uniform bias initialization with a fan-in-derived bound
                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight[i])
                bound = 1 / math.sqrt(fan_in)
                nn.init.uniform_(self.bias[i], -bound, bound)
    def forward(self, input):
        """
        Args:
            input (`torch.FloatTensor` of shape `(B, n_models, input_dim)`):
                The input to the layer.
        """
        # Batched matrix multiply via Einstein summation:
        # [ batch_size x n_models x output_dim ]
        output = torch.einsum("eoi,bei->beo", self.weight, input)
        if self.bias is not None:
            # The bias path is not implemented for this layer
            raise RuntimeError()
        return output
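# A quick shape check for the einsum contraction above, using toy sizes.
# `_demo_einlinear_shapes` is a hypothetical helper added for exposition:
def _demo_einlinear_shapes():
    weight = torch.randn(3, 7, 5)  # [n_models, out_features, in_features]
    x = torch.randn(2, 3, 5)  # [batch_size, n_models, in_features]
    y = torch.einsum("eoi,bei->beo", weight, x)
    print(y.shape)  # torch.Size([2, 3, 7])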
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()

        if config.n_embd % config.n_head != 0:
            raise ValueError(f"n_head ({config.n_head}) should be a divisor of n_embd ({config.n_embd})")

        # key, query, value projections for all heads
        self.key = nn.Linear(config.n_embd, config.n_embd)
        self.query = nn.Linear(config.n_embd, config.n_embd)
        self.value = nn.Linear(config.n_embd, config.n_embd)

        # regularization
        self.attn_drop = nn.Dropout(config.attn_pdrop)
        self.resid_drop = nn.Dropout(config.resid_pdrop)

        # output projection
        self.proj = nn.Linear(config.n_embd, config.n_embd)

        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer(
            "mask",
            torch.tril(torch.ones(config.block_size, config.block_size)).view(
                1, 1, config.block_size, config.block_size
            ),
            persistent=False,
        )

        # mask previous value estimates
        joined_dim = config.observation_dim + config.action_dim + 2
        self.mask.squeeze()[:, joined_dim - 1 :: joined_dim] = 0

        self.n_head = config.n_head
def forward(
self,
hidden_states: Optional[Tuple[torch.FloatTensor]],
layer_past: Optional[Tuple[torch.Tensor]] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
):
        # Unpack the hidden-state dimensions: batch size, sequence length, embedding dim
        batch_size, sequence_length, embedding_dim = hidden_states.size()

        # Compute query, key, value for all heads and move the head dimension forward
        # [ batch_size x n_heads x sequence_length x head_dim ]
        key = (
            self.key(hidden_states)
            .view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head)
            .transpose(1, 2)
        )
        query = (
            self.query(hidden_states)
            .view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head)
            .transpose(1, 2)
        )
        value = (
            self.value(hidden_states)
            .view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head)
            .transpose(1, 2)
        )

        if layer_past is not None:
            past_key, past_value = layer_past
            # Concatenate the cached keys/values with the freshly computed ones
            key = torch.cat((past_key, key), dim=-2)
            value = torch.cat((past_value, value), dim=-2)

        if use_cache is True:
            # Cache the current keys and values for the next decoding step
            present = (key, value)
        else:
            present = None

        # causal self-attention
        # [ batch_size x n_heads x sequence_length x sequence_length ]
        attn_weights = (torch.matmul(query, key.transpose(-2, -1))) * (1.0 / math.sqrt(key.size(-1)))
        # Apply the causal mask so attention cannot look ahead of the current position
        attn_weights = attn_weights.masked_fill(
            self.mask[:, :, :sequence_length, :sequence_length] == 0, torch.finfo(attn_weights.dtype).min
        )
        # Normalize the attention weights
        attn_weights = F.softmax(attn_weights, dim=-1)
        # Keep a copy of the attention map for later analysis or visualization
        self._attn_map = attn_weights.clone()
        # Apply dropout to the attention weights
        attn_weights = self.attn_drop(attn_weights)

        output = torch.matmul(attn_weights, value)
        # [ batch_size x sequence_length x embedding_dim ]
        # Re-assemble all head outputs side by side
        output = output.transpose(1, 2).contiguous().view(batch_size, sequence_length, embedding_dim)

        # Output projection followed by residual dropout
        output = self.resid_drop(self.proj(output))
        outputs = (output, present)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs
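# A small, self-contained view of the causal mask built in __init__ above, with
# toy sizes (`_demo_causal_mask` is a hypothetical helper added for exposition):
def _demo_causal_mask():
    block_size, joined_dim = 8, 4  # real models use config.block_size etc.
    mask = torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size)
    # Hide previous value estimates, exactly as in CausalSelfAttention.__init__
    mask.squeeze()[:, joined_dim - 1 :: joined_dim] = 0
    print(mask[0, 0].int())  # lower-triangular matrix with columns 3 and 7 zeroed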
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Layer norms before the attention and before the MLP
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)
        # Causal self-attention
        self.attn = CausalSelfAttention(config)

        # MLP: expand to 4x the embedding dimension, apply GELU, project back
        self.l1 = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.act = nn.GELU()
        self.l2 = nn.Linear(4 * config.n_embd, config.n_embd)
        # Residual dropout
        self.drop = nn.Dropout(config.resid_pdrop)
    def forward(
        self,
        hidden_states: Optional[Tuple[torch.FloatTensor]],
        layer_past: Optional[Tuple[torch.Tensor]] = None,
        use_cache: Optional[bool] = False,
        output_attentions: Optional[bool] = False,
    ):
        # Attention sub-layer with a residual connection
        residual = hidden_states
        hidden_states = self.ln1(hidden_states)

        attn_outputs = self.attn(
            hidden_states, layer_past=layer_past, use_cache=use_cache, output_attentions=output_attentions
        )
        attn_output = attn_outputs[0]
        outputs = attn_outputs[1:]
        hidden_states = attn_output + residual

        # MLP sub-layer with a residual connection and dropout
        residual = hidden_states
        hidden_states = self.ln2(hidden_states)
        hidden_states = self.l1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.l2(hidden_states)
        hidden_states = residual + self.drop(hidden_states)

        # Prepend the hidden states to the remaining outputs (cache, attentions)
        if use_cache:
            outputs = (hidden_states,) + outputs
        else:
            outputs = (hidden_states,) + outputs[1:]

        return outputs
# Attach a docstring describing the bare model to the class below
@add_start_docstrings(
    "The bare TrajectoryTransformer Model transformer outputting raw hidden-states without any specific head on top.",
    TRAJECTORY_TRANSFORMER_START_DOCSTRING,
)
class TrajectoryTransformerModel(TrajectoryTransformerPreTrainedModel):
class TrajectoryTransformerModel(TrajectoryTransformerPreTrainedModel):
"""the full GPT language model, with a context size of block_size"""
    def __init__(self, config):
        super().__init__(config)

        # Input embedding: one extra slot is reserved for the stop token
        self.tok_emb = nn.Embedding(config.vocab_size * config.transition_dim + 1, config.n_embd)

        # Learned position embedding
        self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
        # Embedding dropout
        self.drop = nn.Dropout(config.embd_pdrop)
        # Stack of transformer blocks
        self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
        # Final layer norm and decoder head
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.head = EinLinear(config.transition_dim, config.n_embd, config.vocab_size + 1, bias=False)

        # Cache the relevant configuration values on the instance
        self.vocab_size = config.vocab_size
        self.stop_token = config.vocab_size * config.transition_dim
        self.block_size = config.block_size

        self.observation_dim = config.observation_dim
        self.action_dim = config.action_dim
        self.transition_dim = config.transition_dim
        self.embedding_dim = config.n_embd

        self.action_weight = config.action_weight
        self.reward_weight = config.reward_weight
        self.value_weight = config.value_weight

        self.gradient_checkpointing = False

        # Run the post-initialization hook (weight init, final processing)
        self.post_init()
    def get_block_size(self):
        return self.block_size

    def offset_tokens(self, trajectories):
        _, sequence_length = trajectories.shape

        # Number of (possibly partial) transitions covered by the sequence
        n_states = int(np.ceil(sequence_length / self.transition_dim))

        # Each position within a transition gets its own slice of the vocabulary
        offsets = torch.arange(self.transition_dim) * self.vocab_size
        offsets = offsets.repeat(n_states).to(trajectories.device)

        # Apply the offsets and map the raw stop marker to the dedicated stop token
        offset_trajectories = trajectories + offsets[:sequence_length]
        offset_trajectories[trajectories == self.vocab_size] = self.stop_token
        return offset_trajectories
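    # Worked example of the offsetting above (toy sizes, not the real defaults):
    # with vocab_size=10 and transition_dim=3,
    #   trajectories = [[2, 5, 1, 7, 0, 3]]
    #   offsets      =  [0, 10, 20, 0, 10, 20]
    #   result       = [[2, 15, 21, 7, 10, 23]]
    # so the same raw token id maps to a different embedding row depending on its
    # position within the transition.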
    def pad_to_full_observation(self, hidden_states):
        batch_size, sequence_length, _ = hidden_states.shape

        # Number of padding positions needed so the length divides transition_dim
        n_pad = (self.transition_dim - sequence_length % self.transition_dim) % self.transition_dim
        padding = torch.zeros(batch_size, n_pad, self.embedding_dim, device=hidden_states.device)

        # [ batch_size x padded_sequence_length x embedding_dim ]
        hidden_states_pad = torch.cat([hidden_states, padding], dim=1)
        # [ (batch_size * padded_sequence_length / transition_dim) x transition_dim x embedding_dim ]
        hidden_states_pad = hidden_states_pad.view(-1, self.transition_dim, self.embedding_dim)

        return hidden_states_pad, n_pad
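    # Shape arithmetic for the reshaping above (toy sizes): with transition_dim=5,
    # a [2 x 7 x 4] input needs n_pad = (5 - 7 % 5) % 5 = 3 padding positions,
    # giving a [2 x 10 x 4] tensor that reshapes to [4 x 5 x 4], i.e. one row per
    # complete transition.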
    # Forward pass of the model
    def forward(
        self,
        trajectories: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        targets: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
.\models\deprecated\trajectory_transformer\__init__.py
# Type-checking helper
from typing import TYPE_CHECKING

# Optional-dependency exception and lazy-loading helpers
from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available

# Import structure of the module
_import_structure = {
    "configuration_trajectory_transformer": [
        "TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
        "TrajectoryTransformerConfig",
    ],
}
# If Torch is available, also expose the modeling symbols
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    _import_structure["modeling_trajectory_transformer"] = [
        "TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TrajectoryTransformerModel",
        "TrajectoryTransformerPreTrainedModel",
        "load_tf_weights_in_trajectory_transformer",
    ]
# During type checking, import the symbols directly
if TYPE_CHECKING:
    from .configuration_trajectory_transformer import (
        TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
        TrajectoryTransformerConfig,
    )

    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        from .modeling_trajectory_transformer import (
            TRAJECTORY_TRANSFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
            TrajectoryTransformerModel,
            TrajectoryTransformerPreTrainedModel,
            load_tf_weights_in_trajectory_transformer,
        )

# At runtime, replace this module with a lazily-loaded one so its contents are
# only imported when they are actually needed
else:
    import sys

    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\deprecated\transfo_xl\configuration_transfo_xl.py
# coding=utf-8
# Copyright and license header: this file is distributed under the Apache License, Version 2.0.
""" Transformer XL configuration"""

from ....configuration_utils import PretrainedConfig
from ....utils import logging

# Module-level logger
logger = logging.get_logger(__name__)

# Map from pretrained model names to the URLs of their configuration files
TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "transfo-xl/transfo-xl-wt103": "https://huggingface.co/transfo-xl/transfo-xl-wt103/resolve/main/config.json",
}
class TransfoXLConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`TransfoXLModel`] or a [`TFTransfoXLModel`]. It
    is used to instantiate a Transformer-XL model according to the specified arguments, defining the model
    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the
    TransfoXL [transfo-xl/transfo-xl-wt103](https://huggingface.co/transfo-xl/transfo-xl-wt103) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Examples:

    ```
    >>> from transformers import TransfoXLConfig, TransfoXLModel

    >>> # Initializing a Transformer XL configuration
    >>> configuration = TransfoXLConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = TransfoXLModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "transfo-xl"
    # Keys to ignore during inference
    keys_to_ignore_at_inference = ["mems"]
    # Map from legacy attribute names to the current ones
    attribute_map = {
        "n_token": "vocab_size",
        "hidden_size": "d_model",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }
    def __init__(
        self,
        vocab_size=267735,  # vocabulary size
        cutoffs=[20000, 40000, 200000],  # cutoffs for the adaptive softmax
        d_model=1024,  # dimensionality of the model's hidden states
        d_embed=1024,  # dimensionality of the embeddings
        n_head=16,  # number of attention heads
        d_head=64,  # dimensionality of each attention head
        d_inner=4096,  # inner dimension of the feed-forward layers
        div_val=4,  # divisor value for the adaptive input and softmax
        pre_lnorm=False,  # apply LayerNorm to the input instead of the output in the blocks
        n_layer=18,  # number of layers
        mem_len=1600,  # length of the retained memory (previous hidden states)
        clamp_len=1000,  # use the same position embeddings after clamp_len
        same_length=True,  # use the same attention length for all tokens
        proj_share_all_but_first=True,  # share all projection matrices except the first
        attn_type=0,  # attention type: 0 for Transformer-XL
        sample_softmax=-1,  # number of samples in the sampled softmax (-1 disables sampling)
        adaptive=True,  # whether to use adaptive softmax
        dropout=0.1,  # global dropout rate
        dropatt=0.0,  # dropout rate on the attention probabilities
        untie_r=True,  # whether to untie the relative position biases
        init="normal",  # parameter initialization method
        init_range=0.01,  # range for uniform parameter initialization
        proj_init_std=0.01,  # standard deviation for projection initialization
        init_std=0.02,  # standard deviation for parameter initialization
        layer_norm_epsilon=1e-5,  # epsilon used by the layer norm
        eos_token_id=0,  # end-of-sequence token id
        **kwargs,  # forwarded to the parent constructor
    ):
        self.vocab_size = vocab_size
        self.cutoffs = []
        self.cutoffs.extend(cutoffs)
        # Tie all projection matrices except the first one, if requested
        if proj_share_all_but_first:
            self.tie_projs = [False] + [True] * len(self.cutoffs)
        else:
            self.tie_projs = [False] + [False] * len(self.cutoffs)
        self.d_model = d_model
        self.d_embed = d_embed
        self.d_head = d_head
        self.d_inner = d_inner
        self.div_val = div_val
        self.pre_lnorm = pre_lnorm
        self.n_layer = n_layer
        self.n_head = n_head
        self.mem_len = mem_len
        self.same_length = same_length
        self.attn_type = attn_type
        self.clamp_len = clamp_len
        self.sample_softmax = sample_softmax
        self.adaptive = adaptive
        self.dropout = dropout
        self.dropatt = dropatt
        self.untie_r = untie_r
        self.init = init
        self.init_range = init_range
        self.proj_init_std = proj_init_std
        self.init_std = init_std
        self.layer_norm_epsilon = layer_norm_epsilon
        # Pass the EOS token id and any remaining kwargs to the parent constructor
        super().__init__(eos_token_id=eos_token_id, **kwargs)
    @property
    def max_position_embeddings(self):
        # Transformer-XL has no sequence length limit, so report -1
        logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.")
        return -1

    @max_position_embeddings.setter
    def max_position_embeddings(self, value):
        # Setting a maximum position is meaningless for a model without a length limit
        raise NotImplementedError(
            f"The model {self.model_type} is one of the few models that has no sequence length limit."
        )
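# A minimal sketch of how the cutoffs interact with tie_projs above, assuming a
# transformers version that still ships the deprecated Transformer-XL code:
#
#     from transformers import TransfoXLConfig
#
#     config = TransfoXLConfig()  # default cutoffs: [20000, 40000, 200000]
#     print(config.tie_projs)     # [False, True, True, True]
#     print(config.hidden_size)   # 1024, resolved to d_model via attribute_map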