.\models\swin\modeling_tf_swin.py
# coding=utf-8
# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
""" TF 2.0 Swin Transformer model."""
from __future__ import annotations
import collections.abc  # standard-library ABCs, used for isinstance checks on iterables
import math  # math helpers
import warnings  # deprecation warnings
from dataclasses import dataclass  # decorator for the output dataclasses
from functools import partial  # partial application helper
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union  # type hints
import tensorflow as tf  # TensorFlow
from ...activations_tf import ACT2FN  # mapping from activation-function names to TF implementations
from ...modeling_tf_utils import (  # TF modeling utilities
TFPreTrainedModel,
TFSequenceClassificationLoss,
get_initializer,
keras,
keras_serializable,
unpack_inputs,
)
from ...tf_utils import shape_list  # helper returning static/dynamic tensor shapes
from ...utils import (  # generic utilities
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_swin import SwinConfig  # Swin model configuration class
logger = logging.get_logger(__name__)  # module-level logger
# General docstring constants
_CONFIG_FOR_DOC = "SwinConfig"  # config class referenced in docstrings
_CHECKPOINT_FOR_DOC = "microsoft/swin-tiny-patch4-window7-224"  # checkpoint referenced in docstrings
_EXPECTED_OUTPUT_SHAPE = [1, 49, 768]  # expected output shape used in docstrings
# Image-classification docstring constants
_IMAGE_CLASS_CHECKPOINT = "microsoft/swin-tiny-patch4-window7-224"  # image-classification checkpoint
_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"  # expected image-classification output
# List of pretrained Swin model archives
TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/swin-tiny-patch4-window7-224",
    # See all Swin models at https://huggingface.co/models?filter=swin
]
# drop_path, TFSwinPatchEmbeddings, TFSwinPatchMerging and TFSwinDropPath are TensorFlow
# implementations of PyTorch functionality in the timm library.
@dataclass
class TFSwinEncoderOutput(ModelOutput):
"""
Swin 编码器的输出,可能包括隐藏状态和注意力。
"""
# 定义函数参数及其类型注解,用于接收模型的输出
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层的隐藏状态序列的张量。
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
可选参数,当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回,包含模型每一层的隐藏状态的元组。
每个张量的形状为 `(batch_size, sequence_length, hidden_size)`。
包括初始嵌入输出后每个层的模型隐藏状态。
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
可选参数,当 `output_attentions=True` 或 `config.output_attentions=True` 时返回,包含模型每个阶段的注意力权重的元组。
每个张量的形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
在注意力 softmax 后的注意力权重,用于计算自注意力头部的加权平均值。
reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
可选参数,当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回,包含模型每一层的隐藏状态的元组。
每个张量的形状为 `(batch_size, hidden_size, height, width)`。
包括初始嵌入输出后每个层的模型隐藏状态,重塑以包括空间维度。
# 定义一个基于数据类的类 TFSwinModelOutput,继承自 ModelOutput
@dataclass
class TFSwinModelOutput(ModelOutput):
"""
Swin model's outputs that also contains a pooling of the last hidden states.
Args:
last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
Average pooling of the last layer hidden-state.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
`(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
    # Output fields of the Swin model
last_hidden_state: tf.Tensor = None
pooler_output: tf.Tensor | None = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
reshaped_hidden_states: Tuple[tf.Tensor, ...] | None = None
@dataclass
class TFSwinMaskedImageModelingOutput(ModelOutput):
    """
    Swin masked image model outputs.
    Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
Masked image modeling (MLM) loss.
reconstruction (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
Reconstructed pixel values.
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
`(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
    # Output fields: loss, reconstructed pixel values, hidden states, attentions and reshaped hidden states
loss: tf.Tensor | None = None
reconstruction: tf.Tensor = None
hidden_states: Tuple[tf.Tensor, ...] | None = None
attentions: Tuple[tf.Tensor, ...] | None = None
reshaped_hidden_states: Tuple[tf.Tensor, ...] | None = None
@property
def logits(self):
        # Warn that the `logits` attribute is deprecated and will be removed in Transformers v5;
        # callers should use the `reconstruction` attribute instead.
warnings.warn(
"logits attribute is deprecated and will be removed in version 5 of Transformers."
" Please use the reconstruction attribute to retrieve the final output instead.",
FutureWarning,
)
        # Return the reconstruction as the final output
return self.reconstruction
@dataclass
class TFSwinImageClassifierOutput(ModelOutput):
"""
Swin outputs for image classification.
Args:
loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Classification (or regression if config.num_labels==1) loss.
logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
Classification (or regression if config.num_labels==1) scores (before SoftMax).
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
`(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
"""
    loss: tf.Tensor | None = None  # classification (or regression) loss, returned when `labels` is provided
    logits: tf.Tensor = None  # classification (or regression) scores before SoftMax, shape `(batch_size, config.num_labels)`
    hidden_states: Tuple[tf.Tensor, ...] | None = None  # per-layer hidden states plus the initial embeddings
    attentions: Tuple[tf.Tensor, ...] | None = None  # per-stage attention weights after the softmax
    reshaped_hidden_states: Tuple[tf.Tensor, ...] | None = None  # hidden states reshaped to include the spatial dimensions
def window_partition(input_feature: tf.Tensor, window_size: int) -> tf.Tensor:
"""
Partitions the given input into windows.
"""
    batch_size, height, width, num_channels = shape_list(input_feature)  # shape of the input feature map
    input_feature = tf.reshape(
        input_feature,
        (batch_size, height // window_size, window_size, width // window_size, window_size, num_channels),  # split H and W into windows
    )
    windows = tf.transpose(input_feature, (0, 1, 3, 2, 4, 5))  # bring the two window axes next to each other
    windows = tf.reshape(windows, (-1, window_size, window_size, num_channels))  # flatten into (num_windows * batch, ws, ws, C)
return windows
def window_reverse(windows: tf.Tensor, window_size: int, height: int, width: int) -> tf.Tensor:
"""
Merges windows to produce higher resolution features.
"""
    x = tf.shape(windows)[0]  # total number of windows across the batch
    y = tf.cast(height * width / (window_size * window_size), tf.int32)  # number of windows per image
    batch_size = tf.math.floordiv(x, y)  # recover the batch size
    # Reshape the windows back into a (batch, h-windows, w-windows, ws, ws, C) grid
    windows = tf.reshape(
        windows, (batch_size, height // window_size, width // window_size, window_size, window_size, -1)
    )
    # Transpose so the spatial axes are contiguous again
    windows = tf.transpose(windows, (0, 1, 3, 2, 4, 5))
    # Merge the window axes back into the full height and width
    windows = tf.reshape(windows, (batch_size, height, width, -1))
    # Return the merged feature map
return windows
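# Illustrative sketch (not part of the original model code): partitioning a feature map into
# non-overlapping windows with `window_partition` and merging them back with `window_reverse`
# is a lossless round trip. The helper below is hypothetical and only documents the expected shapes.
def _example_window_partition_roundtrip() -> None:
    batch_size, height, width, channels, window_size = 2, 8, 8, 3, 4
    features = tf.random.uniform((batch_size, height, width, channels))
    # (2, 8, 8, 3) -> (2 * (8 // 4) * (8 // 4), 4, 4, 3) = (8, 4, 4, 3)
    windows = window_partition(features, window_size)
    assert shape_list(windows) == [8, window_size, window_size, channels]
    # Merging the windows restores the original feature map exactly
    restored = window_reverse(windows, window_size, height, width)
    tf.debugging.assert_near(features, restored)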
def drop_path(
input: tf.Tensor, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
) -> tf.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
    # If drop_prob is 0 or we are not training, return the input unchanged
    if drop_prob == 0.0 or not training:
        return input
    # Probability of keeping a sample
    keep_prob = 1 - drop_prob
    # Shape of the input tensor
    input_shape = shape_list(input)
    # Number of dimensions
    ndim = len(input_shape)
    # Build a per-sample random tensor deciding whether each sample is kept
    shape = [input_shape[0]] + [1] * (ndim - 1)  # works with tensors of any rank, not only 2D conv nets
    random_tensor = tf.random.uniform(shape)
    # Entries <= keep_prob become 1.0 (kept), the rest become 0.0 (dropped)
    random_tensor = tf.where(random_tensor <= keep_prob, 1.0, 0.0)
    # Optionally rescale the kept samples by 1 / keep_prob so the expected value is preserved
    if keep_prob > 0.0 and scale_by_keep:
        random_tensor /= keep_prob
    # Apply the stochastic-depth mask
return input * random_tensor
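# Illustrative sketch (not part of the original model code): with `scale_by_keep=True` the surviving
# samples are divided by `keep_prob`, so the expected value of the output matches the input. The
# helper below is hypothetical and demonstrates this on a constant tensor.
def _example_drop_path_scaling() -> None:
    inputs = tf.ones((10000, 4))
    outputs = drop_path(inputs, drop_prob=0.2, training=True, scale_by_keep=True)
    # About 80% of the rows survive and are scaled by 1 / 0.8, so the global mean stays close to 1
    assert abs(float(tf.reduce_mean(outputs)) - 1.0) < 0.05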
class TFSwinEmbeddings(keras.layers.Layer):
"""
Construct the patch and position embeddings. Optionally, also the mask token.
"""
def __init__(self, config: SwinConfig, use_mask_token: bool = False, **kwargs) -> None:
super().__init__(**kwargs)
        # Patch and position embeddings
        self.patch_embeddings = TFSwinPatchEmbeddings(config, name="patch_embeddings")
        # Number of patches and patch-grid size
        self.num_patches = self.patch_embeddings.num_patches
        self.patch_grid = self.patch_embeddings.grid_size
        self.embed_dim = config.embed_dim
        self.use_mask_token = use_mask_token
        self.use_absolute_embeddings = config.use_absolute_embeddings
        # Layer normalization
        self.norm = keras.layers.LayerNormalization(name="norm", epsilon=1e-5)
        # Dropout
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
self.config = config
def build(self, input_shape: tf.TensorShape) -> None:
        # If a mask token is used, add its weight
        if self.use_mask_token:
            self.mask_token = self.add_weight(shape=(1, 1, self.embed_dim), initializer="zeros", name="mask_token")
        else:
            self.mask_token = None
        # If absolute position embeddings are used, add their weight
        if self.use_absolute_embeddings:
            self.position_embeddings = self.add_weight(
                (1, self.num_patches + 1, self.embed_dim), initializer="zeros", name="positional_embeddings"
            )
        else:
            self.position_embeddings = None
        # If already built, return early
        if self.built:
            return
        self.built = True
        # Build the patch-embedding, layer-norm and dropout sub-layers
if getattr(self, "patch_embeddings", None) is not None:
with tf.name_scope(self.patch_embeddings.name):
self.patch_embeddings.build(None)
if getattr(self, "norm", None) is not None:
with tf.name_scope(self.norm.name):
self.norm.build([None, None, self.config.embed_dim])
if getattr(self, "dropout", None) is not None:
with tf.name_scope(self.dropout.name):
self.dropout.build(None)
    def call(
        self, pixel_values: tf.Tensor, bool_masked_pos: bool = None, training: bool = False
    ) -> Tuple[tf.Tensor, Tuple[int, int]]:
        # Compute the patch embeddings and the resulting spatial dimensions
        embeddings, output_dimensions = self.patch_embeddings(pixel_values, training=training)
        # Normalize the embeddings
        embeddings = self.norm(embeddings, training=training)
        # Shape of the embeddings
        batch_size, seq_len, _ = shape_list(embeddings)
        # If masked positions are provided
        if bool_masked_pos is not None:
            # Broadcast the mask token to the full sequence
            mask_tokens = tf.repeat(self.mask_token, batch_size, 0)
            mask_tokens = tf.repeat(mask_tokens, seq_len, 1)
            # Replace the embeddings at masked positions with the mask token
            mask = tf.expand_dims(bool_masked_pos, -1)
            mask = tf.cast(mask, mask_tokens.dtype)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
        # Add absolute position embeddings if they exist
        if self.position_embeddings is not None:
            embeddings = embeddings + self.position_embeddings
        # Apply dropout
        embeddings = self.dropout(embeddings, training=training)
        # Return the embeddings and the spatial output dimensions
return embeddings, output_dimensions
class TFSwinPatchEmbeddings(keras.layers.Layer):
"""
Image to Patch Embedding.
"""
def __init__(self, config, **kwargs):
super().__init__(**kwargs)
        # Image size and patch size from the config
        image_size, patch_size = config.image_size, config.patch_size
        # Number of channels and embedding dimension
        num_channels, hidden_size = config.num_channels, config.embed_dim
        # Normalize image size and patch size to (height, width) tuples
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        # Number of patches
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        # Store attributes
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        # Projection layer: a Conv2D mapping each patch to the hidden dimension
self.projection = keras.layers.Conv2D(
filters=hidden_size,
kernel_size=self.patch_size,
strides=self.patch_size,
padding="valid",
name="projection",
)
def maybe_pad(self, pixel_values: tf.Tensor, height: int, width: int) -> tf.Tensor:
        # Pad the width if it is not a multiple of the patch width
        if width % self.patch_size[1] != 0:
            pad_values = ((0, 0), (0, 0), (0, 0), (0, self.patch_size[1] - width % self.patch_size[1]))
            pixel_values = tf.pad(pixel_values, pad_values)
        # Pad the height if it is not a multiple of the patch height
if height % self.patch_size[0] != 0:
pad_values = ((0, 0), (0, 0), (0, self.patch_size[0] - height % self.patch_size[0]), (0, 0))
pixel_values = tf.pad(pixel_values, pad_values)
return pixel_values
def call(self, pixel_values: tf.Tensor, training: bool = False) -> Tuple[tf.Tensor, Tuple[int, int]]:
        # Shape of the input tensor
        _, num_channels, height, width = shape_list(pixel_values)
        # In eager mode, check that the channel dimension matches the configuration
        if tf.executing_eagerly() and num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # Pad the input if needed so it is divisible by self.patch_size
        pixel_values = self.maybe_pad(pixel_values, height, width)
        # B, C, H, W -> B, H, W, C
        pixel_values = tf.transpose(pixel_values, (0, 2, 3, 1))
        # Project the patches into the hidden dimension
        embeddings = self.projection(pixel_values, training=training)
        # B, H, W, C -> B, C, H, W
        embeddings = tf.transpose(embeddings, (0, 3, 1, 2))
        # Shape of the projected tensor
        batch_size, channels, height, width = shape_list(embeddings)
        output_dimensions = (height, width)
        # Reshape to B, N, C where N is the number of patches
embeddings = tf.reshape(embeddings, (batch_size, channels, -1))
embeddings = tf.transpose(embeddings, (0, 2, 1))
return embeddings, output_dimensions
    # Build the layer; return early if it was already built
def build(self, input_shape=None):
if self.built:
return
        # Mark the layer as built
        self.built = True
        # If the projection layer exists, build it under its name scope
        if getattr(self, "projection", None) is not None:
            with tf.name_scope(self.projection.name):
                # The projection expects inputs with `num_channels` in the last dimension
self.projection.build([None, None, None, self.num_channels])
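# Illustrative sketch (not part of the original model code): assuming the default SwinConfig
# (224x224 images, 4x4 patches, embed_dim=96), the patch embedding yields a 56x56 grid, i.e.
# 3136 tokens of dimension 96. The helper below is hypothetical and only checks those shapes.
def _example_patch_embedding_shapes() -> None:
    config = SwinConfig()
    layer = TFSwinPatchEmbeddings(config)
    pixel_values = tf.random.uniform((1, config.num_channels, config.image_size, config.image_size))
    embeddings, (height, width) = layer(pixel_values)
    # 224 // 4 = 56 along both spatial axes
    assert (height, width) == (56, 56)
    assert shape_list(embeddings) == [1, 56 * 56, config.embed_dim]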
class TFSwinPatchMerging(keras.layers.Layer):
"""
Patch Merging Layer.
Args:
input_resolution (`Tuple[int]`):
Resolution of input feature.
dim (`int`):
Number of input channels.
norm_layer (`keras.layer.Layer`, *optional*, defaults to `keras.layers.LayerNormalization`):
Normalization layer class.
"""
def __init__(
self, input_resolution: Tuple[int, int], dim: int, norm_layer: Optional[Callable] = None, **kwargs
) -> None:
super().__init__(**kwargs)
        self.input_resolution = input_resolution  # resolution of the input feature map
        self.dim = dim  # number of input channels
        self.reduction = keras.layers.Dense(2 * dim, use_bias=False, name="reduction")  # linear layer reducing 4*dim -> 2*dim
        if norm_layer is None:
            # Default to LayerNormalization with the same epsilon as the PyTorch implementation
            self.norm = keras.layers.LayerNormalization(epsilon=1e-5, name="norm")
        else:
            self.norm = norm_layer(name="norm")  # use the provided normalization layer
def maybe_pad(self, input_feature: tf.Tensor, height: int, width: int) -> tf.Tensor:
should_pad = (height % 2 == 1) or (width % 2 == 1)
if should_pad:
            pad_values = ((0, 0), (0, height % 2), (0, width % 2), (0, 0))  # pad odd height/width up to even
            input_feature = tf.pad(input_feature, pad_values)  # apply the padding
return input_feature
def call(self, input_feature: tf.Tensor, input_dimensions: Tuple[int, int], training: bool = False) -> tf.Tensor:
height, width = input_dimensions
        batch_size, _, num_channels = shape_list(input_feature)  # shape of the input feature
        input_feature = tf.reshape(input_feature, (batch_size, height, width, num_channels))  # reshape to a 4D tensor
        input_feature = self.maybe_pad(input_feature, height, width)  # pad so height and width are even
        input_feature_0 = input_feature[:, 0::2, 0::2, :]  # top-left pixels of each 2x2 block
        input_feature_1 = input_feature[:, 1::2, 0::2, :]  # bottom-left pixels of each 2x2 block
        input_feature_2 = input_feature[:, 0::2, 1::2, :]  # top-right pixels of each 2x2 block
        input_feature_3 = input_feature[:, 1::2, 1::2, :]  # bottom-right pixels of each 2x2 block
        input_feature = tf.concat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)  # concatenate along channels
        input_feature = tf.reshape(
            input_feature, (batch_size, -1, 4 * num_channels)
        )  # flatten back to (batch, tokens, 4 * C)
        input_feature = self.norm(input_feature, training=training)  # normalize
        input_feature = self.reduction(input_feature, training=training)  # project 4 * C down to 2 * C
return input_feature
    # Build the layer; return early if it was already built
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # Build the reduction layer under its name scope with input shape [None, None, 4 * dim]
        if getattr(self, "reduction", None) is not None:
            with tf.name_scope(self.reduction.name):
                self.reduction.build([None, None, 4 * self.dim])
        # Build the norm layer under its name scope with input shape [None, None, 4 * dim]
        if getattr(self, "norm", None) is not None:
            with tf.name_scope(self.norm.name):
self.norm.build([None, None, 4 * self.dim])
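# Illustrative sketch (not part of the original model code): patch merging concatenates every 2x2
# neighbourhood (4 * C channels) and projects it down to 2 * C, halving the spatial resolution.
# The helper below is hypothetical and only checks the resulting token count and channel size.
def _example_patch_merging_shapes() -> None:
    height, width, dim = 8, 8, 16
    merging = TFSwinPatchMerging(input_resolution=(height, width), dim=dim)
    tokens = tf.random.uniform((1, height * width, dim))
    merged = merging(tokens, (height, width))
    # 64 tokens with 16 channels become 16 tokens with 32 channels
    assert shape_list(merged) == [1, (height // 2) * (width // 2), 2 * dim]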
class TFSwinDropPath(keras.layers.Layer):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: float = None, scale_by_keep: bool = True, **kwargs) -> None:
super(TFSwinDropPath, self).__init__(**kwargs)
        self.drop_prob = drop_prob  # drop probability
        self.scale_by_keep = scale_by_keep  # whether to rescale the kept samples by 1 / keep_prob
    def call(self, input: tf.Tensor, training: bool = False) -> tf.Tensor:
        # Delegate to the functional drop_path implementation
return drop_path(input, self.drop_prob, training, self.scale_by_keep)
class TFSwinSelfAttention(keras.layers.Layer):
def __init__(self, config: SwinConfig, dim: int, num_heads: int, **kwargs) -> None:
super().__init__(**kwargs)
if dim % num_heads != 0:
raise ValueError(
f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
)
        self.num_attention_heads = num_heads  # number of attention heads
        self.attention_head_size = int(dim / num_heads)  # size of each attention head
        self.all_head_size = self.num_attention_heads * self.attention_head_size  # total Q/K/V size
        window_size = config.window_size
        self.window_size = (
            window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
        )  # window size as a (height, width) tuple
self.query = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=config.qkv_bias,
name="query",
        )  # dense projection for the queries
self.key = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=config.qkv_bias,
name="key",
        )  # dense projection for the keys
self.value = keras.layers.Dense(
self.all_head_size,
kernel_initializer=get_initializer(config.initializer_range),
use_bias=config.qkv_bias,
name="value",
        )  # dense projection for the values
        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)  # dropout on the attention probabilities
def build(self, input_shape: tf.TensorShape) -> None:
        # Weight holding the relative position bias table
self.relative_position_bias_table = self.add_weight(
shape=(((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1)), self.num_attention_heads),
initializer="zeros",
name="relative_position_bias_table",
)
        # Non-trainable weight holding the relative position index of every token pair in the window
self.relative_position_index = self.add_weight(
shape=(self.window_size[0] ** 2, self.window_size[1] ** 2),
trainable=False,
dtype=tf.int32,
name="relative_position_index",
)
        # Compute the pairwise relative position index of the tokens inside the window
coords_h = tf.range(self.window_size[0])
coords_w = tf.range(self.window_size[1])
coords = tf.stack(tf.meshgrid(coords_h, coords_w, indexing="ij"))
coords_flatten = tf.reshape(coords, (shape_list(coords)[0], -1))
relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
relative_coords = tf.transpose(relative_coords, (1, 2, 0))
stack_0, stack_1 = tf.unstack(relative_coords, axis=2)
stack_0 += self.window_size[0] - 1
stack_0 *= 2 * self.window_size[1] - 1
stack_1 += self.window_size[1] - 1
relative_coords = tf.stack([stack_0, stack_1], axis=2)
        # Sum the two offset components and assign the result to the relative position index
self.relative_position_index.assign(tf.cast(tf.reduce_sum(relative_coords, axis=-1), tf.int32))
        # If already built, return early
        if self.built:
            return
        # Mark the layer as built
        self.built = True
        # Build the query, key and value projections if they exist
if getattr(self, "query", None) is not None:
with tf.name_scope(self.query.name):
self.query.build([None, None, self.all_head_size])
if getattr(self, "key", None) is not None:
with tf.name_scope(self.key.name):
self.key.build([None, None, self.all_head_size])
if getattr(self, "value", None) is not None:
with tf.name_scope(self.value.name):
self.value.build([None, None, self.all_head_size])
def transpose_for_scores(self, x: tf.Tensor) -> tf.Tensor:
        # Reshape so the head dimension is separate, for the attention score computation
new_x_shape = shape_list(x)[:-1] + [self.num_attention_heads, self.attention_head_size]
x = tf.reshape(x, new_x_shape)
return tf.transpose(x, (0, 2, 1, 3))
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: bool = False,
training: bool = False,
) -> Tuple[tf.Tensor, ...]:
        # Shape of the hidden states: batch size and sequence length
        batch_size, dim, _ = shape_list(hidden_states)
        # Project the hidden states into the query space
        mixed_query_layer = self.query(hidden_states)
        # Project into the key space and reshape for the attention score computation
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        # Project into the value space and reshape for the attention score computation
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        # Reshape the query projection the same way
        query_layer = self.transpose_for_scores(mixed_query_layer)
        # Raw attention scores: dot product between queries and keys
        attention_scores = tf.matmul(query_layer, tf.transpose(key_layer, (0, 1, 3, 2)))
        # Scale the scores to keep the softmax numerically well behaved
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        # Gather the relative position bias using the precomputed index
        relative_position_bias = tf.gather(
            self.relative_position_bias_table, tf.reshape(self.relative_position_index, (-1,))
        )
        # Reshape the bias to match the attention score layout
        relative_position_bias = tf.reshape(
            relative_position_bias,
            (self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1),
        )
        # Move the head dimension first so the bias can be added to the scores
        relative_position_bias = tf.transpose(relative_position_bias, (2, 0, 1))
        attention_scores = attention_scores + tf.expand_dims(relative_position_bias, 0)
        # Apply the attention mask if one is provided
        if attention_mask is not None:
            # Number of mask windows
            mask_shape = shape_list(attention_mask)[0]
            # Reshape the scores so the mask can be broadcast over them
            attention_scores = tf.reshape(
                attention_scores, (batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim)
            )
            # Expand the mask to match the score dimensions
            attention_mask = tf.expand_dims(attention_mask, 1)
            attention_mask = tf.expand_dims(attention_mask, 0)
            # Add the mask to the scores
            attention_scores = attention_scores + attention_mask
            # Collapse the window dimension back
            attention_scores = tf.reshape(attention_scores, (-1, self.num_attention_heads, dim, dim))
        # Normalize the scores into attention probabilities
        attention_probs = tf.nn.softmax(attention_scores, axis=-1)
        # Apply dropout to the attention probabilities (only active during training)
        attention_probs = self.dropout(attention_probs, training=training)
        # Apply the head mask if one is provided
        if head_mask is not None:
            attention_probs = attention_probs * head_mask
        # Context layer: weighted sum of the values
        context_layer = tf.matmul(attention_probs, value_layer)
        # Move the head dimension back next to the head size
        context_layer = tf.transpose(context_layer, (0, 2, 1, 3))
        # Merge the heads back into a single hidden dimension
        new_context_layer_shape = shape_list(context_layer)[:-2] + [
            self.all_head_size,
        ]
        context_layer = tf.reshape(context_layer, new_context_layer_shape)
        # Outputs: the context layer and, optionally, the attention probabilities
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
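# Illustrative sketch (not part of the original model code): for a 2x2 window the relative position
# bias table has (2 * 2 - 1) * (2 * 2 - 1) = 9 entries, and every ordered pair of the 4 window tokens
# is mapped to one of them. The hypothetical helper below mirrors the index construction in
# TFSwinSelfAttention.build for that small window.
def _example_relative_position_index(window_size: Tuple[int, int] = (2, 2)) -> tf.Tensor:
    coords_h = tf.range(window_size[0])
    coords_w = tf.range(window_size[1])
    coords = tf.stack(tf.meshgrid(coords_h, coords_w, indexing="ij"))
    coords_flatten = tf.reshape(coords, (2, -1))
    relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
    relative_coords = tf.transpose(relative_coords, (1, 2, 0))
    # Shift the offsets to be non-negative and flatten the 2D offset into a single table index
    shifted = relative_coords + [window_size[0] - 1, window_size[1] - 1]
    index = shifted[:, :, 0] * (2 * window_size[1] - 1) + shifted[:, :, 1]
    # index has shape (4, 4) with values in [0, 8]; the zero offset (diagonal) maps to index 4
    return index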
# TFSwinSelfOutput: projection and dropout applied to the self-attention output
class TFSwinSelfOutput(keras.layers.Layer):
    # Constructor taking the SwinConfig, the channel dimension and extra keyword arguments
    def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None:
        super().__init__(**kwargs)
        # Dense layer projecting back to `dim`
        self.dense = keras.layers.Dense(dim, name="dense")
        # Dropout with the configured probability
        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob, name="dropout")
        self.dim = dim
    # Forward pass: dense projection followed by dropout
    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        # Linear projection
        hidden_states = self.dense(hidden_states)
        # Dropout
        hidden_states = self.dropout(hidden_states, training=training)
return hidden_states
    # Build the sub-layers
    def build(self, input_shape=None):
        # Return early if already built
        if self.built:
            return
        self.built = True
        # Build the dense layer if it exists
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.dim])
        # Build the dropout layer if it exists
        if getattr(self, "dropout", None) is not None:
            with tf.name_scope(self.dropout.name):
self.dropout.build(None)
# TFSwinAttention: windowed self-attention followed by the output projection
class TFSwinAttention(keras.layers.Layer):
    # Constructor taking the SwinConfig, channel dimension and number of attention heads
    def __init__(self, config: SwinConfig, dim: int, num_heads: int, **kwargs) -> None:
        super().__init__(**kwargs)
        # Self-attention layer
        self.self = TFSwinSelfAttention(config, dim, num_heads, name="self")
        # Output projection applied to the self-attention result
        self.self_output = TFSwinSelfOutput(config, dim, name="output")
        # Set of pruned attention heads (pruning is not implemented)
        self.pruned_heads = set()
    # Head pruning is not implemented for the TF model
def prune_heads(self, heads):
"""
Prunes heads of the model. See base class PreTrainedModel heads: dict of {layer_num: list of heads to prune in
this layer}
"""
raise NotImplementedError
    # Forward pass through the self-attention layer and the output projection;
    # `output_attentions` controls whether the attention probabilities are returned
def call(
self,
hidden_states: tf.Tensor,
attention_mask: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: bool = False,
training: bool = False,
) -> tf.Tensor:
        # Run windowed self-attention
        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions, training=training)
        # Project the attention output
        attention_output = self.self_output(self_outputs[0], hidden_states, training=training)
        # Assemble the outputs, optionally including the attention probabilities
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
    # Build the sub-layers
    def build(self, input_shape=None):
        # Return early if already built
        if self.built:
            return
        self.built = True
        # Build the self-attention layer if it exists
        if getattr(self, "self", None) is not None:
            with tf.name_scope(self.self.name):
                self.self.build(None)
        # Build the output projection if it exists
        if getattr(self, "self_output", None) is not None:
            with tf.name_scope(self.self_output.name):
self.self_output.build(None)
# TFSwinIntermediate: the expansion half of the MLP block (dense layer + activation)
class TFSwinIntermediate(keras.layers.Layer):
    # Constructor
    def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None:
        super().__init__(**kwargs)
        # Dense layer expanding the hidden size to mlp_ratio * dim
        self.dense = keras.layers.Dense(int(config.mlp_ratio * dim), name="dense")
        # Resolve the intermediate activation function from the config
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act
        # Keep the input dimension for build()
        self.dim = dim
    # Forward pass: dense expansion followed by the activation
    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states
    # Build the dense layer if it has not been built yet
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                # The dense layer consumes inputs of shape [None, None, dim]
                self.dense.build([None, None, self.dim])
# TFSwinOutput: the contraction half of the MLP block (dense layer + dropout)
class TFSwinOutput(keras.layers.Layer):
    # Constructor
    def __init__(self, config: SwinConfig, dim: int, **kwargs) -> None:
        super().__init__(**kwargs)
        # Dense layer projecting back down to `dim`
        self.dense = keras.layers.Dense(dim, name="dense")
        # Dropout with the configured hidden dropout probability
        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
        # Keep the config and the dimension for build()
        self.config = config
        self.dim = dim
    # Forward pass: dense projection followed by dropout
    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states, training=training)
        return hidden_states
    # Build the dense layer if it has not been built yet
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            # The dense layer consumes the expanded hidden size, mlp_ratio * dim
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, int(self.config.mlp_ratio * self.dim)])
# TFSwinLayer: one Swin block (windowed attention + MLP), optionally with a shifted window
class TFSwinLayer(keras.layers.Layer):
    # Constructor taking the config, channel dimension, input resolution, number of heads and shift size
    def __init__(
        self, config, dim, input_resolution: Tuple[int, int], num_heads: int, shift_size: int = 0, **kwargs
    ) -> None:
        super().__init__(**kwargs)
        # Feed-forward chunk size from the config
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        # Smallest side of the input resolution
        min_res = tf.reduce_min(input_resolution)
        # Clamp the window size to the input resolution
        self.window_size = min_res if min_res <= config.window_size else config.window_size
        # Disable the shift when a single window covers the whole input
        self.shift_size = 0 if min_res <= self.window_size else shift_size
        # Keep the input resolution
        self.input_resolution = input_resolution
        # Layer norm applied before the attention block
        self.layernorm_before = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_before")
        # Windowed (shifted) self-attention
        self.attention = TFSwinAttention(config, dim, num_heads, name="attention")
        # Stochastic depth if drop_path_rate > 0, otherwise an identity (linear) activation
        self.drop_path = (
            TFSwinDropPath(config.drop_path_rate, name="drop_path")
            if config.drop_path_rate > 0.0
            else keras.layers.Activation("linear", name="drop_path")
        )
        # Layer norm applied before the MLP block
        self.layernorm_after = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm_after")
        # MLP expansion
        self.intermediate = TFSwinIntermediate(config, dim, name="intermediate")
        # MLP contraction
        self.swin_output = TFSwinOutput(config, dim, name="output")
        # Keep the channel dimension
        self.dim = dim
def get_attn_mask(self, height: int, width: int, window_size: int, shift_size: int) -> tf.Tensor | None:
        # Start from an all-zero image mask of shape (height, width)
        img_mask = tf.zeros((height, width))
        # Slices over height and width used to assign window-group ids for the shifted windows
        height_slices = ((0, -window_size), (-window_size, -shift_size), (-shift_size, -1))
        width_slices = ((0, -window_size), (-window_size, -shift_size), (-shift_size, -1))
        # Compute the attention mask for SW-MSA (only needed when the windows are shifted)
        if shift_size > 0:
            count = 0
            for height_slice in height_slices:
                for width_slice in width_slices:
                    # Indices covered by the current (height, width) slice
                    height_inds = tf.range(height_slice[0] % height, height_slice[1] % height + 1)
                    width_inds = tf.range(width_slice[0] % width, width_slice[1] % width + 1)
                    indices = tf.reshape(tf.stack(tf.meshgrid(height_inds, width_inds), axis=-1), (-1, 2))
                    if len(indices) >= 1:
                        # Write the group id `count` into the image mask at those positions
                        updates = tf.ones((len(indices),), dtype=img_mask.dtype) * count
                        img_mask = tf.tensor_scatter_nd_update(img_mask, indices, updates)
                    count += 1
        # Add channel and batch dimensions
        img_mask = tf.expand_dims(img_mask, -1)
        img_mask = tf.expand_dims(img_mask, 0)
        # Partition the image mask into windows, matching the attention computation
        mask_windows = window_partition(img_mask, window_size)
        mask_windows = tf.reshape(mask_windows, (-1, window_size * window_size))
        # Token pairs from different groups get -100.0 (masked out), pairs from the same group get 0.0
        attn_mask = tf.expand_dims(mask_windows, 1) - tf.expand_dims(mask_windows, 2)
        attn_mask = tf.where(attn_mask != 0, float(-100.0), attn_mask)
        attn_mask = tf.where(attn_mask == 0, float(0.0), attn_mask)
return attn_mask
def maybe_pad(
self, hidden_states: tf.Tensor, window_size: int, height: int, width: int
) -> Tuple[tf.Tensor, tf.Tensor]:
        # Number of pixels to pad on the right and at the bottom
        pad_right = (window_size - width % window_size) % window_size
        pad_bottom = (window_size - height % window_size) % window_size
        # Padding specification: pad only the bottom and right edges
        pad_values = [[0, 0], [0, pad_bottom], [0, pad_right], [0, 0]]
        # Apply the padding to the hidden states
        hidden_states = tf.pad(hidden_states, pad_values)
        # Flatten the padding specification so it can be inspected later
        pad_values = tf.reshape(pad_values, (-1,))
return hidden_states, pad_values
    def call(
        self,
        hidden_states: tf.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: tf.Tensor | None = None,
        output_attentions: bool = False,
        training: bool = False,
    ) -> tf.Tensor:
        # If the window size is larger than the input resolution, do not partition into windows
        min_res = tf.reduce_min(input_dimensions)  # smallest side of the input
        shift_size = 0 if min_res <= self.window_size else self.shift_size  # no shift when a single window covers the input
        window_size = min_res if min_res <= self.window_size else self.window_size  # effective window size
        height, width = input_dimensions  # unpack the spatial dimensions
        batch_size, _, channels = shape_list(hidden_states)  # batch size and channel count of the hidden states
        shortcut = hidden_states  # keep the input for the residual connection
        hidden_states = self.layernorm_before(hidden_states, training=training)  # pre-attention layer norm
        hidden_states = tf.reshape(hidden_states, (batch_size, height, width, channels))  # restore the spatial layout
        hidden_states, pad_values = self.maybe_pad(hidden_states, window_size, height, width)  # pad to a multiple of the window size
        _, height_pad, width_pad, _ = shape_list(hidden_states)  # padded spatial dimensions
        # Cyclic shift
        if shift_size > 0:
            shifted_hidden_states = tf.roll(hidden_states, shift=(-shift_size, -shift_size), axis=(1, 2))  # shift up and left
        else:
            shifted_hidden_states = hidden_states  # no shift
        # Partition into windows
        hidden_states_windows = window_partition(shifted_hidden_states, window_size)  # split the shifted feature map into windows
        hidden_states_windows = tf.reshape(hidden_states_windows, (-1, window_size * window_size, channels))  # flatten each window into a token sequence
        attn_mask = self.get_attn_mask(
            height=height_pad, width=width_pad, window_size=window_size, shift_size=shift_size
        )  # attention mask for the shifted windows
        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions, training=training
        )  # windowed self-attention
        attention_output = attention_outputs[0]  # attention output (first element)
        attention_windows = tf.reshape(attention_output, (-1, window_size, window_size, channels))  # restore the window layout
        shifted_windows = window_reverse(attention_windows, window_size, height_pad, width_pad)  # merge the windows back
        # Reverse the cyclic shift
        if shift_size > 0:
            attention_windows = tf.roll(shifted_windows, shift=(shift_size, shift_size), axis=(1, 2))  # shift back down and right
        else:
            attention_windows = shifted_windows  # no shift to undo
        was_padded = pad_values[3] > 0 or pad_values[5] > 0  # whether padding was applied
        if was_padded:
            attention_windows = attention_windows[:, :height, :width, :]  # crop back to the original size
        attention_windows = tf.reshape(attention_windows, (batch_size, height * width, channels))  # back to (batch, tokens, channels)
        hidden_states = shortcut + self.drop_path(attention_windows, training=training)  # residual connection with stochastic depth
        layer_output = self.layernorm_after(hidden_states, training=training)  # pre-MLP layer norm
        layer_output = self.intermediate(layer_output)  # MLP expansion
        layer_output = hidden_states + self.swin_output(layer_output, training=training)  # MLP contraction plus residual
        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)  # assemble outputs
        return layer_outputs  # return the layer outputs
    # Build all sub-layers of the block
    def build(self, input_shape=None):
        # Return early if already built
        if self.built:
            return
        self.built = True
        # layernorm_before consumes inputs of shape [None, None, dim]
        if getattr(self, "layernorm_before", None) is not None:
            with tf.name_scope(self.layernorm_before.name):
                self.layernorm_before.build([None, None, self.dim])
        # attention builds its own sub-layers
        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)
        # drop_path has no shape-dependent weights
        if getattr(self, "drop_path", None) is not None:
            with tf.name_scope(self.drop_path.name):
                self.drop_path.build(None)
        # layernorm_after consumes inputs of shape [None, None, dim]
        if getattr(self, "layernorm_after", None) is not None:
            with tf.name_scope(self.layernorm_after.name):
                self.layernorm_after.build([None, None, self.dim])
        # intermediate (MLP expansion)
        if getattr(self, "intermediate", None) is not None:
            with tf.name_scope(self.intermediate.name):
                self.intermediate.build(None)
        # swin_output (MLP contraction)
        if getattr(self, "swin_output", None) is not None:
            with tf.name_scope(self.swin_output.name):
self.swin_output.build(None)
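# Illustrative sketch (not part of the original model code): a single TFSwinLayer keeps the token
# count and channel dimension unchanged; only the attention pattern differs between the regular
# (shift_size=0) and shifted-window variants. The helper below is hypothetical and assumes the
# default SwinConfig (window_size=7, embed_dim=96).
def _example_swin_layer_shapes() -> None:
    config = SwinConfig()
    dim, resolution = config.embed_dim, (14, 14)
    layer = TFSwinLayer(config, dim=dim, input_resolution=resolution, num_heads=4, shift_size=3)
    tokens = tf.random.uniform((1, resolution[0] * resolution[1], dim))
    (output,) = layer(tokens, resolution)
    # 196 tokens of dimension 96 go in, 196 tokens of dimension 96 come out
    assert shape_list(output) == [1, resolution[0] * resolution[1], dim]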
class TFSwinStage(keras.layers.Layer):
    # One stage of the Swin encoder: `depth` Swin blocks at a fixed resolution, optionally followed by patch merging
def __init__(
self,
config: SwinConfig,
dim: int,
input_resolution: Tuple[int, int],
depth: int,
num_heads: int,
drop_path: List[float],
downsample: Optional[Callable],
**kwargs,
) -> None:
super().__init__(**kwargs)
        # Store the config and channel dimension, then build the blocks of this stage
        self.config = config
        self.dim = dim
        # One TFSwinLayer per block; odd blocks use a shifted window
self.blocks = [
TFSwinLayer(
config=config,
dim=dim,
input_resolution=input_resolution,
num_heads=num_heads,
shift_size=0 if (i % 2 == 0) else config.window_size // 2,
name=f"blocks.{i}",
)
for i in range(depth)
]
        # If a downsample callable is given, create the patch-merging layer
if downsample is not None:
self.downsample = downsample(
input_resolution,
dim=dim,
norm_layer=partial(keras.layers.LayerNormalization, epsilon=1e-5),
name="downsample",
)
else:
self.downsample = None
        # Pointer flag (unused), kept for parity with the PyTorch implementation
        self.pointing = False
    # Forward pass through all blocks of the stage and the optional downsampling
def call(
self,
hidden_states: tf.Tensor,
input_dimensions: Tuple[int, int],
head_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = False,
training: bool = False,
) -> Tuple[tf.Tensor, ...]:
height, width = input_dimensions
        # Run every block of the stage in sequence
        for i, layer_module in enumerate(self.blocks):
            layer_head_mask = head_mask[i] if head_mask is not None else None
            # Forward pass through the current block
            layer_outputs = layer_module(
                hidden_states, input_dimensions, layer_head_mask, output_attentions, training=training
            )
            # The block's first output becomes the new hidden states
            hidden_states = layer_outputs[0]
        # Apply patch merging if this stage has a downsample layer
        if self.downsample is not None:
            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
            output_dimensions = (height, width, height_downsampled, width_downsampled)
            hidden_states = self.downsample(layer_outputs[0], input_dimensions, training=training)
        else:
            output_dimensions = (height, width, height, width)
        # Stage outputs: hidden states and the (input, output) spatial dimensions
        stage_outputs = (hidden_states, output_dimensions)
        # Optionally append the attention weights
        if output_attentions:
            stage_outputs += layer_outputs[1:]
return stage_outputs
    # Build the stage's sub-layers on first use
    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        # Build the downsample layer if present
        if getattr(self, "downsample", None) is not None:
            with tf.name_scope(self.downsample.name):
                self.downsample.build(None)
        # Build every block of the stage
        if getattr(self, "blocks", None) is not None:
            for layer in self.blocks:
                with tf.name_scope(layer.name):
layer.build(None)
class TFSwinEncoder(keras.layers.Layer):
    # TFSwinEncoder: the full stack of Swin stages
    # Constructor taking the Swin config and the patch-grid size
def __init__(self, config: SwinConfig, grid_size: Tuple[int, int], **kwargs):
        # Initialize the parent class
        super().__init__(**kwargs)
        # Number of stages
        self.num_layers = len(config.depths)
        # Keep the config
        self.config = config
        # Per-block stochastic-depth rates, increasing linearly up to drop_path_rate
        dpr = list((tf.linspace(0, 1, sum(config.depths)) * config.drop_path_rate).numpy())
        # Create the stages
        self.layers = [
            TFSwinStage(
                config=config,
                # Channel dimension of this stage (doubles every stage)
                dim=int(config.embed_dim * 2**i_layer),
                # Input resolution of this stage (halves every stage)
                input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
                # Number of blocks in this stage
                depth=config.depths[i_layer],
                # Number of attention heads in this stage
                num_heads=config.num_heads[i_layer],
                # Stochastic-depth rates for the blocks of this stage
                drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                # Patch merging after every stage except the last
                downsample=TFSwinPatchMerging if (i_layer < self.num_layers - 1) else None,
                # Name of the stage
                name=f"layers.{i_layer}",
            )
            # One stage per entry in config.depths
            for i_layer in range(self.num_layers)
        ]
        # Gradient checkpointing is disabled by default
        self.gradient_checkpointing = False
    # Forward pass through all stages
def call(
self,
hidden_states: tf.Tensor,
input_dimensions: Tuple[int, int],
head_mask: tf.Tensor | None = None,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
training: bool = False,
) -> Union[Tuple[tf.Tensor, ...], TFSwinEncoderOutput]:
        # Collect the spatial dimensions seen by each stage
        all_input_dimensions = ()
        # Tuples for the optional outputs, or None when they are not requested
        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        if output_hidden_states:
            batch_size, _, hidden_size = shape_list(hidden_states)
            # Rearrange b (h w) c -> b c h w so the hidden states carry explicit spatial dimensions
            reshaped_hidden_state = tf.reshape(hidden_states, (batch_size, *input_dimensions, hidden_size))
            reshaped_hidden_state = tf.transpose(reshaped_hidden_state, (0, 3, 1, 2))
            # Record the embedding output in both layouts
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (reshaped_hidden_state,)
        # Run the stages in sequence
        for i, layer_module in enumerate(self.layers):
            # Head mask for the current stage, if any
            layer_head_mask = head_mask[i] if head_mask is not None else None
            # Forward pass through the stage
            layer_outputs = layer_module(
                hidden_states, input_dimensions, layer_head_mask, output_attentions, training=training
            )
            # The stage's first output becomes the new hidden states
            hidden_states = layer_outputs[0]
            # The stage also reports its (input, output) spatial dimensions
            output_dimensions = layer_outputs[1]
            # The downsampled height and width become the next stage's input dimensions
            input_dimensions = (output_dimensions[-2], output_dimensions[-1])
            all_input_dimensions += (input_dimensions,)
            if output_hidden_states:
                batch_size, _, hidden_size = shape_list(hidden_states)
                # Rearrange b (h w) c -> b c h w, as above
                reshaped_hidden_state = tf.reshape(hidden_states, (batch_size, *input_dimensions, hidden_size))
                reshaped_hidden_state = tf.transpose(reshaped_hidden_state, (0, 3, 1, 2))
                # Record the stage output in both layouts
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)
            if output_attentions:
                # Collect the stage's attention weights
                all_self_attentions += layer_outputs[2:]
        if not return_dict:
            # Return a plain tuple containing the non-None outputs
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        # Otherwise wrap the outputs in a TFSwinEncoderOutput
        return TFSwinEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            reshaped_hidden_states=all_reshaped_hidden_states,
        )
    # Build every stage of the encoder
    def build(self, input_shape=None):
        # Return early if already built
        if self.built:
            return
        self.built = True
        # Build each stage under its own name scope
        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    layer.build(None)
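# Illustrative sketch (not part of the original model code): with the default SwinConfig
# (embed_dim=96, depths=[2, 2, 6, 2], 224x224 inputs) the encoder doubles the channel dimension and
# halves the patch grid at every stage, ending with 768 channels on a 7x7 grid, which matches
# _EXPECTED_OUTPUT_SHAPE = [1, 49, 768]. The helper below is hypothetical and only prints the
# per-stage dimensions, computed exactly as in TFSwinEncoder.__init__.
def _example_encoder_stage_dimensions() -> None:
    config = SwinConfig()
    grid = (config.image_size // config.patch_size, config.image_size // config.patch_size)
    for i_layer in range(len(config.depths)):
        dim = int(config.embed_dim * 2**i_layer)
        resolution = (grid[0] // (2**i_layer), grid[1] // (2**i_layer))
        print(f"stage {i_layer}: dim={dim}, input_resolution={resolution}")
    # stage 0: dim=96,  input_resolution=(56, 56)
    # stage 3: dim=768, input_resolution=(7, 7)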
class TFSwinPreTrainedModel(TFPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
    # Configuration class used by this model
    config_class = SwinConfig
    # Prefix of the base model inside the full model
    base_model_prefix = "swin"
    # Name of the main input
    main_input_name = "pixel_values"
SWIN_START_DOCSTRING = r"""
This model is a Tensorflow
[keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) sub-class. Use it as a
regular Tensorflow Module and refer to the Tensorflow documentation for all matter related to general usage and
behavior.
Parameters:
config ([`SwinConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
SWIN_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
for details.
head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
def normalize_data_format(value: str) -> str:
"""
From tensorflow addons
https://github.com/tensorflow/addons/blob/8cec33fcaaf1cf90aec7bdd55a0fcdbb251ce5c2/tensorflow_addons/utils/keras_utils.py#L71
"""
    # Fall back to the Keras backend image data format when no value is given
    if value is None:
        value = keras.backend.image_data_format()
    # Normalize to lower case
    data_format = value.lower()
    # Only "channels_first" and "channels_last" are valid
    if data_format not in {"channels_first", "channels_last"}:
        raise ValueError(
            'The `data_format` argument must be one of "channels_first", "channels_last". Received: ' + str(value)
        )
    # Return the normalized data format
return data_format
class AdaptiveAveragePooling1D(keras.layers.Layer):
    """
    Average 1D Pooling with adaptive kernel size.
    Args:
        output_size: An integer or tuple/list of a single integer, specifying pooled_features.
            The new size of output channels.
        data_format: A string,
            one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs.
            `channels_last` corresponds to inputs with shape `(batch, steps, channels)` while `channels_first`
            corresponds to inputs with shape `(batch, channels, steps)`.
    Input shape:
        - If `data_format='channels_last'`: 3D tensor with shape `(batch, steps, channels)`.
        - If `data_format='channels_first'`: 3D tensor with shape `(batch, channels, steps)`.
    Output shape:
        - If `data_format='channels_last'`: 3D tensor with shape `(batch_size, pooled_steps, channels)`.
        - If `data_format='channels_first'`: 3D tensor with shape `(batch_size, channels, pooled_steps)`.
    Adapted from [tensorflow-addon's adaptive pooling.py](
        https://github.com/tensorflow/addons/blob/8cec33fcaaf1cf90aec7bdd55a0fcdbb251ce5c2/tensorflow_addons/layers/adaptive_pooling.py#L90-L120
    )
    """
    # Average pooling with an adaptive kernel size
def __init__(
self,
        output_size: Union[int, Iterable[int]],  # pooled output size, an int or an iterable of one int
        reduce_function: Callable = tf.reduce_mean,  # pooling function, mean by default
        data_format: Optional[str] = None,  # "channels_last" or "channels_first"
        **kwargs,  # extra keyword arguments
    ) -> None:
        self.data_format = normalize_data_format(data_format)  # normalize the data format
        self.reduce_function = reduce_function  # pooling function
        self.output_size = (output_size,) if isinstance(output_size, int) else tuple(output_size)  # output size as a tuple
        super().__init__(**kwargs)  # initialize the parent class
    def call(self, inputs: tf.Tensor, *args) -> None:
        bins = self.output_size[0]  # number of output bins
        if self.data_format == "channels_last":
            splits = tf.split(inputs, bins, axis=1)  # split along the steps dimension
            splits = tf.stack(splits, axis=1)  # stack the splits into a new bin dimension
            out_vect = self.reduce_function(splits, axis=2)  # average within each bin
        else:
            splits = tf.split(inputs, bins, axis=2)  # split along the steps dimension (channels first)
            splits = tf.stack(splits, axis=2)  # stack the splits into a new bin dimension
            out_vect = self.reduce_function(splits, axis=3)  # average within each bin
        return out_vect  # return the pooled tensor
def compute_output_shape(self, input_shape: Iterable[int]) -> tf.TensorShape:
        input_shape = tf.TensorShape(input_shape).as_list()  # input shape as a list
        if self.data_format == "channels_last":
            shape = tf.TensorShape([input_shape[0], self.output_size[0], input_shape[2]])  # pooled steps in the middle
        else:
            shape = tf.TensorShape([input_shape[0], input_shape[1], self.output_size[0]])  # pooled steps at the end
        return shape  # output shape
    def get_config(self) -> Dict[str, Any]:
        config = {
            "output_size": self.output_size,  # output size
            "data_format": self.data_format,  # data format
        }
        base_config = super().get_config()  # parent configuration
        return {**base_config, **config}  # merged configuration
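# Illustrative sketch (not part of the original model code): with output_size=(1,) the layer above
# behaves like a global average pool over the token dimension, which is how TFSwinMainLayer uses it
# as its pooler. The helper below is hypothetical.
def _example_adaptive_average_pooling() -> None:
    pool = AdaptiveAveragePooling1D(output_size=(1,))
    inputs = tf.random.uniform((2, 49, 768))  # (batch, tokens, channels)
    pooled = pool(inputs)
    assert shape_list(pooled) == [2, 1, 768]
    tf.debugging.assert_near(pooled[:, 0, :], tf.reduce_mean(inputs, axis=1))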
# TFSwinMainLayer: the serializable core of the Swin model (embeddings + encoder + pooling), decorated with keras_serializable
@keras_serializable
class TFSwinMainLayer(keras.layers.Layer):
    # Configuration class
    config_class = SwinConfig
    # Constructor taking the SwinConfig and optional pooling / mask-token flags
    def __init__(
        self, config: SwinConfig, add_pooling_layer: bool = True, use_mask_token: bool = False, **kwargs
    ) -> None:
        # Initialize the parent class
        super().__init__(**kwargs)
        # Keep the config
        self.config = config
        # Number of stages
        self.num_layers = len(config.depths)
        # Channel dimension of the last stage: embed_dim * 2 ** (num_layers - 1)
        self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))
        # Patch and position embeddings
        self.embeddings = TFSwinEmbeddings(config, use_mask_token=use_mask_token, name="embeddings")
        # Encoder over the patch grid
        self.encoder = TFSwinEncoder(config, self.embeddings.patch_grid, name="encoder")
        # Final layer normalization
        self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
        # Optional pooler: adaptive average pooling down to a single token
        self.pooler = AdaptiveAveragePooling1D(output_size=(1,)) if add_pooling_layer else None
    # Return the patch embeddings as the input embeddings
    def get_input_embeddings(self) -> TFSwinPatchEmbeddings:
        return self.embeddings.patch_embeddings
    # Prune attention heads of the model (not implemented for the TF layers)
def _prune_heads(self, heads_to_prune: Dict[int, List]):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
        # Iterate over the layers and the heads to prune in each of them
        for layer, heads in heads_to_prune.items():
            # Prune the heads of the attention module of the given layer
            self.encoder.layer[layer].attention.prune_heads(heads)
    # Return a per-layer head mask; a non-None head_mask is not supported and raises NotImplementedError
    def get_head_mask(self, head_mask: Optional[Any]) -> List:
        if head_mask is not None:
            raise NotImplementedError
        return [None] * len(self.config.depths)
    # Forward pass through embeddings, encoder, layer norm and the optional pooler
@unpack_inputs
def call(
self,
pixel_values: tf.Tensor | None = None,
bool_masked_pos: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFSwinModelOutput, Tuple[tf.Tensor, ...]]:
        # Fall back to the config defaults when the output flags are not given
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # pixel_values is required
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")
        # Prepare the head mask if needed
        # 1.0 in head_mask means the corresponding head is kept
        # attention_probs has shape bsz x n_heads x N x N
        # the input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask)
        # Compute the patch embeddings and the spatial dimensions of the patch grid
        embedding_output, input_dimensions = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos, training=training
        )
        # Run the encoder on the embeddings
        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )
        # Layer-normalize the encoder's sequence output
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output, training=training)
        # Pooled output defaults to None
        pooled_output = None
        # If a pooler exists, average-pool the sequence output down to one vector per image
        if self.pooler is not None:
            batch_size, _, num_features = shape_list(sequence_output)
            pooled_output = self.pooler(sequence_output)
            pooled_output = tf.reshape(pooled_output, (batch_size, num_features))
        # Return a plain tuple when return_dict is False
        if not return_dict:
            output = (sequence_output, pooled_output) + encoder_outputs[1:]
            return output
        # Otherwise wrap everything in a TFSwinModelOutput
return TFSwinModelOutput(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
)
def build(self, input_shape=None):
        # Return early if already built
        if self.built:
            return
        self.built = True
        # Build the embeddings
        if getattr(self, "embeddings", None) is not None:
            with tf.name_scope(self.embeddings.name):
                self.embeddings.build(None)
        # Build the encoder
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)
        # Build the final layer norm over the last stage's feature dimension
        if getattr(self, "layernorm", None) is not None:
            with tf.name_scope(self.layernorm.name):
                self.layernorm.build([None, None, self.num_features])
self.layernorm.build([None, None, self.num_features])
# 使用装饰器为类添加文档字符串,描述其作为裸的 Swin 模型变换器,输出未经任何特定头部处理的原始隐藏状态
@add_start_docstrings(
"The bare Swin Model transformer outputting raw hidden-states without any specific head on top.",
SWIN_START_DOCSTRING,
)
# 定义 TFSwinModel 类,继承自 TFSwinPreTrainedModel
class TFSwinModel(TFSwinPreTrainedModel):
# 初始化方法
def __init__(
self, config: SwinConfig, add_pooling_layer: bool = True, use_mask_token: bool = False, **kwargs
) -> None:
# 调用父类的初始化方法
super().__init__(config, **kwargs)
# 保存配置信息到实例变量
self.config = config
# 创建 TFSwinMainLayer 的实例 swin,并命名为 "swin"
self.swin = TFSwinMainLayer(config, name="swin")
# 为 call 方法添加文档字符串,描述其作为模型前向传播的入口点,使用 SWIN_INPUTS_DOCSTRING 作为输入文档字符串
@add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
# 使用装饰器添加代码示例文档字符串,展示模型的使用示例
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=TFSwinModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
# 使用装饰器解包输入,确保正确处理输入参数
@unpack_inputs
# 定义 call 方法,接收多个参数并返回 TFSwinModelOutput 或 tf.Tensor 元组
def call(
self,
pixel_values: tf.Tensor | None = None,
bool_masked_pos: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[TFSwinModelOutput, Tuple[tf.Tensor, ...]]:
r"""
bool_masked_pos (`tf.Tensor` of shape `(batch_size, num_patches)`, *optional*):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
"""
# 根据需要确定是否输出注意力权重,默认使用配置中的设置
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 根据需要确定是否输出隐藏状态,默认使用配置中的设置
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 根据需要确定是否返回字典形式的输出,默认使用配置中的设置
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 如果未提供像素值,则引发值错误
if pixel_values is None:
raise ValueError("You have to specify pixel_values")
# 调用 self.swin 的前向传播方法,传递所有参数,并获取模型输出
swin_outputs = self.swin(
pixel_values=pixel_values,
bool_masked_pos=bool_masked_pos,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 返回模型输出
return swin_outputs
# 实现 build 方法,用于构建模型层次结构
def build(self, input_shape=None):
# 如果已经构建过,直接返回
if self.built:
return
# 标记为已构建
self.built = True
# 如果 self.swin 已存在,则在命名空间下构建 self.swin
if getattr(self, "swin", None) is not None:
with tf.name_scope(self.swin.name):
self.swin.build(None)
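# 使用示例(说明性草图,非本文件的原始内容;检查点名称与图片 URL 仅作演示,需要联网下载):
from PIL import Image
import requests
from transformers import AutoImageProcessor, TFSwinModel

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
model = TFSwinModel.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
inputs = processor(images=image, return_tensors="tf")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)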
# 定义 TFSwinPixelShuffle 类,继承自 keras.layers.Layer,实现了 torch.nn.PixelShuffle 的 TensorFlow 版本的层
class TFSwinPixelShuffle(keras.layers.Layer):
"""TF layer implementation of torch.nn.PixelShuffle"""
# 初始化方法
def __init__(self, upscale_factor: int, **kwargs) -> None:
# 调用父类的初始化方法
super().__init__(**kwargs)
# 如果 upscale_factor 不是整数或小于 2,则引发值错误
if not isinstance(upscale_factor, int) or upscale_factor < 2:
raise ValueError(f"upscale_factor must be an integer value >= 2 got {upscale_factor}")
# 保存 upscale_factor 到实例变量
self.upscale_factor = upscale_factor
# 定义一个方法,接受一个张量 x 作为输入,返回一个张量作为输出
def call(self, x: tf.Tensor) -> tf.Tensor:
# 将输入张量赋值给 hidden_states
hidden_states = x
# 调用 shape_list 函数获取 hidden_states 的形状信息,并解包得到 batch_size, _, _, num_input_channels
batch_size, _, _, num_input_channels = shape_list(hidden_states)
# 计算块大小的平方
block_size_squared = self.upscale_factor**2
# 计算输出深度,即 num_input_channels 除以块大小的平方后取整
output_depth = int(num_input_channels / block_size_squared)
# 创建一个常量张量 permutation,用于存储一个通道排列顺序的索引
permutation = tf.constant(
# 使用列表推导式生成的二维数组,每个元素是一个索引,按照不同通道和块的顺序排列
[[i + j * block_size_squared for i in range(block_size_squared) for j in range(output_depth)]]
)
# 使用 tf.gather 函数根据 permutation 中的索引重新组织 hidden_states 的通道
hidden_states = tf.gather(params=hidden_states, indices=tf.tile(permutation, [batch_size, 1]), batch_dims=-1)
# 使用 tf.nn.depth_to_space 函数进行深度到空间的转换,根据 upscale_factor 参数进行块的重新排列
hidden_states = tf.nn.depth_to_space(hidden_states, block_size=self.upscale_factor, data_format="NHWC")
# 返回处理后的 hidden_states 作为结果
return hidden_states
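# 示例(说明性草图,非本文件的原始内容):tf.nn.depth_to_space 会把通道维按 upscale_factor**2
# 重新排列到空间维。下面用一个 1x1、4 通道的张量演示 block_size=2 时输出变为 2x2、1 通道。
import tensorflow as tf

x = tf.reshape(tf.range(4, dtype=tf.float32), (1, 1, 1, 4))    # 形状 (1, 1, 1, 4)
y = tf.nn.depth_to_space(x, block_size=2, data_format="NHWC")  # 形状 (1, 2, 2, 1)
print(y.shape)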
# 自定义的 TensorFlow 2.x 模型层,用于实现 TFSwin 模型的解码器部分
class TFSwinDecoder(keras.layers.Layer):
def __init__(self, config: SwinConfig, **kwargs):
super().__init__(**kwargs)
# 定义一个 1x1 卷积层,用于特征变换
self.conv2d = keras.layers.Conv2D(
filters=config.encoder_stride**2 * config.num_channels, kernel_size=1, strides=1, name="0"
)
# 像素重排层,用于反向像素重排
self.pixel_shuffle = TFSwinPixelShuffle(config.encoder_stride, name="1")
# 保存 Swin 模型的配置信息
self.config = config
def call(self, x: tf.Tensor) -> tf.Tensor:
# 将输入张量从 B,C,H,W 转置为 B,H,W,C
hidden_states = x
hidden_states = tf.transpose(hidden_states, (0, 2, 3, 1))
# 经过 1x1 卷积层变换
hidden_states = self.conv2d(hidden_states)
# 经过像素重排层
hidden_states = self.pixel_shuffle(hidden_states)
# 将输出张量从 B,H,W,C 转置为 B,C,H,W
hidden_states = tf.transpose(hidden_states, (0, 3, 1, 2))
return hidden_states
def build(self, input_shape=None):
# 如果已经构建过,直接返回
if self.built:
return
self.built = True
# 构建卷积层
if getattr(self, "conv2d", None) is not None:
with tf.name_scope(self.conv2d.name):
self.conv2d.build([None, None, None, self.config.hidden_size])
# 构建像素重排层
if getattr(self, "pixel_shuffle", None) is not None:
with tf.name_scope(self.pixel_shuffle.name):
self.pixel_shuffle.build(None)
# 基于 Swin 模型的一个变体,用于处理带掩码的图像建模,参考 SimMIM 论文提出的方法
@add_start_docstrings(
"Swin Model with a decoder on top for masked image modeling, as proposed in"
" [SimMIM](https://arxiv.org/abs/2111.09886).",
SWIN_START_DOCSTRING,
)
class TFSwinForMaskedImageModeling(TFSwinPreTrainedModel):
def __init__(self, config: SwinConfig):
super().__init__(config)
# Swin 主层,不包含池化层,使用掩码标记
self.swin = TFSwinMainLayer(config, add_pooling_layer=False, use_mask_token=True, name="swin")
# Swin 解码器层
self.decoder = TFSwinDecoder(config, name="decoder")
@add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=TFSwinMaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
@unpack_inputs
def call(
self,
pixel_values: tf.Tensor | None = None,
bool_masked_pos: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
):
# 略
pass
def build(self, input_shape=None):
# 如果已经构建过,直接返回
if self.built:
return
self.built = True
# 构建 Swin 主层
if getattr(self, "swin", None) is not None:
with tf.name_scope(self.swin.name):
self.swin.build(None)
# 构建 Swin 解码器层
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
# Swin 模型的图像分类变体,顶部附加了一个分类头部的线性层(在 [CLS] 标记的最终隐藏状态之上),例如用于 ImageNet
@add_start_docstrings(
"""
Swin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
the [CLS] token) e.g. for ImageNet.
""",
SWIN_START_DOCSTRING,
)
class TFSwinForImageClassification(TFSwinPreTrainedModel, TFSequenceClassificationLoss):
# 略
pass
# 初始化函数,接受一个 SwinConfig 类型的配置对象作为参数
def __init__(self, config: SwinConfig):
# 调用父类的初始化方法
super().__init__(config)
# 设置类的属性,表示分类数目
self.num_labels = config.num_labels
# 创建一个 TFSwinMainLayer 类的实例,命名为 "swin"
self.swin = TFSwinMainLayer(config, name="swin")
# 分类器头部
# 如果配置的标签数目大于 0,则创建一个全连接层作为分类器
# 否则创建一个线性激活层作为分类器
self.classifier = (
keras.layers.Dense(config.num_labels, name="classifier")
if config.num_labels > 0
else keras.layers.Activation("linear", name="classifier")
)
# 根据装饰器提供的文档字符串,定义了模型前向传播的方法
@add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=TFSwinImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
@unpack_inputs
def call(
self,
pixel_values: tf.Tensor | None = None,
head_mask: tf.Tensor | None = None,
labels: tf.Tensor | None = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
) -> Union[Tuple[tf.Tensor, ...], TFSwinImageClassifierOutput]:
"""
labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# 确定是否返回字典类型的输出
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用 Swin 模型的前向传播方法
outputs = self.swin(
pixel_values,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
training=training,
)
# 获取池化后的输出
pooled_output = outputs[1]
# 将池化输出传递给分类器进行预测
logits = self.classifier(pooled_output, training=training)
# 如果有提供标签,则计算损失
loss = None if labels is None else self.hf_compute_loss(labels, logits)
# 如果不要求返回字典类型的输出,则按需返回输出的元组
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# 否则返回 TFSwinImageClassifierOutput 类型的对象
return TFSwinImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
reshaped_hidden_states=outputs.reshaped_hidden_states,
)
# 构建模型,设置模型的输入形状
def build(self, input_shape=None):
# 如果模型已经构建过,则直接返回
if self.built:
return
# 标记模型已经构建
self.built = True
# 如果存在 Swin 层,则在其命名空间下构建 Swin 层
if getattr(self, "swin", None) is not None:
with tf.name_scope(self.swin.name):
self.swin.build(None)
# 如果存在分类器,则在其命名空间下构建分类器,并传入 Swin 特征数目作为输入形状的一部分
if getattr(self, "classifier", None) is not None:
if hasattr(self.classifier, "name"):
with tf.name_scope(self.classifier.name):
self.classifier.build([None, None, self.swin.num_features])
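# 使用示例(说明性草图,非本文件的原始内容;检查点名称与图片 URL 仅作演示,需要联网下载):
from PIL import Image
import requests
import tensorflow as tf
from transformers import AutoImageProcessor, TFSwinForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
model = TFSwinForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
inputs = processor(images=image, return_tensors="tf")
logits = model(**inputs).logits
predicted_class_idx = int(tf.math.argmax(logits, axis=-1)[0])
print(model.config.id2label[predicted_class_idx])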
.\models\swin\__init__.py
# 引入类型检查的模块
from typing import TYPE_CHECKING
# 引入异常类,用于处理可选依赖不可用的情况
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available
# 定义模块的导入结构,包含配置和模型相关的导入信息
_import_structure = {"configuration_swin": ["SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "SwinConfig", "SwinOnnxConfig"]}
# 检查是否有torch可用,若不可用则抛出OptionalDependencyNotAvailable异常
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# 若torch可用,则添加相关的模型定义到_import_structure中
_import_structure["modeling_swin"] = [
"SWIN_PRETRAINED_MODEL_ARCHIVE_LIST",
"SwinForImageClassification",
"SwinForMaskedImageModeling",
"SwinModel",
"SwinPreTrainedModel",
"SwinBackbone",
]
# 检查是否有tensorflow可用,若不可用则抛出OptionalDependencyNotAvailable异常
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# 若tensorflow可用,则添加相关的tensorflow模型定义到_import_structure中
_import_structure["modeling_tf_swin"] = [
"TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST",
"TFSwinForImageClassification",
"TFSwinForMaskedImageModeling",
"TFSwinModel",
"TFSwinPreTrainedModel",
]
# 如果当前是类型检查阶段
if TYPE_CHECKING:
# 从配置模块中导入特定的配置类和常量
from .configuration_swin import SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, SwinConfig, SwinOnnxConfig
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# 从模型定义模块中导入特定的torch模型类
from .modeling_swin import (
SWIN_PRETRAINED_MODEL_ARCHIVE_LIST,
SwinBackbone,
SwinForImageClassification,
SwinForMaskedImageModeling,
SwinModel,
SwinPreTrainedModel,
)
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# 从tensorflow模型定义模块中导入特定的tensorflow模型类
from .modeling_tf_swin import (
TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST,
TFSwinForImageClassification,
TFSwinForMaskedImageModeling,
TFSwinModel,
TFSwinPreTrainedModel,
)
# 如果不是类型检查阶段,则执行延迟模块加载的逻辑
else:
import sys
# 将当前模块替换为LazyModule,以实现延迟导入
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
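# 示例(说明性草图,非本文件的原始内容):延迟导入对调用方是透明的,按常规方式导入即可,
# 底层子模块只有在首次访问对应符号时才会真正被加载。
from transformers import SwinConfig

config = SwinConfig()
print(config.model_type)  # "swin"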
.\models\swin2sr\configuration_swin2sr.py
# 设置文件编码为 UTF-8
# 版权声明,版权归 HuggingFace Inc. 团队所有
#
# 根据 Apache 许可证 2.0 版本使用本文件,除非符合许可证,否则不得使用此文件
# 您可以在以下网址获取许可证副本:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则按"原样"分发本软件
# 本软件没有任何明示或暗示的保证或条件
# 详细信息请参阅许可证
""" Swin2SR Transformer model configuration"""
# 导入预训练配置类 PretrainedConfig
from ...configuration_utils import PretrainedConfig
# 导入日志记录工具
from ...utils import logging
# 获取名为 __name__ 的日志记录器
logger = logging.get_logger(__name__)
# Swin2SR 预训练配置映射表,包含了模型名称及其配置文件的 URL
SWIN2SR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"caidas/swin2sr-classicalsr-x2-64": (
"https://huggingface.co/caidas/swin2sr-classicalsr-x2-64/resolve/main/config.json"
),
}
# Swin2SRConfig 类,用于存储 Swin2SRModel 的配置
class Swin2SRConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Swin2SRModel`]. It is used to instantiate a Swin
Transformer v2 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Swin Transformer v2
[caidas/swin2sr-classicalsr-x2-64](https://huggingface.co/caidas/swin2sr-classicalsr-x2-64) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import Swin2SRConfig, Swin2SRModel
>>> # Initializing a Swin2SR caidas/swin2sr-classicalsr-x2-64 style configuration
>>> configuration = Swin2SRConfig()
>>> # Initializing a model (with random weights) from the caidas/swin2sr-classicalsr-x2-64 style configuration
>>> model = Swin2SRModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
# 模型类型为 "swin2sr"
model_type = "swin2sr"
# 属性映射,将类的属性名映射到预训练模型配置中的参数名
attribute_map = {
"hidden_size": "embed_dim",
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
# Swin2SRConfig 类的构造函数,定义了 Swin2SR 模型的各种配置参数
def __init__(
self,
image_size=64,
patch_size=1,
num_channels=3,
num_channels_out=None,
embed_dim=180,
depths=[6, 6, 6, 6, 6, 6],
num_heads=[6, 6, 6, 6, 6, 6],
window_size=8,
mlp_ratio=2.0,
qkv_bias=True,
hidden_dropout_prob=0.0,
attention_probs_dropout_prob=0.0,
drop_path_rate=0.1,
hidden_act="gelu",
use_absolute_embeddings=False,
initializer_range=0.02,
layer_norm_eps=1e-5,
upscale=2,
img_range=1.0,
resi_connection="1conv",
upsampler="pixelshuffle",
**kwargs,
):
# 调用父类的初始化方法,传入所有关键字参数
super().__init__(**kwargs)
# 设置模型的图像大小
self.image_size = image_size
# 设置每个图像块的大小
self.patch_size = patch_size
# 输入图像的通道数
self.num_channels = num_channels
# 输出图像的通道数,默认与输入通道数相同
self.num_channels_out = num_channels if num_channels_out is None else num_channels_out
# 嵌入维度
self.embed_dim = embed_dim
# 每个阶段中 Transformer 层数(深度)的列表
self.depths = depths
# 阶段(stage)的数量,即深度列表的长度
self.num_layers = len(depths)
# 每个阶段的注意力头数列表
self.num_heads = num_heads
# 窗口大小
self.window_size = window_size
# MLP(多层感知机)扩展比例
self.mlp_ratio = mlp_ratio
# 是否使用查询、键、值的偏置
self.qkv_bias = qkv_bias
# 隐藏层的dropout率
self.hidden_dropout_prob = hidden_dropout_prob
# 注意力概率的dropout率
self.attention_probs_dropout_prob = attention_probs_dropout_prob
# 路径丢弃率
self.drop_path_rate = drop_path_rate
# 隐藏层的激活函数类型
self.hidden_act = hidden_act
# 是否使用绝对位置嵌入
self.use_absolute_embeddings = use_absolute_embeddings
# 层归一化的epsilon值
self.layer_norm_eps = layer_norm_eps
# 初始化范围
self.initializer_range = initializer_range
# 上采样(放大)倍数
self.upscale = upscale
# 图像的像素范围
self.img_range = img_range
# 残差连接块的类型(如 "1conv" 或 "3conv")
self.resi_connection = resi_connection
# 上采样器
self.upsampler = upsampler
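# 示例(说明性草图,非本文件的原始内容):attribute_map 让通用属性名映射到 Swin2SR 特有的参数名。
from transformers import Swin2SRConfig

config = Swin2SRConfig(embed_dim=180, depths=[6, 6, 6, 6, 6, 6])
print(config.hidden_size)        # 180,等价于 config.embed_dim
print(config.num_hidden_layers)  # 6,等价于 config.num_layers,即 len(depths)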
.\models\swin2sr\convert_swin2sr_original_to_pytorch.py
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Swin2SR checkpoints from the original repository. URL: https://github.com/mv-lab/swin2sr"""
import argparse # 导入解析命令行参数的模块
import requests # 导入发送 HTTP 请求的模块
import torch # 导入 PyTorch 深度学习框架
from PIL import Image # 导入处理图像的模块
from torchvision.transforms import Compose, Normalize, Resize, ToTensor # 导入图像转换相关模块
from transformers import Swin2SRConfig, Swin2SRForImageSuperResolution, Swin2SRImageProcessor # 导入转换器相关模块
def get_config(checkpoint_url):
config = Swin2SRConfig() # 创建 Swin2SRConfig 的实例
if "Swin2SR_ClassicalSR_X4_64" in checkpoint_url:
config.upscale = 4 # 设置放大倍数为4
elif "Swin2SR_CompressedSR_X4_48" in checkpoint_url:
config.upscale = 4 # 设置放大倍数为4
config.image_size = 48 # 设置图像尺寸为48
config.upsampler = "pixelshuffle_aux" # 设置上采样方法为 pixelshuffle_aux
elif "Swin2SR_Lightweight_X2_64" in checkpoint_url:
config.depths = [6, 6, 6, 6] # 设置深度参数列表
config.embed_dim = 60 # 设置嵌入维度为60
config.num_heads = [6, 6, 6, 6] # 设置注意力头数列表
config.upsampler = "pixelshuffledirect" # 设置上采样方法为 pixelshuffledirect
elif "Swin2SR_RealworldSR_X4_64_BSRGAN_PSNR" in checkpoint_url:
config.upscale = 4 # 设置放大倍数为4
config.upsampler = "nearest+conv" # 设置上采样方法为 nearest+conv
elif "Swin2SR_Jpeg_dynamic" in checkpoint_url:
config.num_channels = 1 # 设置通道数为1
config.upscale = 1 # 设置放大倍数为1
config.image_size = 126 # 设置图像尺寸为126
config.window_size = 7 # 设置窗口大小为7
config.img_range = 255.0 # 设置图像像素范围为255.0
config.upsampler = "" # 设置上采样方法为空字符串
return config # 返回配置对象
def rename_key(name, config):
if "patch_embed.proj" in name and "layers" not in name:
name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
if "patch_embed.norm" in name:
name = name.replace("patch_embed.norm", "embeddings.patch_embeddings.layernorm")
if "layers" in name:
name = name.replace("layers", "encoder.stages")
if "residual_group.blocks" in name:
name = name.replace("residual_group.blocks", "layers")
if "attn.proj" in name:
name = name.replace("attn.proj", "attention.output.dense")
if "attn" in name:
name = name.replace("attn", "attention.self")
if "norm1" in name:
name = name.replace("norm1", "layernorm_before")
if "norm2" in name:
name = name.replace("norm2", "layernorm_after")
if "mlp.fc1" in name:
name = name.replace("mlp.fc1", "intermediate.dense")
if "mlp.fc2" in name:
name = name.replace("mlp.fc2", "output.dense")
if "q_bias" in name:
name = name.replace("q_bias", "query.bias")
if "k_bias" in name:
name = name.replace("k_bias", "key.bias")
# 如果变量名中包含 "v_bias",则替换为 "value.bias"
if "v_bias" in name:
name = name.replace("v_bias", "value.bias")
# 如果变量名中包含 "cpb_mlp",则替换为 "continuous_position_bias_mlp"
if "cpb_mlp" in name:
name = name.replace("cpb_mlp", "continuous_position_bias_mlp")
# 如果变量名中包含 "patch_embed.proj",则替换为 "patch_embed.projection"
if "patch_embed.proj" in name:
name = name.replace("patch_embed.proj", "patch_embed.projection")
# 如果变量名为 "norm.weight",则替换为 "layernorm.weight"
if name == "norm.weight":
name = "layernorm.weight"
# 如果变量名为 "norm.bias",则替换为 "layernorm.bias"
if name == "norm.bias":
name = "layernorm.bias"
# 如果变量名中包含 "conv_first",则替换为 "first_convolution"
if "conv_first" in name:
name = name.replace("conv_first", "first_convolution")
# 如果变量名中包含以下任意一个字符串,将其替换为相应的名称或前缀
if (
"upsample" in name
or "conv_before_upsample" in name
or "conv_bicubic" in name
or "conv_up" in name
or "conv_hr" in name
or "conv_last" in name
or "aux" in name
):
# 对于特定的字符串替换规则
if "conv_last" in name:
name = name.replace("conv_last", "final_convolution")
# 根据 config.upsampler 的不同取值进行不同的替换
if config.upsampler in ["pixelshuffle", "pixelshuffle_aux", "nearest+conv"]:
if "conv_before_upsample.0" in name:
name = name.replace("conv_before_upsample.0", "conv_before_upsample")
if "upsample.0" in name:
name = name.replace("upsample.0", "upsample.convolution_0")
if "upsample.2" in name:
name = name.replace("upsample.2", "upsample.convolution_1")
# 统一添加前缀 "upsample."
name = "upsample." + name
elif config.upsampler == "pixelshuffledirect":
# 特定替换规则
name = name.replace("upsample.0.weight", "upsample.conv.weight")
name = name.replace("upsample.0.bias", "upsample.conv.bias")
else:
pass
else:
# 如果不符合以上任何替换条件,则添加前缀 "swin2sr."
name = "swin2sr." + name
# 返回处理后的变量名
return name
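# 示例(说明性草图,非本文件的原始内容):追踪上面 rename_key 的替换顺序,原始检查点键
# "layers.0.residual_group.blocks.0.attn.proj.weight" 依次经过
# layers -> encoder.stages、residual_group.blocks -> layers、attn.proj -> attention.output.dense
# 三次替换,又因不属于上采样相关分支而被加上 "swin2sr." 前缀:
example_key = rename_key("layers.0.residual_group.blocks.0.attn.proj.weight", get_config(""))
assert example_key == "swin2sr.encoder.stages.0.layers.0.attention.output.dense.weight"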
# 转换给定的原始状态字典,根据配置更新键名
def convert_state_dict(orig_state_dict, config):
# 遍历原始状态字典的复制键列表
for key in orig_state_dict.copy().keys():
# 弹出当前键对应的值
val = orig_state_dict.pop(key)
# 如果键名包含"qkv"
if "qkv" in key:
# 拆分键名以获取阶段号、块号和维度
key_split = key.split(".")
stage_num = int(key_split[1])
block_num = int(key_split[4])
dim = config.embed_dim
# 如果键名中包含"weight"
if "weight" in key:
# 更新查询权重、键权重和值权重的新键名和对应的值
orig_state_dict[
f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.query.weight"
] = val[:dim, :]
orig_state_dict[
f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.key.weight"
] = val[dim : dim * 2, :]
orig_state_dict[
f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.value.weight"
] = val[-dim:, :]
else:
# 更新查询偏置、键偏置和值偏置的新键名和对应的值
orig_state_dict[
f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.query.bias"
] = val[:dim]
orig_state_dict[
f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.key.bias"
] = val[dim : dim * 2]
orig_state_dict[
f"swin2sr.encoder.stages.{stage_num}.layers.{block_num}.attention.self.value.bias"
] = val[-dim:]
pass
else:
# 对于其他键,使用配置中的重命名函数处理键名,并更新原始状态字典
orig_state_dict[rename_key(key, config)] = val
# 返回更新后的原始状态字典
return orig_state_dict
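# 示例(说明性草图,非本文件的原始内容):原始检查点中融合的 qkv 权重形状为 (3*dim, dim),
# 上面按行切成 query/key/value 三份,与 val[:dim]、val[dim : dim * 2]、val[-dim:] 的切片一致。
import torch

dim = 4
qkv_weight = torch.randn(3 * dim, dim)
query_w = qkv_weight[:dim, :]
key_w = qkv_weight[dim : dim * 2, :]
value_w = qkv_weight[-dim:, :]
assert query_w.shape == key_w.shape == value_w.shape == (dim, dim)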
def convert_swin2sr_checkpoint(checkpoint_url, pytorch_dump_folder_path, push_to_hub):
# 获取模型配置
config = get_config(checkpoint_url)
# 根据配置创建模型实例
model = Swin2SRForImageSuperResolution(config)
# 将模型设置为评估模式
model.eval()
# 从给定的 URL 加载模型状态字典到本地
state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")
# 使用转换函数将状态字典转换为适用于当前模型的新状态字典
new_state_dict = convert_state_dict(state_dict, config)
# 加载新的状态字典到模型中,并获取缺失键和意外键
missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
# 如果存在缺失键,抛出值错误
if len(missing_keys) > 0:
raise ValueError("Missing keys when converting: {}".format(missing_keys))
# 对于每个意外的键,如果不包含指定的子字符串,则抛出值错误
for key in unexpected_keys:
if not ("relative_position_index" in key or "relative_coords_table" in key or "self_mask" in key):
raise ValueError(f"Unexpected key {key} in state_dict")
# 验证加载的图像 URL
url = "https://github.com/mv-lab/swin2sr/blob/main/testsets/real-inputs/shanghai.jpg?raw=true"
# 使用请求获取并打开图像,并将其转换为 RGB 模式
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
# 创建图像处理器实例
processor = Swin2SRImageProcessor()
# 根据模型类型设置图像大小
image_size = 126 if "Jpeg" in checkpoint_url else 256
# 定义图像转换步骤,包括调整大小、转换为张量和归一化处理
transforms = Compose(
[
Resize((image_size, image_size)),
ToTensor(),
Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
)
# 对图像应用转换步骤,并扩展维度以匹配模型输入
pixel_values = transforms(image).unsqueeze(0)
# 如果配置中指定的通道数为 1,只保留第一个通道的像素值
if config.num_channels == 1:
pixel_values = pixel_values[:, 0, :, :].unsqueeze(1)
# 使用模型对输入像素值进行推理,得到输出结果
outputs = model(pixel_values)
# 根据不同的 checkpoint_url 设置预期的输出形状和切片
if "Swin2SR_ClassicalSR_X2_64" in checkpoint_url:
expected_shape = torch.Size([1, 3, 512, 512])
expected_slice = torch.tensor(
[[-0.7087, -0.7138, -0.6721], [-0.8340, -0.8095, -0.7298], [-0.9149, -0.8414, -0.7940]]
)
elif "Swin2SR_ClassicalSR_X4_64" in checkpoint_url:
expected_shape = torch.Size([1, 3, 1024, 1024])
expected_slice = torch.tensor(
[[-0.7775, -0.8105, -0.8933], [-0.7764, -0.8356, -0.9225], [-0.7976, -0.8686, -0.9579]]
)
elif "Swin2SR_CompressedSR_X4_48" in checkpoint_url:
# TODO 值在这里并不完全匹配
expected_shape = torch.Size([1, 3, 1024, 1024])
expected_slice = torch.tensor(
[[-0.8035, -0.7504, -0.7491], [-0.8538, -0.8124, -0.7782], [-0.8804, -0.8651, -0.8493]]
)
elif "Swin2SR_Lightweight_X2_64" in checkpoint_url:
expected_shape = torch.Size([1, 3, 512, 512])
expected_slice = torch.tensor(
[[-0.7669, -0.8662, -0.8767], [-0.8810, -0.9962, -0.9820], [-0.9340, -1.0322, -1.1149]]
)
elif "Swin2SR_RealworldSR_X4_64_BSRGAN_PSNR" in checkpoint_url:
expected_shape = torch.Size([1, 3, 1024, 1024])
expected_slice = torch.tensor(
[[-0.5238, -0.5557, -0.6321], [-0.6016, -0.5903, -0.6391], [-0.6244, -0.6334, -0.6889]]
)
# 断言输出重建的形状是否与预期一致
assert (
outputs.reconstruction.shape == expected_shape
), f"Shape of reconstruction should be {expected_shape}, but is {outputs.reconstruction.shape}"
# 断言输出重建的部分数据是否与预期一致,容差为 1e-3
assert torch.allclose(outputs.reconstruction[0, 0, :3, :3], expected_slice, atol=1e-3)
# 打印提示信息,表明检查通过
print("Looks ok!")
# 将 checkpoint_url 映射到模型名称的字典
url_to_name = {
"https://github.com/mv-lab/swin2sr/releases/download/v0.0.1/Swin2SR_ClassicalSR_X2_64.pth": (
"swin2SR-classical-sr-x2-64"
),
"https://github.com/mv-lab/swin2sr/releases/download/v0.0.1/Swin2SR_ClassicalSR_X4_64.pth": (
"swin2SR-classical-sr-x4-64"
),
"https://github.com/mv-lab/swin2sr/releases/download/v0.0.1/Swin2SR_CompressedSR_X4_48.pth": (
"swin2SR-compressed-sr-x4-48"
),
"https://github.com/mv-lab/swin2sr/releases/download/v0.0.1/Swin2SR_Lightweight_X2_64.pth": (
"swin2SR-lightweight-x2-64"
),
"https://github.com/mv-lab/swin2sr/releases/download/v0.0.1/Swin2SR_RealworldSR_X4_64_BSRGAN_PSNR.pth": (
"swin2SR-realworld-sr-x4-64-bsrgan-psnr"
),
}
# 根据 checkpoint_url 获取模型名称
model_name = url_to_name[checkpoint_url]
# 如果指定了 pytorch_dump_folder_path,保存模型和处理器到该路径
if pytorch_dump_folder_path is not None:
print(f"Saving model {model_name} to {pytorch_dump_folder_path}")
model.save_pretrained(pytorch_dump_folder_path)
print(f"Saving image processor to {pytorch_dump_folder_path}")
processor.save_pretrained(pytorch_dump_folder_path)
# 如果设置了 push_to_hub 标志,将模型和处理器推送到 Hub
if push_to_hub:
model.push_to_hub(f"caidas/{model_name}")
processor.push_to_hub(f"caidas/{model_name}")
if __name__ == "__main__":
# 如果当前脚本作为主程序运行,则执行以下代码块
parser = argparse.ArgumentParser()
# 创建参数解析器对象
# Required parameters
# 必需的参数设定
parser.add_argument(
"--checkpoint_url",
default="https://github.com/mv-lab/swin2sr/releases/download/v0.0.1/Swin2SR_ClassicalSR_X2_64.pth",
type=str,
help="URL of the original Swin2SR checkpoint you'd like to convert.",
)
# 添加名为 "checkpoint_url" 的参数,设定默认值为 Swin2SR 模型的下载地址,类型为字符串,帮助信息指定用途
parser.add_argument(
"--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
)
# 添加名为 "pytorch_dump_folder_path" 的参数,设定默认值为 None,类型为字符串,帮助信息指定输出 PyTorch 模型的目录路径
parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the converted model to the hub.")
# 添加名为 "push_to_hub" 的参数,设定为布尔类型,表示是否将转换后的模型推送到模型中心(hub)
args = parser.parse_args()
# 解析命令行参数,并将结果存储在 args 对象中
convert_swin2sr_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.push_to_hub)
# 调用函数 convert_swin2sr_checkpoint,传递解析后的参数对象 args 的相应属性作为函数参数
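# 命令行用法示例(说明性草图,非本文件的原始内容;输出目录路径仅作演示):
# python convert_swin2sr_original_to_pytorch.py \
#     --checkpoint_url https://github.com/mv-lab/swin2sr/releases/download/v0.0.1/Swin2SR_ClassicalSR_X2_64.pth \
#     --pytorch_dump_folder_path ./swin2SR-classical-sr-x2-64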
.\models\swin2sr\image_processing_swin2sr.py
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Swin2SR."""
from typing import Optional, Union
import numpy as np
# 导入基础的图像处理工具和转换函数
from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import get_image_size, pad, to_channel_dimension_format
from ...image_utils import (
ChannelDimension,
ImageInput,
infer_channel_dimension_format,
is_scaled_image,
make_list_of_images,
to_numpy_array,
valid_images,
validate_kwargs,
validate_preprocess_arguments,
)
# 导入日志记录工具
from ...utils import TensorType, logging
# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)
# 定义图像处理器类 Swin2SRImageProcessor,继承自 BaseImageProcessor
class Swin2SRImageProcessor(BaseImageProcessor):
r"""
Constructs a Swin2SR image processor.
Args:
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
`preprocess` method.
"""
# 模型输入名称列表
model_input_names = ["pixel_values"]
# 初始化方法
def __init__(
self,
do_rescale: bool = True,
rescale_factor: Union[int, float] = 1 / 255,
do_pad: bool = True,
pad_size: int = 8,
**kwargs,
) -> None:
# 调用父类的初始化方法
super().__init__(**kwargs)
# 初始化各参数
self.do_rescale = do_rescale # 是否进行图像缩放
self.rescale_factor = rescale_factor # 缩放因子,默认为 1/255
self.do_pad = do_pad # 是否进行填充
self.pad_size = pad_size # 填充尺寸
self._valid_processor_keys = [ # 可接受的处理器关键字列表
"images",
"do_rescale",
"rescale_factor",
"do_pad",
"pad_size",
"return_tensors",
"data_format",
"input_data_format",
]
# 图像填充方法
def pad(
self,
image: np.ndarray,
size: int,
data_format: Optional[Union[str, ChannelDimension]] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
# 实现图像填充操作,接受以下参数:
# - image: 待填充的图像数组
# - size: 填充尺寸
# - data_format: 输出数据格式(通道维度格式),可选
# - input_data_format: 输入数据格式(通道维度格式),可选
):
"""
Pad an image to make the height and width divisible by `size`.
Args:
image (`np.ndarray`):
Image to pad.
size (`int`):
The size to make the height and width divisible by.
data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the output image. If unset, the channel dimension format of the input
image is used. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
input_data_format (`str` or `ChannelDimension`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
Returns:
`np.ndarray`: The padded image.
"""
# 获取输入图像的原始高度和宽度
old_height, old_width = get_image_size(image, input_data_format)
# 计算需要填充的高度和宽度
pad_height = (old_height // size + 1) * size - old_height
pad_width = (old_width // size + 1) * size - old_width
# 调用 pad 函数进行填充操作
return pad(
image,
((0, pad_height), (0, pad_width)), # 在高度和宽度两个维度上进行填充
mode="symmetric", # 使用对称模式进行填充
data_format=data_format, # 指定输出图像的通道维度格式
input_data_format=input_data_format, # 指定输入图像的通道维度格式
)
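# 示例(说明性草图,非本文件的原始内容):填充公式 (old // size + 1) * size - old 总是补齐到
# 下一个 size 的整数倍,即使尺寸已经能被 size 整除也会再补一整个 size:
# size = 8 时,高度 125 -> 填充 3 得到 128;高度 128 -> 填充 8 得到 136。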
def preprocess(
self,
images: ImageInput,
do_rescale: Optional[bool] = None,
rescale_factor: Optional[float] = None,
do_pad: Optional[bool] = None,
pad_size: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
**kwargs,
.\models\swin2sr\modeling_swin2sr.py
# 设置文件编码为 UTF-8
# 版权声明,版权归 Microsoft Research 和 HuggingFace Inc. 团队所有
#
# 根据 Apache 许可证 2.0 版本使用本文件
# 除非符合许可证规定,否则不得使用本文件
# 您可以在以下网址获取许可证的副本:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,本软件是基于“原样”提供的,不提供任何明示或暗示的保证或条件
# 请参阅许可证获取更多信息
""" PyTorch Swin2SR Transformer model."""
# 导入必要的库和模块
import collections.abc # 导入 collections.abc 模块
import math # 导入 math 模块
from dataclasses import dataclass # 从 dataclasses 模块导入 dataclass 装饰器
from typing import Optional, Tuple, Union # 导入类型提示的相关类和类型
import torch # 导入 PyTorch 库
import torch.utils.checkpoint # 导入 PyTorch 的 checkpoint 工具
from torch import nn # 从 PyTorch 导入 nn 模块
# 导入模型相关的子模块和函数
from ...activations import ACT2FN # 从 ...activations 模块导入 ACT2FN 函数
from ...modeling_outputs import BaseModelOutput, ImageSuperResolutionOutput # 从 ...modeling_outputs 模块导入输出类
from ...modeling_utils import PreTrainedModel # 从 ...modeling_utils 模块导入预训练模型相关类
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer # 从 ...pytorch_utils 导入相关工具函数
from ...utils import (
ModelOutput, # 从 ...utils 模块导入 ModelOutput 类
add_code_sample_docstrings, # 从 ...utils 模块导入相关函数和类
add_start_docstrings, # 从 ...utils 模块导入相关函数和类
add_start_docstrings_to_model_forward, # 从 ...utils 模块导入相关函数和类
logging, # 从 ...utils 模块导入 logging 模块
replace_return_docstrings, # 从 ...utils 模块导入相关函数
)
# 导入 Swin2SR 的配置类
from .configuration_swin2sr import Swin2SRConfig # 从当前目录下的 configuration_swin2sr 模块导入 Swin2SRConfig 类
# 获取日志记录器
logger = logging.get_logger(__name__)
# 用于文档的一般信息
_CONFIG_FOR_DOC = "Swin2SRConfig"
# 用于文档的基本检查点信息
_CHECKPOINT_FOR_DOC = "caidas/swin2SR-classical-sr-x2-64"
# 预期的输出形状
_EXPECTED_OUTPUT_SHAPE = [1, 180, 488, 648]
# Swin2SR 预训练模型存档列表
SWIN2SR_PRETRAINED_MODEL_ARCHIVE_LIST = [
"caidas/swin2SR-classical-sr-x2-64",
# 查看所有 Swin2SR 模型,请访问 https://huggingface.co/models?filter=swin2sr
]
@dataclass
class Swin2SREncoderOutput(ModelOutput):
"""
Swin2SR 编码器的输出,可能包含隐藏状态和注意力权重。
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层的隐藏状态序列输出。
hidden_states (`tuple(torch.FloatTensor)`, *可选*, 当 `output_hidden_states=True` 传递或当 `config.output_hidden_states=True` 时返回):
模型每层的隐藏状态的元组,包括初始嵌入的输出。
模型每层的隐藏状态以及初始嵌入的输出。
attentions (`tuple(torch.FloatTensor)`, *可选*, 当 `output_attentions=True` 传递或当 `config.output_attentions=True` 时返回):
模型每阶段的注意力权重的元组。
注意力 softmax 后的注意力权重,用于计算自注意力头中的加权平均值。
"""
# 声明一个变量 last_hidden_state,类型为 torch.FloatTensor,初始值为 None
last_hidden_state: torch.FloatTensor = None
# 声明一个变量 hidden_states,类型为可选的元组,元素类型为 torch.FloatTensor
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
# 声明一个变量 attentions,类型为可选的元组,元素类型为 torch.FloatTensor
attentions: Optional[Tuple[torch.FloatTensor]] = None
# Copied from transformers.models.swin.modeling_swin.window_partition
def window_partition(input_feature, window_size):
"""
Partitions the given input into windows.
"""
# 获取输入特征的尺寸信息:批量大小、高度、宽度、通道数
batch_size, height, width, num_channels = input_feature.shape
# 将输入特征按窗口大小进行划分,重塑为新的形状
input_feature = input_feature.view(
batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
)
# 对划分后的窗口进行重新排序,以便后续处理
windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
return windows
# Copied from transformers.models.swin.modeling_swin.window_reverse
def window_reverse(windows, window_size, height, width):
"""
Merges windows to produce higher resolution features.
"""
# 确定窗口的通道数
num_channels = windows.shape[-1]
# 将窗口合并为更高分辨率的特征
windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
# 对合并后的特征进行重新排序,以符合原始输入的形状
windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
return windows
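# 示例(说明性草图,非本文件的原始内容):window_partition 把 (batch, H, W, C) 的特征划分为
# 互不重叠的窗口,window_reverse 再把窗口还原回原始分辨率,二者互为逆操作。
import torch

feature = torch.randn(1, 224, 224, 96)
windows = window_partition(feature, window_size=7)
print(windows.shape)  # torch.Size([1024, 7, 7, 96]),其中 1024 = (224 // 7) ** 2
restored = window_reverse(windows, 7, 224, 224)
print(torch.equal(restored, feature))  # True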
# Copied from transformers.models.beit.modeling_beit.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
# 如果 drop_prob 为 0 或不处于训练模式,则直接返回输入
if drop_prob == 0.0 or not training:
return input
# 计算保留的概率
keep_prob = 1 - drop_prob
# 创建一个与输入形状相同的随机张量
shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_() # 将随机张量二值化
# 应用 drop path 操作,并返回处理后的输出
output = input.div(keep_prob) * random_tensor
return output
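# 示例(说明性草图,非本文件的原始内容):drop_path 在训练时以 drop_prob 的概率将整个样本的
# 残差分支置零,并用 1/keep_prob 放大被保留的样本以保持期望不变;推理时等价于恒等映射。
import torch

x = torch.ones(4, 3)
print(drop_path(x, drop_prob=0.5, training=False))  # 与输入相同
print(drop_path(x, drop_prob=0.5, training=True))   # 每个样本整体为 0.0 或 2.0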
# Copied from transformers.models.swin.modeling_swin.SwinDropPath with Swin->Swin2SR
class Swin2SRDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# 调用 drop_path 函数来执行 drop path 操作
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
class Swin2SREmbeddings(nn.Module):
"""
Construct the patch and optional position embeddings.
"""
# 初始化函数,接受一个配置参数config
def __init__(self, config):
# 调用父类的初始化方法
super().__init__()
# 使用配置参数初始化Swin2SRPatchEmbeddings对象,赋值给self.patch_embeddings
self.patch_embeddings = Swin2SRPatchEmbeddings(config)
# 获取patch数目,用于后续位置编码的初始化
num_patches = self.patch_embeddings.num_patches
# 根据配置决定是否创建位置编码的参数
if config.use_absolute_embeddings:
# 创建一个形状为(1, num_patches + 1, config.embed_dim)的可学习参数,初始值为全零
self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim))
else:
# 如果不使用绝对位置编码,则置为None
self.position_embeddings = None
# 初始化一个dropout层,使用给定的dropout概率
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# 保存配置中的窗口大小参数
self.window_size = config.window_size
# 前向传播函数,接受一个可选的torch.FloatTensor类型的像素值作为输入,返回一个torch.Tensor类型的元组
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor]:
# 调用patch_embeddings对象处理输入像素值,返回嵌入张量和输出维度信息
embeddings, output_dimensions = self.patch_embeddings(pixel_values)
# 如果位置编码参数不为None,则将嵌入张量和位置编码相加
if self.position_embeddings is not None:
embeddings = embeddings + self.position_embeddings
# 对嵌入张量应用dropout操作
embeddings = self.dropout(embeddings)
# 返回处理后的嵌入张量和输出维度信息的元组
return embeddings, output_dimensions
class Swin2SRPatchEmbeddings(nn.Module):
# Swin2SRPatchEmbeddings 类的定义,继承自 nn.Module
def __init__(self, config, normalize_patches=True):
super().__init__()
# 初始化函数,接收配置参数和是否标准化补丁的标志
num_channels = config.embed_dim
# 从配置中获取嵌入维度
image_size, patch_size = config.image_size, config.patch_size
# 从配置中获取图像尺寸和补丁尺寸
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
# 确保图像尺寸和补丁尺寸是可迭代对象,如果不是,则转换为元组形式
patches_resolution = [image_size[0] // patch_size[0], image_size[1] // patch_size[1]]
# 计算补丁的分辨率,即图像被划分成的补丁数目
self.patches_resolution = patches_resolution
self.num_patches = patches_resolution[0] * patches_resolution[1]
# 设置补丁的分辨率和补丁的总数
self.projection = nn.Conv2d(num_channels, config.embed_dim, kernel_size=patch_size, stride=patch_size)
# 使用卷积层进行投影,将输入的通道数转换为嵌入维度,卷积核大小为补丁大小,步长为补丁大小
self.layernorm = nn.LayerNorm(config.embed_dim) if normalize_patches else None
# 如果需要对补丁进行标准化,则使用 LayerNorm 进行处理,否则设为 None
def forward(self, embeddings: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
# 前向传播函数,接收嵌入向量作为输入,返回嵌入后的张量和输出维度的元组
embeddings = self.projection(embeddings)
# 使用定义的投影层对输入的嵌入向量进行投影变换
_, _, height, width = embeddings.shape
# 获取投影后张量的高度和宽度信息
output_dimensions = (height, width)
# 记录输出的高度和宽度信息
embeddings = embeddings.flatten(2).transpose(1, 2)
# 将投影后的张量按照第三维度展平,然后进行转置操作
if self.layernorm is not None:
embeddings = self.layernorm(embeddings)
# 如果定义了 LayerNorm 层,则对嵌入向量进行标准化处理
return embeddings, output_dimensions
# 返回处理后的嵌入向量和输出的尺寸信息
class Swin2SRPatchUnEmbeddings(nn.Module):
# Swin2SRPatchUnEmbeddings 类的定义,继承自 nn.Module
r"""Image to Patch Unembedding"""
def __init__(self, config):
super().__init__()
# 初始化函数,接收配置参数
self.embed_dim = config.embed_dim
# 设置嵌入维度为配置中指定的值
def forward(self, embeddings, x_size):
# 前向传播函数,接收嵌入向量和图像尺寸作为输入
batch_size, height_width, num_channels = embeddings.shape
# 获取输入嵌入向量的批量大小、高度宽度乘积以及通道数
embeddings = embeddings.transpose(1, 2).view(batch_size, self.embed_dim, x_size[0], x_size[1]) # B Ph*Pw C
# 将嵌入向量进行转置和视图变换,以重构原始图像尺寸
return embeddings
# 返回重构后的嵌入向量
# Copied from transformers.models.swinv2.modeling_swinv2.Swinv2PatchMerging with Swinv2->Swin2SR
class Swin2SRPatchMerging(nn.Module):
# Swin2SRPatchMerging 类的定义,继承自 nn.Module
"""
Patch Merging Layer.
Args:
input_resolution (`Tuple[int]`):
Resolution of input feature.
dim (`int`):
Number of input channels.
norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
Normalization layer class.
"""
def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
super().__init__()
# 初始化函数,接收输入特征的分辨率、输入通道数和可选的标准化层
self.input_resolution = input_resolution
# 设置输入特征的分辨率属性
self.dim = dim
# 设置输入通道数属性
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
# 使用线性层进行维度减少,从 4*dim 到 2*dim,无偏置项
self.norm = norm_layer(2 * dim)
# 使用指定的标准化层对输出进行标准化处理
def maybe_pad(self, input_feature, height, width):
# 辅助函数,可能对输入特征进行填充,使得其高度和宽度为偶数
should_pad = (height % 2 == 1) or (width % 2 == 1)
# 检查输入特征的高度或宽度是否为奇数
if should_pad:
pad_values = (0, 0, 0, width % 2, 0, height % 2)
# 计算需要填充的值
input_feature = nn.functional.pad(input_feature, pad_values)
# 使用 PyTorch 的函数进行填充操作
return input_feature
# 返回填充后的输入特征
def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
# 解包输入维度元组
height, width = input_dimensions
# `dim` 是输入特征的维度,即 height * width
batch_size, dim, num_channels = input_feature.shape
# 将输入特征重新视图化为四维张量 [batch_size, height, width, num_channels]
input_feature = input_feature.view(batch_size, height, width, num_channels)
# 如果需要,对输入进行填充使其可以被 width 和 height 整除
input_feature = self.maybe_pad(input_feature, height, width)
# 提取四个子块,每个块大小为 [batch_size, height/2, width/2, num_channels]
input_feature_0 = input_feature[:, 0::2, 0::2, :]
input_feature_1 = input_feature[:, 1::2, 0::2, :]
input_feature_2 = input_feature[:, 0::2, 1::2, :]
input_feature_3 = input_feature[:, 1::2, 1::2, :]
# 将四个子块沿最后一个维度拼接,形成新的特征张量 [batch_size, height/2, width/2, 4*num_channels]
input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
# 将特征张量重新视图化为 [batch_size, height/2 * width/2, 4*num_channels]
input_feature = input_feature.view(batch_size, -1, 4 * num_channels)
# 使用 reduction 方法对特征张量进行降维处理
input_feature = self.reduction(input_feature)
# 使用 norm 方法对降维后的特征张量进行归一化处理
input_feature = self.norm(input_feature)
# 返回处理后的特征张量作为输出
return input_feature
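# 示例(说明性草图,非本文件的原始内容):PatchMerging 把相邻 2x2 的补丁拼接后再线性降维,
# 序列长度缩小为 1/4,通道数翻倍:(B, H*W, C) -> (B, H/2 * W/2, 2C)。
import torch

merge = Swin2SRPatchMerging(input_resolution=(8, 8), dim=96)
x = torch.randn(2, 8 * 8, 96)
print(merge(x, (8, 8)).shape)  # torch.Size([2, 16, 192])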
# 从transformers.models.swinv2.modeling_swinv2.Swinv2SelfAttention复制而来,将Swinv2改为Swin2SR
class Swin2SRSelfAttention(nn.Module):
# 将输入张量x重新形状以用于注意力分数计算
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
# 自注意力机制的前向传播
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]: # 函数声明,返回类型为包含单个张量的元组
batch_size, dim, num_channels = hidden_states.shape # 获取隐藏状态张量的形状信息
mixed_query_layer = self.query(hidden_states) # 使用 query 网络对隐藏状态进行处理得到混合查询层
key_layer = self.transpose_for_scores(self.key(hidden_states)) # 使用 key 网络对隐藏状态进行处理,然后转置以用于注意力计算
value_layer = self.transpose_for_scores(self.value(hidden_states)) # 使用 value 网络对隐藏状态进行处理,然后转置以用于注意力计算
query_layer = self.transpose_for_scores(mixed_query_layer) # 对混合查询层进行转置以用于注意力计算
# cosine attention
attention_scores = nn.functional.normalize(query_layer, dim=-1) @ nn.functional.normalize(
key_layer, dim=-1
).transpose(-2, -1) # 计算注意力分数,使用余弦相似度进行归一化,然后进行乘积计算
logit_scale = torch.clamp(self.logit_scale, max=math.log(1.0 / 0.01)).exp() # 限制并指数化对数缩放参数
attention_scores = attention_scores * logit_scale # 缩放注意力分数
relative_position_bias_table = self.continuous_position_bias_mlp(self.relative_coords_table).view(
-1, self.num_attention_heads
) # 使用位置偏置 MLP 计算连续位置偏置表,并进行形状重塑
# [window_height*window_width,window_height*window_width,num_attention_heads]
relative_position_bias = relative_position_bias_table[self.relative_position_index.view(-1)].view(
self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
) # 根据相对位置索引选择相对位置偏置表中的偏置,并进行形状调整
# [num_attention_heads,window_height*window_width,window_height*window_width]
relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # 调整相对位置偏置的维度顺序
relative_position_bias = 16 * torch.sigmoid(relative_position_bias) # 对相对位置偏置进行 sigmoid 处理并乘以常数 16
attention_scores = attention_scores + relative_position_bias.unsqueeze(0) # 添加相对位置偏置到注意力分数中
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in Swin2SRModel forward() function)
mask_shape = attention_mask.shape[0] # 获取注意力掩码的形状信息
attention_scores = attention_scores.view(
batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
) + attention_mask.unsqueeze(1).unsqueeze(0) # 将注意力分数调整为与掩码相匹配的形状,并应用掩码
attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0) # 再次应用掩码
attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim) # 调整注意力分数的形状
# Normalize the attention scores to probabilities.
attention_probs = nn.functional.softmax(attention_scores, dim=-1) # 对注意力分数进行 softmax 归一化
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs) # 使用 dropout 对注意力概率进行处理
# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask # 如果有头部掩码,则将其应用到注意力概率上
context_layer = torch.matmul(attention_probs, value_layer) # 使用注意力概率与值层进行加权求和得到上下文层
context_layer = context_layer.permute(0, 2, 1, 3).contiguous() # 调整上下文层的维度顺序
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) # 计算新的上下文层形状
context_layer = context_layer.view(new_context_layer_shape) # 根据计算的形状调整上下文层的形状
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) # 根据是否需要输出注意力分数来选择输出内容
return outputs # 返回上下文层和(如果需要)注意力分数
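# 示例(说明性草图,非本文件的原始内容):Swin v2 风格的余弦注意力先把 query/key 归一化到
# 单位范数再做点积,再乘以一个有上限(exp(log(1/0.01)) = 100)的可学习缩放因子,
# 以取代 Swin v1 中除以 sqrt(head_dim) 的做法。张量形状仅作演示。
import math
import torch
from torch import nn

q = torch.randn(2, 4, 49, 32)  # (batch*num_windows, num_heads, tokens, head_dim)
k = torch.randn(2, 4, 49, 32)
scores = nn.functional.normalize(q, dim=-1) @ nn.functional.normalize(k, dim=-1).transpose(-2, -1)
logit_scale = torch.clamp(torch.tensor(2.0), max=math.log(1.0 / 0.01)).exp()
scores = scores * logit_scale
print(scores.shape)  # torch.Size([2, 4, 49, 49])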
# 从 transformers.models.swin.modeling_swin.SwinSelfOutput 复制并修改为 Swin2SRSelfOutput 类
class Swin2SRSelfOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
# 创建一个线性层,输入和输出维度均为 dim
self.dense = nn.Linear(dim, dim)
# 创建一个 dropout 层,使用 config 中指定的 dropout 概率
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# 将输入的 hidden_states 传入 dense 线性层
hidden_states = self.dense(hidden_states)
# 将经过线性层的 hidden_states 应用 dropout
hidden_states = self.dropout(hidden_states)
return hidden_states
# 从 transformers.models.swinv2.modeling_swinv2.Swinv2Attention 复制并修改为 Swin2SRAttention 类
class Swin2SRAttention(nn.Module):
def __init__(self, config, dim, num_heads, window_size, pretrained_window_size=0):
super().__init__()
# 初始化 self 层,即 Swin2SRSelfAttention 对象
self.self = Swin2SRSelfAttention(
config=config,
dim=dim,
num_heads=num_heads,
window_size=window_size,
pretrained_window_size=pretrained_window_size
if isinstance(pretrained_window_size, collections.abc.Iterable)
else (pretrained_window_size, pretrained_window_size),
)
# 初始化 output 层,即 Swin2SRSelfOutput 对象
self.output = Swin2SRSelfOutput(config, dim)
# 初始化一个空集合,用于存储剪枝的注意力头信息
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
# 查找可剪枝的注意力头和其索引
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# 对线性层进行剪枝
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# 更新超参数并存储已剪枝的头信息
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# 执行自注意力机制,并获取 self_outputs
self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
# 将 self_outputs[0] 作为输入,hidden_states 作为辅助输入,传入 output 层
attention_output = self.output(self_outputs[0], hidden_states)
# 如果输出注意力信息,则将 attentions 添加到 outputs 中
outputs = (attention_output,) + self_outputs[1:] # 如果需要输出 attentions,则添加到 outputs 中
return outputs
# 从 transformers.models.swin.modeling_swin.SwinIntermediate 复制并修改为 Swin2SRIntermediate 类
class Swin2SRIntermediate(nn.Module):
# 初始化函数,用于创建一个新的神经网络层
def __init__(self, config, dim):
# 调用父类的初始化函数
super().__init__()
# 创建一个全连接层,将输入维度 dim 映射到 int(config.mlp_ratio * dim) 的输出维度
self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
# 根据配置选择隐藏层激活函数,如果配置中隐藏层激活函数是字符串,则从预定义的映射中选择对应的函数
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
# 否则直接使用配置中指定的隐藏层激活函数
self.intermediate_act_fn = config.hidden_act
# 前向传播函数,处理输入的隐藏状态张量并返回处理后的张量
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# 将输入的隐藏状态张量通过全连接层进行线性变换
hidden_states = self.dense(hidden_states)
# 将线性变换后的张量输入到中间激活函数中进行非线性变换
hidden_states = self.intermediate_act_fn(hidden_states)
# 返回变换后的张量作为输出
return hidden_states
# 从transformers.models.swin.modeling_swin.SwinOutput复制并将Swin改为Swin2SR
class Swin2SROutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
# 创建一个线性层,将输入维度乘以config.mlp_ratio,输出维度为dim
self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
# 创建一个Dropout层,以config.hidden_dropout_prob的概率丢弃神经元
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# 前向传播函数,首先通过线性层处理隐藏状态
hidden_states = self.dense(hidden_states)
# 然后对处理后的状态应用Dropout操作
hidden_states = self.dropout(hidden_states)
# 返回处理后的隐藏状态
return hidden_states
# 从transformers.models.swinv2.modeling_swinv2.Swinv2Layer复制并将Swinv2改为Swin2SR
class Swin2SRLayer(nn.Module):
def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretrained_window_size=0):
super().__init__()
# 设置输入分辨率
self.input_resolution = input_resolution
# 计算窗口大小和移动尺寸
window_size, shift_size = self._compute_window_shift(
(config.window_size, config.window_size), (shift_size, shift_size)
)
# 选择第一个维度的窗口大小和移动尺寸
self.window_size = window_size[0]
self.shift_size = shift_size[0]
# 创建Swin2SRAttention层,传入config、dim、num_heads、window_size和pretrained_window_size参数
self.attention = Swin2SRAttention(
config=config,
dim=dim,
num_heads=num_heads,
window_size=self.window_size,
pretrained_window_size=pretrained_window_size
if isinstance(pretrained_window_size, collections.abc.Iterable)
else (pretrained_window_size, pretrained_window_size),
)
# 创建LayerNorm层,归一化dim维度的输入,eps为config.layer_norm_eps
self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
# 创建Swin2SRDropPath层,如果config.drop_path_rate大于0.0则应用DropPath,否则为恒等映射
self.drop_path = Swin2SRDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
# 创建Swin2SRIntermediate层,处理输入为config和dim的中间层
self.intermediate = Swin2SRIntermediate(config, dim)
# 创建Swin2SROutput层,处理输入为config和dim的输出层
self.output = Swin2SROutput(config, dim)
# 创建LayerNorm层,归一化dim维度的输出,eps为config.layer_norm_eps
self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
def _compute_window_shift(self, target_window_size, target_shift_size) -> Tuple[Tuple[int, int], Tuple[int, int]]:
# 计算窗口大小和移动尺寸的函数,返回目标窗口大小和目标移动尺寸
window_size = [r if r <= w else w for r, w in zip(self.input_resolution, target_window_size)]
shift_size = [0 if r <= w else s for r, w, s in zip(self.input_resolution, window_size, target_shift_size)]
return window_size, shift_size
# 根据窗口移动大小生成注意力掩码,用于移位窗口的多头自注意力
def get_attn_mask(self, height, width, dtype):
if self.shift_size > 0:
# 创建一个全零的张量作为图像的注意力掩码
img_mask = torch.zeros((1, height, width, 1), dtype=dtype)
# 定义高度和宽度的切片,用于生成多个窗口
height_slices = (
slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None),
)
width_slices = (
slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None),
)
count = 0
# 在图像的每个窗口位置设置对应的编号
for height_slice in height_slices:
for width_slice in width_slices:
img_mask[:, height_slice, width_slice, :] = count
count += 1
# 将图像分块,每个块的大小为窗口大小乘以窗口大小
mask_windows = window_partition(img_mask, self.window_size)
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
# 创建注意力掩码,表示不同窗口之间的相对位置
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
# 使用特定值填充掩码,0位置用0.0填充,非0位置用-100.0填充
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
else:
# 如果不需要移位,返回空的注意力掩码
attn_mask = None
return attn_mask
# 在需要时对隐藏状态进行填充,以适应窗口大小的整数倍
def maybe_pad(self, hidden_states, height, width):
# 计算需要右侧和底部填充的像素数,使其可以被窗口大小整除
pad_right = (self.window_size - width % self.window_size) % self.window_size
pad_bottom = (self.window_size - height % self.window_size) % self.window_size
# 定义填充值的元组:按 (最后一维前, 最后一维后, 倒数第二维前, 倒数第二维后, ...) 的顺序,
# 即通道维不填充,宽度右侧填充 pad_right,高度底部填充 pad_bottom
pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
# 对隐藏状态进行填充操作
hidden_states = nn.functional.pad(hidden_states, pad_values)
return hidden_states, pad_values
def forward(
self,
hidden_states: torch.Tensor,
input_dimensions: Tuple[int, int],
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
# 定义函数签名,指定输入和输出类型为 torch.Tensor 的元组
height, width = input_dimensions
# 解包输入维度
batch_size, _, channels = hidden_states.size()
# 获取隐藏状态的批量大小、高度、宽度和通道数
shortcut = hidden_states
# 保存隐藏状态的快捷方式
# pad hidden_states to multiples of window size
# 将隐藏状态填充到窗口大小的倍数
hidden_states = hidden_states.view(batch_size, height, width, channels)
# 调整隐藏状态的形状为 [batch_size, height, width, channels]
hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
# 调用 maybe_pad 方法,可能对隐藏状态进行填充,同时获取填充值
_, height_pad, width_pad, _ = hidden_states.shape
# 解包填充后的隐藏状态的形状
# cyclic shift
# 循环移位操作
if self.shift_size > 0:
# 如果 shift_size 大于 0
shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
# 在维度 (1, 2) 上对隐藏状态进行负向移位操作
else:
shifted_hidden_states = hidden_states
# 否则,不进行移位操作,保持隐藏状态不变
# partition windows
# 划分窗口
hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
# 调用 window_partition 方法,将移位后的隐藏状态划分为窗口
hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
# 将划分后的窗口重新视图为 [batch_size * num_windows, window_size * window_size, channels]
attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype)
# 调用 get_attn_mask 方法,获取注意力掩码
if attn_mask is not None:
attn_mask = attn_mask.to(hidden_states_windows.device)
# 如果注意力掩码不为空,则将其移到与 hidden_states_windows 相同的设备上
attention_outputs = self.attention(
hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
)
# 调用 attention 方法,进行注意力计算
attention_output = attention_outputs[0]
# 获取注意力输出的第一个元素
attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
# 将注意力输出重新视图为 [batch_size * num_windows, window_size, window_size, channels]
shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)
# 调用 window_reverse 方法,逆转注意力窗口
# reverse cyclic shift
# 逆转循环移位
if self.shift_size > 0:
# 如果 shift_size 大于 0
attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
# 在维度 (1, 2) 上对注意力窗口进行正向移位操作
else:
attention_windows = shifted_windows
# 否则,不进行移位操作,保持注意力窗口不变
was_padded = pad_values[3] > 0 or pad_values[5] > 0
# 判断是否进行了填充
if was_padded:
attention_windows = attention_windows[:, :height, :width, :].contiguous()
# 如果进行了填充,则截取注意力窗口的有效部分
attention_windows = attention_windows.view(batch_size, height * width, channels)
# 将注意力窗口重新视图为 [batch_size, height * width, channels]
hidden_states = self.layernorm_before(attention_windows)
# 调用 layernorm_before 方法,对注意力窗口进行层归一化处理
hidden_states = shortcut + self.drop_path(hidden_states)
# 将快捷方式与经过 drop_path 处理后的隐藏状态相加
layer_output = self.intermediate(hidden_states)
# 调用 intermediate 方法,生成中间层输出
layer_output = self.output(layer_output)
# 调用 output 方法,生成输出层输出
layer_output = hidden_states + self.drop_path(self.layernorm_after(layer_output))
# 将隐藏状态与经过 drop_path 和层归一化后的输出相加
layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
# 如果需要输出注意力,则返回包含注意力输出的元组,否则只返回输出层输出的元组
return layer_outputs
# 返回层输出的元组
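为了更直观地理解上面 forward 中“循环移位 → 窗口划分”这一步,下面给出一个简短的独立示例(非库源码,张量尺寸均为演示用的假设值):
import torch
def demo_window_partition(x, window_size):
    # 与上文使用的 window_partition 思路一致:把 (batch, height, width, channels) 切成不重叠窗口
    batch_size, height, width, channels = x.shape
    x = x.view(batch_size, height // window_size, window_size, width // window_size, window_size, channels)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, channels)
hidden = torch.arange(16, dtype=torch.float32).view(1, 4, 4, 1)   # 4x4 的特征图,1 个通道
shifted = torch.roll(hidden, shifts=(-2, -2), dims=(1, 2))         # shift_size=2 时的循环移位
windows = demo_window_partition(shifted, window_size=2)            # 划分为 2x2 的窗口
print(windows.shape)                                               # torch.Size([4, 2, 2, 1]),共 4 个窗口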
class Swin2SRStage(nn.Module):
"""
This corresponds to the Residual Swin Transformer Block (RSTB) in the original implementation.
"""
def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, pretrained_window_size=0):
super().__init__()
self.config = config # 初始化模型配置参数
self.dim = dim # 初始化模型维度参数
# 创建包含多个Swin2SRLayer层的ModuleList
self.layers = nn.ModuleList(
[
Swin2SRLayer(
config=config,
dim=dim,
input_resolution=input_resolution,
num_heads=num_heads,
shift_size=0 if (i % 2 == 0) else config.window_size // 2,
pretrained_window_size=pretrained_window_size,
)
for i in range(depth)
]
)
# 根据配置参数选择不同的残差连接方式
if config.resi_connection == "1conv":
self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
elif config.resi_connection == "3conv":
# 采用序列化方式创建多层卷积神经网络
self.conv = nn.Sequential(
nn.Conv2d(dim, dim // 4, 3, 1, 1),
nn.LeakyReLU(negative_slope=0.2, inplace=True),
nn.Conv2d(dim // 4, dim // 4, 1, 1, 0),
nn.LeakyReLU(negative_slope=0.2, inplace=True),
nn.Conv2d(dim // 4, dim, 3, 1, 1),
)
# 创建Swin2SRPatchEmbeddings对象
self.patch_embed = Swin2SRPatchEmbeddings(config, normalize_patches=False)
# 创建Swin2SRPatchUnEmbeddings对象
self.patch_unembed = Swin2SRPatchUnEmbeddings(config)
def forward(
self,
hidden_states: torch.Tensor,
input_dimensions: Tuple[int, int],
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
residual = hidden_states # 保存输入的隐藏状态作为残差
height, width = input_dimensions # 获取输入图像的高度和宽度
for i, layer_module in enumerate(self.layers):
layer_head_mask = head_mask[i] if head_mask is not None else None
# 调用Swin2SRLayer的forward方法进行前向传播
layer_outputs = layer_module(hidden_states, input_dimensions, layer_head_mask, output_attentions)
hidden_states = layer_outputs[0] # 更新隐藏状态输出
output_dimensions = (height, width, height, width) # 设置输出的图像维度
hidden_states = self.patch_unembed(hidden_states, input_dimensions) # 反向解嵌入处理
hidden_states = self.conv(hidden_states) # 应用卷积层处理隐藏状态
hidden_states, _ = self.patch_embed(hidden_states) # 应用图像嵌入处理
hidden_states = hidden_states + residual # 加上残差连接
stage_outputs = (hidden_states, output_dimensions) # 定义阶段输出结果
if output_attentions:
stage_outputs += layer_outputs[1:] # 如果需要输出注意力,将其添加到输出结果中
return stage_outputs # 返回阶段输出结果
# Swin2SR 编码器:由若干 Swin2SRStage 堆叠组成
class Swin2SREncoder(nn.Module):
# 初始化函数,接受配置对象和网格大小作为参数
def __init__(self, config, grid_size):
# 调用父类初始化方法
super().__init__()
# 计算阶段数量,即深度列表的长度
self.num_stages = len(config.depths)
# 保存配置对象
self.config = config
# 计算丢弃路径率数组,根据配置的丢弃路径率和各个阶段的深度
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
# 创建阶段列表,每个阶段是一个Swin2SRStage模块
self.stages = nn.ModuleList(
[
Swin2SRStage(
config=config,
dim=config.embed_dim,
input_resolution=(grid_size[0], grid_size[1]),
depth=config.depths[stage_idx],
num_heads=config.num_heads[stage_idx],
drop_path=dpr[sum(config.depths[:stage_idx]) : sum(config.depths[: stage_idx + 1])],
pretrained_window_size=0,
)
for stage_idx in range(self.num_stages)
]
)
# 是否启用梯度检查点,默认为False
self.gradient_checkpointing = False
# 前向传播函数
def forward(
self,
hidden_states: torch.Tensor,
input_dimensions: Tuple[int, int],
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple, Swin2SREncoderOutput]:
# 初始化所有输入尺寸为空元组
all_input_dimensions = ()
# 如果需要输出隐藏状态,则初始化为空元组
all_hidden_states = () if output_hidden_states else None
# 如果需要输出注意力权重,则初始化为空元组
all_self_attentions = () if output_attentions else None
# 如果需要输出隐藏状态,则添加当前隐藏状态到all_hidden_states中
if output_hidden_states:
all_hidden_states += (hidden_states,)
# 遍历所有阶段
for i, stage_module in enumerate(self.stages):
# 获取当前阶段的头部掩码
layer_head_mask = head_mask[i] if head_mask is not None else None
# 如果启用了梯度检查点并且正在训练阶段,则使用梯度检查点函数
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
stage_module.__call__, hidden_states, input_dimensions, layer_head_mask, output_attentions
)
else:
# 否则,直接调用阶段模块进行前向传播
layer_outputs = stage_module(hidden_states, input_dimensions, layer_head_mask, output_attentions)
# 更新隐藏状态为当前层的输出的第一个元素
hidden_states = layer_outputs[0]
# 更新输入尺寸为当前层输出的维度
output_dimensions = layer_outputs[1]
input_dimensions = (output_dimensions[-2], output_dimensions[-1])
# 将当前层的输出维度添加到所有输入尺寸中
all_input_dimensions += (input_dimensions,)
# 如果需要输出隐藏状态,则添加当前隐藏状态到all_hidden_states中
if output_hidden_states:
all_hidden_states += (hidden_states,)
# 如果需要输出注意力权重,则将当前层的注意力权重添加到all_self_attentions中
if output_attentions:
all_self_attentions += layer_outputs[2:]
# 如果不需要返回字典形式的输出,则返回隐藏状态、所有隐藏状态和所有注意力权重
if not return_dict:
return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
# 否则,返回Swin2SREncoderOutput对象,包含最终隐藏状态、所有隐藏状态和所有注意力权重
return Swin2SREncoderOutput(
last_hidden_state=hidden_states,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
)
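上面 __init__ 中 drop path 率是用 torch.linspace 在所有层之间线性分配、再按各阶段深度切片的。下面是一个简短示例(非库源码,配置数值为假设值),打印每个阶段分到的丢弃率:
import torch
drop_path_rate = 0.1
depths = [2, 2, 6, 2]
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
for stage_idx in range(len(depths)):
    stage_dpr = dpr[sum(depths[:stage_idx]) : sum(depths[: stage_idx + 1])]
    print(stage_idx, [round(r, 3) for r in stage_dpr])
# 丢弃率从第一层的 0 线性增加到最后一层的 0.1,越深的层丢弃概率越大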
class Swin2SRPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# 指定配置类
config_class = Swin2SRConfig
# 基础模型前缀
base_model_prefix = "swin2sr"
# 主输入名称
main_input_name = "pixel_values"
# 支持梯度检查点
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
# 初始化模块的权重
if isinstance(module, (nn.Linear, nn.Conv2d)):
torch.nn.init.trunc_normal_(module.weight.data, std=self.config.initializer_range)
# 如果存在偏置,将其初始化为零
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.LayerNorm):
# 初始化 LayerNorm 的偏置为零,权重为1
module.bias.data.zero_()
module.weight.data.fill_(1.0)
# Swin2SRModel 类的文档字符串
SWIN2SR_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`Swin2SRConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# Swin2SRModel 类的输入文档字符串
SWIN2SR_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
[`Swin2SRImageProcessor.__call__`] for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Swin2SR Model transformer outputting raw hidden-states without any specific head on top.",
SWIN2SR_START_DOCSTRING,
)
# Swin2SRModel 类,继承自 Swin2SRPreTrainedModel,用于构建模型
class Swin2SRModel(Swin2SRPreTrainedModel):
def __init__(self, config):
# 调用父类构造函数初始化对象
super().__init__(config)
# 保存配置信息到对象属性
self.config = config
# 根据配置信息设置均值张量
if config.num_channels == 3 and config.num_channels_out == 3:
rgb_mean = (0.4488, 0.4371, 0.4040)
self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
else:
self.mean = torch.zeros(1, 1, 1, 1)
self.img_range = config.img_range
# 创建第一个卷积层
self.first_convolution = nn.Conv2d(config.num_channels, config.embed_dim, 3, 1, 1)
# 创建嵌入层
self.embeddings = Swin2SREmbeddings(config)
# 创建编码器
self.encoder = Swin2SREncoder(config, grid_size=self.embeddings.patch_embeddings.patches_resolution)
# 创建层归一化层
self.layernorm = nn.LayerNorm(config.embed_dim, eps=config.layer_norm_eps)
# 创建补丁解嵌入层
self.patch_unembed = Swin2SRPatchUnEmbeddings(config)
# 创建主体后的卷积层
self.conv_after_body = nn.Conv2d(config.embed_dim, config.embed_dim, 3, 1, 1)
# 调用后初始化方法,初始化权重并进行最终处理
self.post_init()
def get_input_embeddings(self):
# 返回嵌入层的补丁嵌入对象
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# 遍历要修剪的头信息,对编码器中对应层的自注意力机制进行修剪
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def pad_and_normalize(self, pixel_values):
_, _, height, width = pixel_values.size()
# 1. 执行填充操作
window_size = self.config.window_size
modulo_pad_height = (window_size - height % window_size) % window_size
modulo_pad_width = (window_size - width % window_size) % window_size
pixel_values = nn.functional.pad(pixel_values, (0, modulo_pad_width, 0, modulo_pad_height), "reflect")
# 2. 执行归一化操作
self.mean = self.mean.type_as(pixel_values)
pixel_values = (pixel_values - self.mean) * self.img_range
return pixel_values
@add_start_docstrings_to_model_forward(SWIN2SR_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: torch.FloatTensor,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
# 如果没有显式指定,根据配置决定是否输出注意力权重
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# 如果没有显式指定,根据配置决定是否输出隐藏状态
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# 如果没有显式指定,根据配置决定是否使用返回字典形式
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 准备头部掩码(如果需要)
# 在头部掩码中,1.0 表示保留该头部
# attention_probs 的形状为 bsz x n_heads x N x N
# 输入的 head_mask 形状为 [num_heads] 或 [num_hidden_layers x num_heads]
# head_mask 被转换为形状 [num_hidden_layers x batch x num_heads x seq_length x seq_length]
head_mask = self.get_head_mask(head_mask, len(self.config.depths))
_, _, height, width = pixel_values.shape
# 一些预处理:填充 + 归一化
pixel_values = self.pad_and_normalize(pixel_values)
# 第一个卷积层处理像素值
embeddings = self.first_convolution(pixel_values)
# 将卷积后的结果传递给嵌入层处理,同时获取输入维度信息
embedding_output, input_dimensions = self.embeddings(embeddings)
# 编码器处理嵌入输出
encoder_outputs = self.encoder(
embedding_output,
input_dimensions,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 取出编码器的输出的第一个元素作为序列输出
sequence_output = encoder_outputs[0]
# 序列输出经过 LayerNormalization 处理
sequence_output = self.layernorm(sequence_output)
# 将序列输出重新映射到原始尺寸上
sequence_output = self.patch_unembed(sequence_output, (height, width))
# 经过主体后的卷积操作,加上初始的嵌入值
sequence_output = self.conv_after_body(sequence_output) + embeddings
# 如果不使用返回字典形式,则输出为包含序列输出和其他编码器输出的元组
if not return_dict:
output = (sequence_output,) + encoder_outputs[1:]
return output
# 如果使用返回字典形式,则构造 BaseModelOutput 对象返回
return BaseModelOutput(
last_hidden_state=sequence_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
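下面用一个简短示例(非库源码,window_size、img_range 与输入尺寸均为假设值)演示 pad_and_normalize 的两个步骤:反射填充到窗口大小的整数倍,再减去 RGB 均值并按 img_range 缩放:
import torch
import torch.nn.functional as F
window_size = 8
pixel_values = torch.rand(1, 3, 30, 45)                        # 高 30、宽 45,都不是 8 的倍数
_, _, height, width = pixel_values.shape
pad_h = (window_size - height % window_size) % window_size     # 2
pad_w = (window_size - width % window_size) % window_size      # 3
pixel_values = F.pad(pixel_values, (0, pad_w, 0, pad_h), "reflect")
print(pixel_values.shape)                                      # torch.Size([1, 3, 32, 48])
mean = torch.tensor([0.4488, 0.4371, 0.4040]).view(1, 3, 1, 1)
img_range = 1.0
normalized = (pixel_values - mean) * img_range                 # 减去 RGB 均值后按 img_range 缩放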
class PixelShuffleUpsampler(nn.Module):
"""PixelShuffleUpsampler module.
This module performs upsampling using PixelShuffle.
Args:
config (`object`):
Configuration object containing parameters.
num_features (`int`):
Number of intermediate features.
Attributes:
conv_before_upsample (`nn.Conv2d`):
Convolutional layer before upsampling.
activation (`nn.LeakyReLU`):
LeakyReLU activation function.
upsample (`Upsample`):
Upsample module.
final_convolution (`nn.Conv2d`):
Final convolutional layer.
"""
def __init__(self, config, num_features):
super().__init__()
# Initialize convolution before upsampling
self.conv_before_upsample = nn.Conv2d(config.embed_dim, num_features, 3, 1, 1)
# Initialize activation function
self.activation = nn.LeakyReLU(inplace=True)
# Initialize upsampling module
self.upsample = Upsample(config.upscale, num_features)
# Initialize final convolutional layer
self.final_convolution = nn.Conv2d(num_features, config.num_channels_out, 3, 1, 1)
def forward(self, sequence_output):
# Apply convolution before upsampling
x = self.conv_before_upsample(sequence_output)
# Apply activation function
x = self.activation(x)
# Apply upsampling using the Upsample module
x = self.upsample(x)
# Apply final convolutional layer
x = self.final_convolution(x)
return x
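上面的 Upsample 模块没有出现在这一段代码中;它通常按“卷积把通道数扩大 scale^2 倍 + nn.PixelShuffle 把通道重排到空间维度”的方式实现。下面是一个简短示例(非库源码,基于这一假设)演示形状变化:
import torch
from torch import nn
scale = 2
num_features = 64
conv = nn.Conv2d(num_features, num_features * scale**2, 3, 1, 1)  # 通道数扩大 scale^2 倍
shuffle = nn.PixelShuffle(scale)                                   # 把通道重排到空间维度
x = torch.rand(1, num_features, 16, 16)
y = shuffle(conv(x))
print(y.shape)  # torch.Size([1, 64, 32, 32]):空间尺寸放大 2 倍,通道数回到 64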
class NearestConvUpsampler(nn.Module):
"""NearestConvUpsampler module.
This module performs upsampling using nearest-neighbor interpolation followed by convolution.
Args:
scale (`int`):
Scale factor for upsampling.
in_channels (`int`):
Number of input channels.
out_channels (`int`):
Number of output channels.
Attributes:
upsample (`nn.Upsample`):
Upsampling layer.
conv (`nn.Conv2d`):
Convolutional layer.
"""
def __init__(self, config, num_features):
super().__init__()
# 检查是否需要进行4倍上采样,否则抛出数值错误异常
if config.upscale != 4:
raise ValueError("The nearest+conv upsampler only supports an upscale factor of 4 at the moment.")
# 第一层卷积,将输入特征维度转换为num_features,卷积核大小为3x3,填充为1
self.conv_before_upsample = nn.Conv2d(config.embed_dim, num_features, 3, 1, 1)
# 激活函数,使用LeakyReLU
self.activation = nn.LeakyReLU(inplace=True)
# 上采样卷积层1,输入和输出特征维度都为num_features,卷积核大小为3x3,填充为1
self.conv_up1 = nn.Conv2d(num_features, num_features, 3, 1, 1)
# 上采样卷积层2,输入和输出特征维度都为num_features,卷积核大小为3x3,填充为1
self.conv_up2 = nn.Conv2d(num_features, num_features, 3, 1, 1)
# 高分辨率恢复卷积层,输入和输出特征维度都为num_features,卷积核大小为3x3,填充为1
self.conv_hr = nn.Conv2d(num_features, num_features, 3, 1, 1)
# 最终卷积层,将特征维度转换为config.num_channels_out,卷积核大小为3x3,填充为1
self.final_convolution = nn.Conv2d(num_features, config.num_channels_out, 3, 1, 1)
# LeakyReLU激活函数,斜率为0.2,inplace操作
self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
def forward(self, sequence_output):
# 序列输出先经过第一层卷积
sequence_output = self.conv_before_upsample(sequence_output)
# 经过激活函数
sequence_output = self.activation(sequence_output)
# 上采样至原始大小的两倍,并经过LeakyReLU激活函数
sequence_output = self.lrelu(
self.conv_up1(torch.nn.functional.interpolate(sequence_output, scale_factor=2, mode="nearest"))
)
# 再次上采样至原始大小的四倍,并经过LeakyReLU激活函数
sequence_output = self.lrelu(
self.conv_up2(torch.nn.functional.interpolate(sequence_output, scale_factor=2, mode="nearest"))
)
# 最终的重建,经过高分辨率恢复卷积层和LeakyReLU激活函数
reconstruction = self.final_convolution(self.lrelu(self.conv_hr(sequence_output)))
# 返回重建的结果
return reconstruction
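下面的简短示例(非库源码,特征维度为假设值)演示“最近邻插值 + 卷积”两次 2 倍上采样后的形状变化;为简洁起见示例里复用了同一个卷积层,实际模块中 conv_up1 和 conv_up2 是两个独立的层:
import torch
from torch import nn
num_features = 64
conv_up = nn.Conv2d(num_features, num_features, 3, 1, 1)
x = torch.rand(1, num_features, 16, 16)
x = conv_up(nn.functional.interpolate(x, scale_factor=2, mode="nearest"))  # (1, 64, 32, 32)
x = conv_up(nn.functional.interpolate(x, scale_factor=2, mode="nearest"))  # (1, 64, 64, 64)
print(x.shape)  # 两次 2 倍上采样,总体放大 4 倍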
# 定义像素混洗辅助上采样器模块的类,用于图像超分辨率和恢复任务
class PixelShuffleAuxUpsampler(nn.Module):
def __init__(self, config, num_features):
super().__init__()
# 从配置中获取上采样比例
self.upscale = config.upscale
# 定义用于处理双三次插值(bicubic)上采样结果的卷积层,把 num_channels 个输入通道映射到 num_features
self.conv_bicubic = nn.Conv2d(config.num_channels, num_features, 3, 1, 1)
# 定义用于上采样前的卷积层
self.conv_before_upsample = nn.Conv2d(config.embed_dim, num_features, 3, 1, 1)
# 定义激活函数为LeakyReLU
self.activation = nn.LeakyReLU(inplace=True)
# 定义用于辅助任务的卷积层,将序列输出映射到通道数为config.num_channels的张量
self.conv_aux = nn.Conv2d(num_features, config.num_channels, 3, 1, 1)
# 定义用于辅助任务后续处理的序列卷积和LeakyReLU激活函数的顺序层
self.conv_after_aux = nn.Sequential(nn.Conv2d(3, num_features, 3, 1, 1), nn.LeakyReLU(inplace=True))
# 定义上采样模块
self.upsample = Upsample(config.upscale, num_features)
# 定义最终的卷积层,将上采样后的特征映射到config.num_channels_out的输出通道
self.final_convolution = nn.Conv2d(num_features, config.num_channels_out, 3, 1, 1)
def forward(self, sequence_output, bicubic, height, width):
# 对双三次插值结果进行卷积操作
bicubic = self.conv_bicubic(bicubic)
# 对序列输出进行上采样前的卷积操作
sequence_output = self.conv_before_upsample(sequence_output)
# 序列输出经过激活函数处理
sequence_output = self.activation(sequence_output)
# 对序列输出进行辅助任务的卷积操作
aux = self.conv_aux(sequence_output)
# 经过辅助任务卷积后的序列输出再次进行卷积和激活函数处理
sequence_output = self.conv_after_aux(aux)
# 序列输出经过上采样模块,根据指定的高度和宽度进行裁剪
sequence_output = (
self.upsample(sequence_output)[:, :, : height * self.upscale, : width * self.upscale]
+ bicubic[:, :, : height * self.upscale, : width * self.upscale]
)
# 最终将上采样后的序列输出进行最终卷积操作,生成重建图像
reconstruction = self.final_convolution(sequence_output)
return reconstruction, aux
# 使用添加文档字符串装饰器为Swin2SRForImageSuperResolution类添加说明
@add_start_docstrings(
"""
Swin2SR 模型(Transformer),顶部带有一个上采样器头部,用于图像超分辨率和图像恢复。
""",
SWIN2SR_START_DOCSTRING,
)
class Swin2SRForImageSuperResolution(Swin2SRPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# 初始化Swin2SR模型
self.swin2sr = Swin2SRModel(config)
# 获取配置中的上采样器类型和上采样比例
self.upsampler = config.upsampler
self.upscale = config.upscale
# 根据上采样器类型选择对应的上采样器模块
num_features = 64
if self.upsampler == "pixelshuffle":
self.upsample = PixelShuffleUpsampler(config, num_features)
elif self.upsampler == "pixelshuffle_aux":
self.upsample = PixelShuffleAuxUpsampler(config, num_features)
elif self.upsampler == "pixelshuffledirect":
# 轻量级超分辨率模型,只进行一步上采样
self.upsample = UpsampleOneStep(config.upscale, config.embed_dim, config.num_channels_out)
elif self.upsampler == "nearest+conv":
# 适用于真实世界超分辨率,减少伪影的最近邻插值加卷积上采样器
self.upsample = NearestConvUpsampler(config, num_features)
else:
# 用于图像去噪和JPEG压缩伪影减少的最终卷积层
self.final_convolution = nn.Conv2d(config.embed_dim, config.num_channels_out, 3, 1, 1)
# 初始化权重并应用最终处理
self.post_init()
# 使用添加文档字符串装饰器为forward方法添加输入说明
@add_start_docstrings_to_model_forward(SWIN2SR_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ImageSuperResolutionOutput, config_class=_CONFIG_FOR_DOC)
# 定义一个方法 `forward`,用于模型的前向传播
#
# 参数说明:
# - pixel_values: 可选的 torch.FloatTensor,表示输入的像素值
# - head_mask: 可选的 torch.FloatTensor,表示注意力头部的掩码
# - labels: 可选的 torch.LongTensor,表示标签数据
# - output_attentions: 可选的 bool 值,控制是否输出注意力权重
# - output_hidden_states: 可选的 bool 值,控制是否输出隐藏状态
# - return_dict: 可选的 bool 值,控制是否以字典形式返回结果
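作为补充,下面给出一个简短的推理用法示例(非库源码);其中的检查点名称与图片路径均为假设值,仅演示调用方式,重建结果保存在输出的 reconstruction 字段中:
import torch
from PIL import Image
from transformers import AutoImageProcessor, Swin2SRForImageSuperResolution
processor = AutoImageProcessor.from_pretrained("caidas/swin2SR-classical-sr-x2-64")   # 示例检查点
model = Swin2SRForImageSuperResolution.from_pretrained("caidas/swin2SR-classical-sr-x2-64")
image = Image.open("low_res.png")                       # 假设的本地低分辨率图片路径
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
# forward 返回 ImageSuperResolutionOutput,重建图像保存在 reconstruction 字段中
sr = outputs.reconstruction.squeeze(0).clamp(0, 1)      # (num_channels_out, 高*upscale, 宽*upscale)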
.\models\swin2sr\__init__.py
# 版权声明和许可证信息,声明代码版权归 HuggingFace 团队所有,使用 Apache License 2.0 许可证发布
# 可以在符合许可证的情况下使用此文件。许可证详细信息可在 http://www.apache.org/licenses/LICENSE-2.0 获取
#
# 如果不符合适用法律或未经书面同意,则根据"AS IS"基础分发软件,无任何明示或暗示的担保或条件
from typing import TYPE_CHECKING
# 导入 OptionalDependencyNotAvailable 异常类、_LazyModule 类以及检查 torch 和 vision 是否可用的函数
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
# 定义导入结构的字典,包含模块到需要导入的类、函数的映射
_import_structure = {
"configuration_swin2sr": ["SWIN2SR_PRETRAINED_CONFIG_ARCHIVE_MAP", "Swin2SRConfig"],
}
# 检查是否可以导入 torch
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable() # 如果 torch 不可用则抛出 OptionalDependencyNotAvailable 异常
except OptionalDependencyNotAvailable:
pass # 如果出现 OptionalDependencyNotAvailable 异常则不执行后续代码
else:
# 如果 torch 可用,则添加 modeling_swin2sr 模块到导入结构中
_import_structure["modeling_swin2sr"] = [
"SWIN2SR_PRETRAINED_MODEL_ARCHIVE_LIST",
"Swin2SRForImageSuperResolution",
"Swin2SRModel",
"Swin2SRPreTrainedModel",
]
# 检查是否可以导入 vision
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable() # 如果 vision 不可用则抛出 OptionalDependencyNotAvailable 异常
except OptionalDependencyNotAvailable:
pass # 如果出现 OptionalDependencyNotAvailable 异常则不执行后续代码
else:
# 如果 vision 可用,则添加 image_processing_swin2sr 模块到导入结构中
_import_structure["image_processing_swin2sr"] = ["Swin2SRImageProcessor"]
# 如果在类型检查模式下
if TYPE_CHECKING:
# 导入 configuration_swin2sr 模块中的特定类和变量
from .configuration_swin2sr import SWIN2SR_PRETRAINED_CONFIG_ARCHIVE_MAP, Swin2SRConfig
# 检查是否可以导入 torch
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable() # 如果 torch 不可用则抛出 OptionalDependencyNotAvailable 异常
except OptionalDependencyNotAvailable:
pass # 如果出现 OptionalDependencyNotAvailable 异常则不执行后续代码
else:
# 导入 modeling_swin2sr 模块中的特定类和变量
from .modeling_swin2sr import (
SWIN2SR_PRETRAINED_MODEL_ARCHIVE_LIST,
Swin2SRForImageSuperResolution,
Swin2SRModel,
Swin2SRPreTrainedModel,
)
# 检查是否可以导入 vision
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable() # 如果 vision 不可用则抛出 OptionalDependencyNotAvailable 异常
except OptionalDependencyNotAvailable:
pass # 如果出现 OptionalDependencyNotAvailable 异常则不执行后续代码
else:
# 导入 image_processing_swin2sr 模块中的特定类
from .image_processing_swin2sr import Swin2SRImageProcessor
# 如果不在类型检查模式下,则将当前模块映射到 _LazyModule,延迟导入模块,以及动态导入 _import_structure 中定义的模块
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
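上述惰性导入结构意味着:只有在真正访问某个符号时,对应的子模块才会被导入。下面是一个简短的使用示例(非库源码,导入路径按 transformers 的常规组织方式假设):
from transformers import Swin2SRConfig                   # 只会触发 configuration_swin2sr 的加载
from transformers import Swin2SRForImageSuperResolution  # 访问时才加载 modeling_swin2sr(需要已安装 torch)
config = Swin2SRConfig()
model = Swin2SRForImageSuperResolution(config)           # 使用随机初始化权重按配置构建模型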
.\models\swinv2\configuration_swinv2.py
# 设置编码格式为 UTF-8
# 版权声明和许可证,声明代码版权归 HuggingFace Inc. 团队所有,遵循 Apache License 2.0 版本
# 只有在遵守许可证的情况下才能使用此文件。您可以在以下网址获取许可证的副本:
# http://www.apache.org/licenses/LICENSE-2.0
# 如果适用法律要求或书面同意,本软件按"原样"分发,不提供任何明示或暗示的担保或条件。
# 有关详细信息,请参阅许可证。
""" Swinv2 Transformer model configuration"""
# 导入必要的模块和类
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
# 获取日志记录器
logger = logging.get_logger(__name__)
# Swinv2 模型预训练配置文件映射,指定模型的预训练配置文件位置
SWINV2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"microsoft/swinv2-tiny-patch4-window8-256": (
"https://huggingface.co/microsoft/swinv2-tiny-patch4-window8-256/resolve/main/config.json"
),
}
# Swinv2Config 类,用于存储 Swinv2 模型的配置信息
class Swinv2Config(BackboneConfigMixin, PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Swinv2Model`]. It is used to instantiate a Swin
Transformer v2 model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Swin Transformer v2
[microsoft/swinv2-tiny-patch4-window8-256](https://huggingface.co/microsoft/swinv2-tiny-patch4-window8-256)
architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Example:
```
>>> from transformers import Swinv2Config, Swinv2Model
>>> # Initializing a Swinv2 microsoft/swinv2-tiny-patch4-window8-256 style configuration
>>> configuration = Swinv2Config()
>>> # Initializing a model (with random weights) from the microsoft/swinv2-tiny-patch4-window8-256 style configuration
>>> model = Swinv2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
# 模型类型为 Swinv2
model_type = "swinv2"
# 属性映射表,将一些属性名映射为另一些属性名
attribute_map = {
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
}
# 初始化函数,用于初始化一个 Swin Transformer v2 模型的配置参数
def __init__(
self,
image_size=224, # 图像尺寸,默认为224
patch_size=4, # 每个patch的大小,默认为4
num_channels=3, # 输入图像的通道数,默认为3(RGB图像)
embed_dim=96, # 嵌入维度,默认为96
depths=[2, 2, 6, 2], # 各个阶段的深度列表,默认为[2, 2, 6, 2]
num_heads=[3, 6, 12, 24], # 各个阶段的注意力头数列表,默认为[3, 6, 12, 24]
window_size=7, # 窗口大小,默认为7
pretrained_window_sizes=[0, 0, 0, 0], # 预训练窗口大小列表,默认为[0, 0, 0, 0]
mlp_ratio=4.0, # MLP放大比例,默认为4.0
qkv_bias=True, # 是否使用注意力的查询、键、值偏置,默认为True
hidden_dropout_prob=0.0, # 隐藏层dropout概率,默认为0.0(无dropout)
attention_probs_dropout_prob=0.0, # 注意力概率dropout概率,默认为0.0(无dropout)
drop_path_rate=0.1, # drop path的概率,默认为0.1
hidden_act="gelu", # 隐藏层激活函数,默认为gelu
use_absolute_embeddings=False, # 是否使用绝对位置嵌入,默认为False
initializer_range=0.02, # 初始化范围,默认为0.02
layer_norm_eps=1e-5, # LayerNorm的epsilon,默认为1e-5
encoder_stride=32, # 编码器步长,默认为32
out_features=None, # 输出特征列表,默认为None
out_indices=None, # 输出索引列表,默认为None
**kwargs, # 其他关键字参数
):
super().__init__(**kwargs)
# 设置各种参数到对象的属性中
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.embed_dim = embed_dim
self.depths = depths
self.num_layers = len(depths) # 设置阶段的数量为depths列表的长度
self.num_heads = num_heads
self.window_size = window_size
self.pretrained_window_sizes = pretrained_window_sizes
self.mlp_ratio = mlp_ratio
self.qkv_bias = qkv_bias
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.drop_path_rate = drop_path_rate
self.hidden_act = hidden_act
self.use_absolute_embeddings = use_absolute_embeddings
self.layer_norm_eps = layer_norm_eps
self.initializer_range = initializer_range
self.encoder_stride = encoder_stride
self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
# 获取对齐的输出特征和输出索引,以便与VisionEncoderDecoderModel兼容
self._out_features, self._out_indices = get_aligned_output_features_output_indices(
out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
)
# 设置hidden_size属性,表示模型最后一个阶段之后的通道维度
self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
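下面的简短示例(非库源码)验证上面 hidden_size 的推导方式,即最后一个阶段的通道维度等于 embed_dim * 2 ** (len(depths) - 1):
from transformers import Swinv2Config
config = Swinv2Config(embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24])
print(config.num_layers)   # 4,等于 depths 的长度
print(config.hidden_size)  # 768,即 96 * 2 ** 3,对应最后一个阶段的通道维度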
.\models\swinv2\convert_swinv2_timm_to_pytorch.py
# 设置编码格式为UTF-8
# 版权声明和许可信息,声明本代码版权归HuggingFace Inc.团队所有,并遵循Apache License 2.0许可
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert Swinv2 checkpoints from the timm library."""
import argparse # 导入命令行参数解析模块
import json # 导入JSON处理模块
from pathlib import Path # 导入路径操作模块
import requests # 导入HTTP请求库
import timm # 导入模型库timm
import torch # 导入PyTorch深度学习库
from huggingface_hub import hf_hub_download # 导入Hugging Face模型中心下载函数
from PIL import Image # 导入PIL图像处理库
from transformers import AutoImageProcessor, Swinv2Config, Swinv2ForImageClassification # 导入transformers库中相关模块
def get_swinv2_config(swinv2_name):
config = Swinv2Config() # 创建一个Swinv2Config配置对象
name_split = swinv2_name.split("_") # 使用下划线分割模型名称
model_size = name_split[1] # 提取模型尺寸信息
if "to" in name_split[3]:
img_size = int(name_split[3][-3:]) # 提取图像尺寸信息
else:
img_size = int(name_split[3])
if "to" in name_split[2]:
window_size = int(name_split[2][-2:]) # 提取窗口大小信息
else:
window_size = int(name_split[2][6:])
# 根据模型尺寸选择对应的嵌入维度、深度和头数配置
if model_size == "tiny":
embed_dim = 96
depths = (2, 2, 6, 2)
num_heads = (3, 6, 12, 24)
elif model_size == "small":
embed_dim = 96
depths = (2, 2, 18, 2)
num_heads = (3, 6, 12, 24)
elif model_size == "base":
embed_dim = 128
depths = (2, 2, 18, 2)
num_heads = (4, 8, 16, 32)
else:
embed_dim = 192
depths = (2, 2, 18, 2)
num_heads = (6, 12, 24, 48)
# 如果模型名称中包含'to',设置预训练窗口大小配置
if "to" in swinv2_name:
config.pretrained_window_sizes = (12, 12, 12, 6)
# 根据模型名称和数据集情况设置相应的类别数和标签映射
if ("22k" in swinv2_name) and ("to" not in swinv2_name):
num_classes = 21841
repo_id = "huggingface/label-files"
filename = "imagenet-22k-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
else:
num_classes = 1000
repo_id = "huggingface/label-files"
filename = "imagenet-1k-id2label.json"
id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
id2label = {int(k): v for k, v in id2label.items()}
config.id2label = id2label
config.label2id = {v: k for k, v in id2label.items()}
# 设置配置对象的图像大小、类别数、嵌入维度、深度、头数和窗口大小
config.image_size = img_size
config.num_labels = num_classes
config.embed_dim = embed_dim
config.depths = depths
config.num_heads = num_heads
config.window_size = window_size
return config
def rename_key(name):
# 如果文件名中包含 "patch_embed.proj",替换为 "embeddings.patch_embeddings.projection"
if "patch_embed.proj" in name:
name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection")
# 如果文件名中包含 "patch_embed.norm",替换为 "embeddings.norm"
if "patch_embed.norm" in name:
name = name.replace("patch_embed.norm", "embeddings.norm")
# 如果文件名中包含 "layers",在前面加上 "encoder."
if "layers" in name:
name = "encoder." + name
# 如果文件名中包含 "attn.proj",替换为 "attention.output.dense"
if "attn.proj" in name:
name = name.replace("attn.proj", "attention.output.dense")
# 如果文件名中包含 "attn",替换为 "attention.self"
if "attn" in name:
name = name.replace("attn", "attention.self")
# 如果文件名中包含 "norm1",替换为 "layernorm_before"
if "norm1" in name:
name = name.replace("norm1", "layernorm_before")
# 如果文件名中包含 "norm2",替换为 "layernorm_after"
if "norm2" in name:
name = name.replace("norm2", "layernorm_after")
# 如果文件名中包含 "mlp.fc1",替换为 "intermediate.dense"
if "mlp.fc1" in name:
name = name.replace("mlp.fc1", "intermediate.dense")
# 如果文件名中包含 "mlp.fc2",替换为 "output.dense"
if "mlp.fc2" in name:
name = name.replace("mlp.fc2", "output.dense")
# 如果文件名中包含 "q_bias",替换为 "query.bias"
if "q_bias" in name:
name = name.replace("q_bias", "query.bias")
# 如果文件名中包含 "k_bias",替换为 "key.bias"
if "k_bias" in name:
name = name.replace("k_bias", "key.bias")
# 如果文件名中包含 "v_bias",替换为 "value.bias"
if "v_bias" in name:
name = name.replace("v_bias", "value.bias")
# 如果文件名中包含 "cpb_mlp",替换为 "continuous_position_bias_mlp"
if "cpb_mlp" in name:
name = name.replace("cpb_mlp", "continuous_position_bias_mlp")
# 如果文件名为 "norm.weight",替换为 "layernorm.weight"
if name == "norm.weight":
name = "layernorm.weight"
# 如果文件名为 "norm.bias",替换为 "layernorm.bias"
if name == "norm.bias":
name = "layernorm.bias"
# 如果文件名中包含 "head",替换为 "classifier";否则在文件名前面加上 "swinv2."
if "head" in name:
name = name.replace("head", "classifier")
else:
name = "swinv2." + name
# 返回处理后的文件名
return name
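下面的简短示例(非库源码)对几个假设的 timm 权重名调用 rename_key,展示键名的映射结果:
for timm_key in ["patch_embed.proj.weight", "layers.0.blocks.0.attn.proj.weight", "head.weight"]:
    print(timm_key, "->", rename_key(timm_key))
# 预期输出依次为:
#   swinv2.embeddings.patch_embeddings.projection.weight
#   swinv2.encoder.layers.0.blocks.0.attention.output.dense.weight
#   classifier.weight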
# 定义一个函数,用于转换模型的状态字典,以适配特定模型结构
def convert_state_dict(orig_state_dict, model):
# 遍历原始状态字典的键(复制的列表),逐一处理
for key in orig_state_dict.copy().keys():
# 弹出当前键对应的值
val = orig_state_dict.pop(key)
# 如果键名中包含 "mask",则跳过当前循环
if "mask" in key:
continue
# 如果键名中包含 "qkv"
elif "qkv" in key:
# 拆分键名为列表
key_split = key.split(".")
# 获取层号和块号
layer_num = int(key_split[1])
block_num = int(key_split[3])
# 获取注意力机制的维度
dim = model.swinv2.encoder.layers[layer_num].blocks[block_num].attention.self.all_head_size
# 如果键名中包含 "weight"
if "weight" in key:
# 更新状态字典,设置查询权重
orig_state_dict[
f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.weight"
] = val[:dim, :]
# 更新状态字典,设置键权重
orig_state_dict[
f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.weight"
] = val[dim : dim * 2, :]
# 更新状态字典,设置值权重
orig_state_dict[
f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.weight"
] = val[-dim:, :]
else:
# 更新状态字典,设置查询偏置
orig_state_dict[
f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.query.bias"
] = val[:dim]
# 更新状态字典,设置键偏置
orig_state_dict[
f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.key.bias"
] = val[dim : dim * 2]
# 更新状态字典,设置值偏置
orig_state_dict[
f"swinv2.encoder.layers.{layer_num}.blocks.{block_num}.attention.self.value.bias"
] = val[-dim:]
else:
# 对于其余键,通过 rename_key 函数重命名后存储
orig_state_dict[rename_key(key)] = val
# 返回更新后的原始状态字典
return orig_state_dict
# 定义一个函数,用于将 timm 模型的状态字典转换为 swinv2 模型的状态字典
def convert_swinv2_checkpoint(swinv2_name, pytorch_dump_folder_path):
# 使用 timm 库创建指定预训练模型的模型对象
timm_model = timm.create_model(swinv2_name, pretrained=True)
# 将模型设置为评估模式
timm_model.eval()
# 获取 swinv2 模型的配置
config = get_swinv2_config(swinv2_name)
# 创建 swinv2 模型对象
model = Swinv2ForImageClassification(config)
# 将 swinv2 模型设置为评估模式
model.eval()
# 转换 timm 模型的状态字典为适应 swinv2 模型的新状态字典
new_state_dict = convert_state_dict(timm_model.state_dict(), model)
# 加载新的状态字典到 swinv2 模型中
model.load_state_dict(new_state_dict)
# 定义要使用的示例图像的 URL
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# 使用 AutoImageProcessor 从预训练模型加载图像处理器
image_processor = AutoImageProcessor.from_pretrained("microsoft/{}".format(swinv2_name.replace("_", "-")))
# 打开图像并转换为 PIL 图像对象
image = Image.open(requests.get(url, stream=True).raw)
# 使用图像处理器将图像转换为模型输入的张量表示
inputs = image_processor(images=image, return_tensors="pt")
# 使用 timm 模型对输入图像进行推理
timm_outs = timm_model(inputs["pixel_values"])
# 使用 swinv2 模型对输入图像进行推理,获取分类 logits
hf_outs = model(**inputs).logits
# 断言两个模型输出的值在给定的误差范围内接近
assert torch.allclose(timm_outs, hf_outs, atol=1e-3)
# 打印保存模型的信息
print(f"Saving model {swinv2_name} to {pytorch_dump_folder_path}")
# 将 swinv2 模型保存到指定路径
model.save_pretrained(pytorch_dump_folder_path)
# 打印保存图像处理器的信息
print(f"Saving image processor to {pytorch_dump_folder_path}")
# 将图像处理器保存到指定路径
image_processor.save_pretrained(pytorch_dump_folder_path)
# 将模型推送到指定的 Hub 仓库
model.push_to_hub(
repo_path_or_name=Path(pytorch_dump_folder_path, swinv2_name),
organization="nandwalritik",
commit_message="Add model",
)
# 如果当前脚本作为主程序运行,则执行以下代码
if __name__ == "__main__":
# 创建解析器对象
parser = argparse.ArgumentParser()
# 添加必需的命令行参数
parser.add_argument(
"--swinv2_name", # 定义一个命令行参数,用于指定要转换的Swinv2模型的名称
default="swinv2_tiny_patch4_window8_256", # 默认参数值为"swinv2_tiny_patch4_window8_256"
type=str, # 参数类型为字符串
help="Name of the Swinv2 timm model you'd like to convert.", # 参数的帮助信息,解释了该参数的作用
)
parser.add_argument(
"--pytorch_dump_folder_path", # 定义另一个命令行参数,用于指定输出PyTorch模型的目录路径
default=None, # 默认值为None
type=str, # 参数类型为字符串
help="Path to the output PyTorch model directory." # 参数的帮助信息,解释了该参数的作用
)
args = parser.parse_args() # 解析命令行参数,将参数存储在args对象中
convert_swinv2_checkpoint(args.swinv2_name, args.pytorch_dump_folder_path)
# 调用函数convert_swinv2_checkpoint,传入解析后的参数args中的swinv2_name和pytorch_dump_folder_path作为参数
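该脚本的命令行用法大致如下(示例;参数名来自上面的 argparse 定义,模型名与输出路径均为假设值):
# python convert_swinv2_timm_to_pytorch.py \
#     --swinv2_name swinv2_tiny_patch4_window8_256 \
#     --pytorch_dump_folder_path ./swinv2-tiny-patch4-window8-256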
.\models\swinv2\modeling_swinv2.py
# 设置文件编码为 UTF-8
# 版权声明:2022 年由 Microsoft Research 和 The HuggingFace Inc. 团队保留所有权利
#
# 根据 Apache 许可证 2.0 版本授权,除非符合许可证规定,否则不得使用此文件
# 您可以在以下网址获取许可证的副本:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# 除非适用法律要求或书面同意,否则依据“原样”提供,不提供任何明示或暗示的保证或条件
# 请参阅许可证获取详细信息
""" PyTorch Swinv2 Transformer model."""
import collections.abc # 导入集合抽象基类,用于类型检查
import math # 导入数学库,用于数学计算
import warnings # 导入警告模块,用于处理警告
from dataclasses import dataclass # 导入 dataclass 装饰器,用于创建数据类
from typing import Optional, Tuple, Union # 导入类型提示相关的模块
import torch # 导入 PyTorch 库
import torch.utils.checkpoint # 导入 PyTorch 的 checkpoint 模块,用于实现模型的内存优化
from torch import Tensor, nn # 导入 PyTorch 的张量和神经网络模块
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss # 导入损失函数
from ...activations import ACT2FN # 导入激活函数映射
from ...modeling_outputs import BackboneOutput # 导入模型输出类
from ...modeling_utils import PreTrainedModel # 导入预训练模型基类
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer # 导入模型工具函数
from ...utils import (
ModelOutput, # 导入模型输出基类
add_code_sample_docstrings, # 导入用于添加代码示例文档字符串的函数
add_start_docstrings, # 导入用于添加起始文档字符串的函数
add_start_docstrings_to_model_forward, # 导入用于模型前向方法的起始文档字符串函数
logging, # 导入日志模块
replace_return_docstrings, # 导入用于替换返回文档字符串的函数
)
from ...utils.backbone_utils import BackboneMixin # 导入骨干网络相关的工具函数
from .configuration_swinv2 import Swinv2Config # 导入 Swinv2 模型的配置类
logger = logging.get_logger(__name__) # 获取当前模块的日志记录器
# 用于文档的配置文件名
_CONFIG_FOR_DOC = "Swinv2Config"
# 用于文档的检查点信息
_CHECKPOINT_FOR_DOC = "microsoft/swinv2-tiny-patch4-window8-256"
# 预期输出形状的说明
_EXPECTED_OUTPUT_SHAPE = [1, 64, 768]
# 图像分类检查点信息
_IMAGE_CLASS_CHECKPOINT = "microsoft/swinv2-tiny-patch4-window8-256"
# 图像分类预期输出的示例
_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat"
# Swinv2 预训练模型的存档列表
SWINV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
"microsoft/swinv2-tiny-patch4-window8-256",
# 可在 https://huggingface.co/models?filter=swinv2 查看所有 Swinv2 模型
]
# 以下定义部分来自 https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/swin_transformer_v2.py.
@dataclass
# 从 transformers.models.swin.modeling_swin.SwinEncoderOutput 复制并将 Swin->Swinv2
class Swinv2EncoderOutput(ModelOutput):
"""
Swinv2 编码器的输出,可能包含隐藏状态和注意力权重。
"""
# 最后一层模型的隐藏状态,形状为(batch_size, sequence_length, hidden_size)
last_hidden_state: torch.FloatTensor = None
# 模型每一层的隐藏状态的元组,形状为(batch_size, sequence_length, hidden_size),可选项,当`output_hidden_states=True`时返回
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# 注意力权重的元组,形状为(batch_size, num_heads, sequence_length, sequence_length),可选项,当`output_attentions=True`时返回
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# 模型每一层的隐藏状态的元组,形状为(batch_size, hidden_size, height, width),包括空间维度,可选项,当`output_hidden_states=True`且输出被重塑时返回
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# 使用 dataclass 装饰器定义 Swinv2ModelOutput 类,它继承自 ModelOutput 类
# ModelOutput 是一个基础类,可能在 transformers 库中定义
@dataclass
# 从 transformers.models.swin.modeling_swin.SwinModelOutput 复制的类定义,将 Swin 替换为 Swinv2
class Swinv2ModelOutput(ModelOutput):
"""
Swinv2 模型的输出,同时包含最后隐藏状态的池化结果。
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
模型最后一层的隐藏状态序列输出。
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, 当 `add_pooling_layer=True` 时返回):
最后一层隐藏状态的平均池化结果。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
包含模型每一层隐藏状态的元组,以及初始嵌入输出。
形状为 `(batch_size, sequence_length, hidden_size)`。
attentions (`tuple(torch.FloatTensor)`, *optional*, 当 `output_attentions=True` 或 `config.output_attentions=True` 时返回):
自注意力机制 softmax 后的注意力权重,用于计算自注意力头的加权平均值。
形状为 `(batch_size, num_heads, sequence_length, sequence_length)`。
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当 `output_hidden_states=True` 或 `config.output_hidden_states=True` 时返回):
包含模型每一层隐藏状态的元组,以及初始嵌入输出,重塑为包含空间维度的形状。
形状为 `(batch_size, hidden_size, height, width)`。
"""
@dataclass
# 从 transformers.models.swin.modeling_swin.SwinMaskedImageModelingOutput 复制的类定义,将 Swin 替换为 Swinv2
class Swinv2MaskedImageModelingOutput(ModelOutput):
"""
Swinv2 掩码图像模型(masked image modeling)的输出。
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
Masked image modeling (MLM) loss.
图像模型的掩码损失(MLM损失)。
reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Reconstructed pixel values.
重建的像素数值。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
模型在每一层输出的隐藏状态,包括初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
注意力权重经过注意力softmax后的结果,用于计算自注意力头中的加权平均。
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
shape `(batch_size, hidden_size, height, width)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
include the spatial dimensions.
模型在每一层输出的隐藏状态,包括重塑以包括空间维度的初始嵌入输出。
"""
# 定义属性loss,类型为Optional[torch.FloatTensor],默认值为None
loss: Optional[torch.FloatTensor] = None
# 定义属性reconstruction,类型为torch.FloatTensor,默认值为None
reconstruction: torch.FloatTensor = None
# 定义属性hidden_states,类型为Optional[Tuple[torch.FloatTensor, ...]],默认值为None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# 定义属性attentions,类型为Optional[Tuple[torch.FloatTensor, ...]],默认值为None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
# 定义属性reshaped_hidden_states,类型为Optional[Tuple[torch.FloatTensor, ...]],默认值为None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
@property
def logits(self):
# 警告信息,提醒logits属性在Transformers的版本5中将被移除,建议使用reconstruction属性获取最终输出。
warnings.warn(
"logits attribute is deprecated and will be removed in version 5 of Transformers."
" Please use the reconstruction attribute to retrieve the final output instead.",
FutureWarning,
)
# 返回属性reconstruction的值作为logits属性的输出
return self.reconstruction
@dataclass
# 从transformers.models.swin.modeling_swin.SwinImageClassifierOutput复制到Swinv2ImageClassifierOutput
class Swinv2ImageClassifierOutput(ModelOutput):
"""
Swinv2图像分类的输出。
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, 当提供`labels`时返回):
分类(如果config.num_labels==1则是回归)损失。
logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
分类(如果config.num_labels==1则是回归)得分(SoftMax之前)。
hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当`output_hidden_states=True`时返回或者当`config.output_hidden_states=True`时返回):
包含每层输出的`torch.FloatTensor`元组,形状为`(batch_size, sequence_length, hidden_size)`。
每个层的模型隐藏状态加上初始嵌入输出。
attentions (`tuple(torch.FloatTensor)`, *optional*, 当`output_attentions=True`时返回或者当`config.output_attentions=True`时返回):
包含每个阶段`torch.FloatTensor`元组,形状为`(batch_size, num_heads, sequence_length, sequence_length)`。
注意力softmax后的注意力权重,用于计算自注意力头的加权平均值。
reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, 当`output_hidden_states=True`时返回或者当`config.output_hidden_states=True`时返回):
包含每层输出的`torch.FloatTensor`元组,形状为`(batch_size, hidden_size, height, width)`。
每个层的模型隐藏状态加上初始嵌入输出,重塑以包含空间维度。
"""
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
# 从transformers.models.swin.modeling_swin.window_partition复制
def window_partition(input_feature, window_size):
"""
将给定输入分区为窗口。
"""
batch_size, height, width, num_channels = input_feature.shape
input_feature = input_feature.view(
batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
)
windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
return windows
# 从transformers.models.swin.modeling_swin.window_reverse复制
def window_reverse(windows, window_size, height, width):
"""
合并窗口以产生更高分辨率的特征。
"""
# 获取窗口数组的通道数量
num_channels = windows.shape[-1]
# 将窗口数组重塑为指定窗口大小的网格结构
windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
# 对重塑后的窗口数组进行维度置换,以便重新排列窗口的顺序
windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous()
# 再次将重排后的窗口数组展平为原始形状
windows = windows.view(-1, height, width, num_channels)
# 返回重新排列和重塑后的窗口数组
return windows
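下面的简短示例(非库源码,张量尺寸为假设值)验证 window_partition 与 window_reverse 互为逆操作:
import torch
x = torch.rand(2, 8, 8, 96)                        # (batch, height, width, channels)
windows = window_partition(x, window_size=4)       # (2 * 2 * 2, 4, 4, 96) = (8, 4, 4, 96)
restored = window_reverse(windows, window_size=4, height=8, width=8)
print(windows.shape, torch.equal(restored, x))     # torch.Size([8, 4, 4, 96]) True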
# Copied from transformers.models.swin.modeling_swin.drop_path
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
"""
# 如果 drop_prob 为 0 或者不处于训练模式,则直接返回输入
if drop_prob == 0.0 or not training:
return input
# 计算保留概率
keep_prob = 1 - drop_prob
# 确定随机张量的形状
shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
# 生成均匀分布的随机张量,并进行二值化处理
random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
random_tensor.floor_() # binarize
# 对输入进行按元素除法,并应用二值化的随机张量
output = input.div(keep_prob) * random_tensor
return output
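下面的简短示例(非库源码,数值为假设值)演示 drop_path 的行为:训练时按样本整条丢弃残差路径,并用 1/keep_prob 放大保留下来的样本,使输出期望不变;推理时原样返回输入:
import torch
torch.manual_seed(0)
x = torch.ones(10000, 8)
print(drop_path(x, drop_prob=0.2, training=False).mean().item())  # 1.0:推理时不做任何改变
y = drop_path(x, drop_prob=0.2, training=True)
print(round(y.mean().item(), 2))  # 约 1.0:约 20% 的样本整行为 0,其余样本被放大为 1/0.8 = 1.25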
# Copied from transformers.models.swin.modeling_swin.SwinDropPath with Swin->Swinv2
class Swinv2DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
def __init__(self, drop_prob: Optional[float] = None) -> None:
super().__init__()
self.drop_prob = drop_prob
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# 调用 drop_path 函数,传递当前实例的 drop_prob 属性和训练模式
return drop_path(hidden_states, self.drop_prob, self.training)
def extra_repr(self) -> str:
return "p={}".format(self.drop_prob)
# Copied from transformers.models.swin.modeling_swin.SwinEmbeddings with Swin->Swinv2
class Swinv2Embeddings(nn.Module):
"""
Construct the patch and position embeddings. Optionally, also the mask token.
"""
def __init__(self, config, use_mask_token=False):
super().__init__()
# 初始化 Swinv2PatchEmbeddings 实例
self.patch_embeddings = Swinv2PatchEmbeddings(config)
num_patches = self.patch_embeddings.num_patches
self.patch_grid = self.patch_embeddings.grid_size
# 如果 use_mask_token 为真,则初始化一个用于掩码的张量参数
self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None
# 根据配置决定是否初始化位置编码张量参数
if config.use_absolute_embeddings:
self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim))
else:
self.position_embeddings = None
# 初始化 LayerNorm 层和 Dropout 层
self.norm = nn.LayerNorm(config.embed_dim)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(
self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
) -> Tuple[torch.Tensor]:
# 获取图像块的嵌入表示和输出维度信息
embeddings, output_dimensions = self.patch_embeddings(pixel_values)
# 对嵌入表示进行归一化处理
embeddings = self.norm(embeddings)
# 获取批处理大小、序列长度以及嵌入表示的最后一个维度大小
batch_size, seq_len, _ = embeddings.size()
# 如果存在掩码位置信息
if bool_masked_pos is not None:
# 使用mask_token在整个批次上扩展以替换掩码的视觉标记
mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
# 创建掩码,使其类型与mask_tokens一致,并在嵌入表示中应用
mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
# 如果存在位置嵌入,则将其加到嵌入表示中
if self.position_embeddings is not None:
embeddings = embeddings + self.position_embeddings
# 对嵌入表示进行dropout处理
embeddings = self.dropout(embeddings)
# 返回处理后的嵌入表示和输出维度信息
return embeddings, output_dimensions
# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings with Swin->Swinv2
class Swinv2PatchEmbeddings(nn.Module):
"""
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
"""
def __init__(self, config):
super().__init__()
# Extract configuration parameters
image_size, patch_size = config.image_size, config.patch_size
num_channels, hidden_size = config.num_channels, config.embed_dim
# Ensure image_size and patch_size are iterable
image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
# Calculate number of patches
num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
# Initialize instance variables
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.num_patches = num_patches
self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
# Projection layer: Conv2d for patch embedding
self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
def maybe_pad(self, pixel_values, height, width):
# Pad pixel_values if height or width is not divisible by patch_size
if width % self.patch_size[1] != 0:
pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
pixel_values = nn.functional.pad(pixel_values, pad_values)
if height % self.patch_size[0] != 0:
pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
pixel_values = nn.functional.pad(pixel_values, pad_values)
return pixel_values
def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
# Retrieve dimensions of pixel_values
_, num_channels, height, width = pixel_values.shape
# Check if number of channels matches self.num_channels
if num_channels != self.num_channels:
raise ValueError(
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
)
# Pad input pixel_values to ensure divisibility by patch_size
pixel_values = self.maybe_pad(pixel_values, height, width)
# Project pixel_values into patch embeddings
embeddings = self.projection(pixel_values)
# Retrieve dimensions of embeddings after projection
_, _, height, width = embeddings.shape
# Flatten embeddings and transpose dimensions for further processing
embeddings = embeddings.flatten(2).transpose(1, 2)
# Return embeddings and output dimensions
return embeddings, (height, width)
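下面的简短示例(非库源码,图像尺寸与配置数值均为假设值)演示 Swinv2PatchEmbeddings 的输入输出形状:
import torch
from transformers import Swinv2Config
from transformers.models.swinv2.modeling_swinv2 import Swinv2PatchEmbeddings
config = Swinv2Config(image_size=256, patch_size=4, num_channels=3, embed_dim=96)
patch_embed = Swinv2PatchEmbeddings(config)
pixel_values = torch.rand(1, 3, 256, 256)
embeddings, (height, width) = patch_embed(pixel_values)
print(embeddings.shape, (height, width))  # torch.Size([1, 4096, 96]) (64, 64)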
class Swinv2PatchMerging(nn.Module):
"""
Patch Merging Layer.
Args:
input_resolution (`Tuple[int]`):
Resolution of input feature.
dim (`int`):
Number of input channels.
norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
Normalization layer class.
"""
def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
super().__init__()
self.input_resolution = input_resolution # 设置输入分辨率
self.dim = dim # 设置维度
self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) # 初始化线性变换层,减少维度
self.norm = norm_layer(2 * dim) # 初始化规范化层
def maybe_pad(self, input_feature, height, width):
should_pad = (height % 2 == 1) or (width % 2 == 1) # 检查是否需要填充
if should_pad:
pad_values = (0, 0, 0, width % 2, 0, height % 2) # 计算填充值
input_feature = nn.functional.pad(input_feature, pad_values) # 执行填充操作
return input_feature
def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
height, width = input_dimensions # 解析输入尺寸
# `dim` is height * width
batch_size, dim, num_channels = input_feature.shape # 获取输入特征的形状信息
input_feature = input_feature.view(batch_size, height, width, num_channels) # 重新组织输入特征的形状
# pad input to be divisible by width and height, if needed
input_feature = self.maybe_pad(input_feature, height, width) # 调用填充函数,确保特征是宽高可整除的
# [batch_size, height/2, width/2, num_channels]
input_feature_0 = input_feature[:, 0::2, 0::2, :] # 提取特征的子块1
# [batch_size, height/2, width/2, num_channels]
input_feature_1 = input_feature[:, 1::2, 0::2, :] # 提取特征的子块2
# [batch_size, height/2, width/2, num_channels]
input_feature_2 = input_feature[:, 0::2, 1::2, :] # 提取特征的子块3
# [batch_size, height/2, width/2, num_channels]
input_feature_3 = input_feature[:, 1::2, 1::2, :] # 提取特征的子块4
# [batch_size, height/2 * width/2, 4*num_channels]
input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1) # 将四个子块合并
input_feature = input_feature.view(batch_size, -1, 4 * num_channels) # 重新组织合并后的特征形状
input_feature = self.reduction(input_feature) # 执行线性变换
input_feature = self.norm(input_feature) # 执行规范化操作
return input_feature # 返回处理后的特征
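下面的简短示例(非库源码,尺寸为假设值)演示 patch merging 的形状变化:序列长度减为 1/4,通道数翻倍:
import torch
from transformers.models.swinv2.modeling_swinv2 import Swinv2PatchMerging
merge = Swinv2PatchMerging(input_resolution=(64, 64), dim=96)
x = torch.rand(1, 64 * 64, 96)
y = merge(x, input_dimensions=(64, 64))
print(y.shape)  # torch.Size([1, 1024, 192]):4096 个 token -> 1024 个,96 维 -> 192 维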
# 定义一个名为Swinv2SelfAttention的自定义神经网络模块类
class Swinv2SelfAttention(nn.Module):
# 定义一个用于将输入张量x转换为注意力分数形状的方法
def transpose_for_scores(self, x):
# 计算新的张量形状,保留除了最后一维外的所有维度,并增加注意力头数和每个头的大小
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
# 重新调整张量的形状
x = x.view(new_x_shape)
# 对调张量的维度顺序,将第0和第2个维度互换,第1和第3个维度互换
return x.permute(0, 2, 1, 3)
# 定义前向传播方法,接受隐藏状态张量、注意力掩码、头部掩码和输出注意力的可选参数
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# 获取输入张量的维度信息
batch_size, dim, num_channels = hidden_states.shape
# 使用 self.query 对隐藏状态进行查询操作,生成混合的查询层
mixed_query_layer = self.query(hidden_states)
# 使用 self.key 对隐藏状态进行键操作,并转置以便计算注意力分数
key_layer = self.transpose_for_scores(self.key(hidden_states))
# 使用 self.value 对隐藏状态进行值操作,并转置以便后续计算上下文层
value_layer = self.transpose_for_scores(self.value(hidden_states))
# 对混合的查询层也进行转置以便计算注意力分数
query_layer = self.transpose_for_scores(mixed_query_layer)
# 使用余弦相似度计算注意力分数
attention_scores = nn.functional.normalize(query_layer, dim=-1) @ nn.functional.normalize(
key_layer, dim=-1
).transpose(-2, -1)
# 对注意力分数进行缩放,使用 torch.clamp 限制缩放因子的最大值
logit_scale = torch.clamp(self.logit_scale, max=math.log(1.0 / 0.01)).exp()
attention_scores = attention_scores * logit_scale
# 使用 MLP 模块计算相对位置偏置,并重新组织形状以匹配注意力分数
relative_position_bias_table = self.continuous_position_bias_mlp(self.relative_coords_table).view(
-1, self.num_attention_heads
)
relative_position_bias = relative_position_bias_table[self.relative_position_index.view(-1)].view(
self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
)
relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
relative_position_bias = 16 * torch.sigmoid(relative_position_bias)
attention_scores = attention_scores + relative_position_bias.unsqueeze(0)
# 如果存在注意力遮罩,则将其应用于注意力分数
if attention_mask is not None:
mask_shape = attention_mask.shape[0]
attention_scores = attention_scores.view(
batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
) + attention_mask.unsqueeze(1).unsqueeze(0)
attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)
# 将注意力分数归一化为注意力概率
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
# 对注意力概率进行 dropout
attention_probs = self.dropout(attention_probs)
# 如果存在头部掩码,则将其应用于注意力概率
if head_mask is not None:
attention_probs = attention_probs * head_mask
# 计算最终的上下文层,将注意力概率与值层相乘
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
# 调整上下文层的形状以符合输出的预期形状
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
# 根据输出设置,构造最终输出结果
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
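下面的简短示例(非库源码,头数、序列长度等数值均为假设值)单独演示上面的“余弦相似度注意力 + 可学习对数缩放因子”这一步:
import math
import torch
from torch import nn
num_heads, seq_len, head_dim = 3, 49, 32
query = torch.rand(1, num_heads, seq_len, head_dim)
key = torch.rand(1, num_heads, seq_len, head_dim)
# 余弦相似度:先把 query/key 归一化到单位范数,再做矩阵乘
scores = nn.functional.normalize(query, dim=-1) @ nn.functional.normalize(key, dim=-1).transpose(-2, -1)
# 可学习的对数缩放因子(此处初始值 10.0 为假设值),上限被 clamp 到 log(1/0.01)
logit_scale = torch.clamp(torch.log(torch.full((num_heads, 1, 1), 10.0)), max=math.log(1.0 / 0.01)).exp()
scores = scores * logit_scale
print(scores.shape)               # torch.Size([1, 3, 49, 49])
print(scores.abs().max() <= 100)  # True:缩放后的分数幅度被限制在 1/0.01 = 100 以内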
# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput with Swin->Swinv2
class Swinv2SelfOutput(nn.Module):
def __init__(self, config, dim):
super().__init__()
# 定义一个线性层,输入和输出维度都为 dim
self.dense = nn.Linear(dim, dim)
# 定义一个 Dropout 层,使用配置中的注意力概率作为丢弃概率
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# 通过线性层进行变换
hidden_states = self.dense(hidden_states)
# 应用 Dropout 进行随机丢弃
hidden_states = self.dropout(hidden_states)
return hidden_states
class Swinv2Attention(nn.Module):
def __init__(self, config, dim, num_heads, window_size, pretrained_window_size=0):
super().__init__()
# 初始化自注意力层对象,传入配置、维度、头数、窗口大小等参数
self.self = Swinv2SelfAttention(
config=config,
dim=dim,
num_heads=num_heads,
window_size=window_size,
pretrained_window_size=pretrained_window_size
if isinstance(pretrained_window_size, collections.abc.Iterable)
else (pretrained_window_size, pretrained_window_size),
)
# 初始化自注意力输出层对象,传入配置和维度参数
self.output = Swinv2SelfOutput(config, dim)
# 初始化被修剪的注意力头集合为空集合
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
# 寻找可修剪的注意力头及其索引
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# 修剪线性层
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# 更新超参数并存储修剪后的注意力头
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# 调用自注意力层的前向传播函数
self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
# 将自注意力输出作为输入,通过自注意力输出层进行变换
attention_output = self.output(self_outputs[0], hidden_states)
# 如果输出注意力权重,将它们添加到输出元组中
outputs = (attention_output,) + self_outputs[1:]
return outputs
# Copied from transformers.models.swin.modeling_swin.SwinIntermediate with Swin->Swinv2
class Swinv2Intermediate(nn.Module):
def __init__(self, config, dim):
super().__init__()
# Linear layer mapping dim to config.mlp_ratio * dim
self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
# If the hidden activation is given as a string, look it up in ACT2FN; otherwise use it directly
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
# Forward pass: takes the hidden states and returns the transformed tensor
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Apply the linear projection
hidden_states = self.dense(hidden_states)
# Apply the activation function
hidden_states = self.intermediate_act_fn(hidden_states)
# Return the transformed hidden states
return hidden_states
# Copied from transformers.models.swin.modeling_swin.SwinOutput with Swin->Swinv2
class Swinv2Output(nn.Module):
def __init__(self, config, dim):
super().__init__()
# Linear layer projecting config.mlp_ratio * dim back down to dim
self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
# Dropout layer used to reduce overfitting
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Apply the linear projection
hidden_states = self.dense(hidden_states)
# Apply dropout
hidden_states = self.dropout(hidden_states)
return hidden_states
# Definition of the Swinv2Layer class (one shifted-window transformer block)
class Swinv2Layer(nn.Module):
def __init__(self, config, dim, input_resolution, num_heads, shift_size=0, pretrained_window_size=0):
super().__init__()
# Keep the input resolution; it is needed by _compute_window_shift below
self.input_resolution = input_resolution
# Compute the window size and shift size, making sure they do not exceed the input resolution
window_size, shift_size = self._compute_window_shift(
(config.window_size, config.window_size), (shift_size, shift_size)
)
# Store this layer's window size and shift size
self.window_size = window_size[0]
self.shift_size = shift_size[0]
# Swinv2Attention layer implementing windowed self-attention
self.attention = Swinv2Attention(
config=config,
dim=dim,
num_heads=num_heads,
window_size=self.window_size,
pretrained_window_size=pretrained_window_size
if isinstance(pretrained_window_size, collections.abc.Iterable)
else (pretrained_window_size, pretrained_window_size),
)
# LayerNorm for the attention branch (Swin v2 applies it after attention, i.e. residual post-normalization)
self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
# Stochastic depth: Swinv2DropPath when drop_path_rate > 0, otherwise an identity mapping
self.drop_path = Swinv2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
# MLP expansion layer
self.intermediate = Swinv2Intermediate(config, dim)
# MLP projection back to dim
self.output = Swinv2Output(config, dim)
# LayerNorm for the MLP branch (also applied after the block)
self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
def _compute_window_shift(self, target_window_size, target_shift_size) -> Tuple[Tuple[int, int], Tuple[int, int]]:
# Clamp the window size to the input resolution and disable the shift when the window covers the whole input
window_size = [r if r <= w else w for r, w in zip(self.input_resolution, target_window_size)]
shift_size = [0 if r <= w else s for r, w, s in zip(self.input_resolution, window_size, target_shift_size)]
return window_size, shift_size
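# Illustrative sketch (standalone, with assumed values): when a late stage's feature map is
# smaller than the configured window, the window shrinks to the feature map and the cyclic
# shift is disabled, which is exactly what _compute_window_shift does above.
input_resolution = (7, 7)      # e.g. the last stage of a 224-pixel input with patch size 4
target_window_size = (8, 8)    # configured window size
target_shift_size = (4, 4)
window_size = [r if r <= w else w for r, w in zip(input_resolution, target_window_size)]
shift_size = [0 if r <= w else s for r, w, s in zip(input_resolution, window_size, target_shift_size)]
print(window_size, shift_size)  # [7, 7] [0, 0]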
# Build the attention mask used for shifted-window attention
def get_attn_mask(self, height, width, dtype):
if self.shift_size > 0:
# Compute the attention mask needed for shifted-window multi-head self-attention
# Start from an all-zero image mask
img_mask = torch.zeros((1, height, width, 1), dtype=dtype)
# Height and width slices delimiting the regions produced by the cyclic shift
height_slices = (
slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None),
)
width_slices = (
slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None),
)
count = 0
# Assign a unique region id to each (height, width) slice combination
for height_slice in height_slices:
for width_slice in width_slices:
img_mask[:, height_slice, width_slice, :] = count
count += 1
# Partition the mask into windows and flatten each window
mask_windows = window_partition(img_mask, self.window_size)
mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
# Build the attention mask from pairwise differences of the region ids
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
# Positions from different regions get -100.0, positions from the same region get 0.0
attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
else:
# Without a shift no attention mask is needed
attn_mask = None
return attn_mask
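# Illustrative sketch: building the shifted-window attention mask for a tiny 4x4 feature map
# with window_size=2 and shift_size=1. All values are assumptions for demonstration; the logic
# mirrors get_attn_mask above (region ids, pairwise differences, -100.0 fill).
import torch

height = width = 4
window_size, shift_size = 2, 1
img_mask = torch.zeros((1, height, width, 1))
slices = (slice(0, -window_size), slice(-window_size, -shift_size), slice(-shift_size, None))
count = 0
for h in slices:
    for w in slices:
        img_mask[:, h, w, :] = count
        count += 1
# minimal local window partition, assumed equivalent to the helper used by the model
windows = img_mask.view(1, height // window_size, window_size, width // window_size, window_size, 1)
windows = windows.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size * window_size)
attn_mask = windows.unsqueeze(1) - windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0, -100.0).masked_fill(attn_mask == 0, 0.0)
print(attn_mask.shape)  # torch.Size([4, 4, 4]): one (window_size**2, window_size**2) mask per window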
# Pad the hidden states, if necessary, so that height and width are divisible by the window size
def maybe_pad(self, hidden_states, height, width):
# Number of pixels to pad on the right and at the bottom so both sides become multiples of the window size
pad_right = (self.window_size - width % self.window_size) % self.window_size
pad_bottom = (self.window_size - height % self.window_size) % self.window_size
# Padding for the last three dimensions (channels, width, height); only the right and bottom edges are padded
pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
# Apply the padding
hidden_states = nn.functional.pad(hidden_states, pad_values)
return hidden_states, pad_values
# Forward pass: takes the hidden states and input dimensions, plus an optional head mask and the output_attentions flag
def forward(
self,
hidden_states: torch.Tensor,
input_dimensions: Tuple[int, int],
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
# Unpack the input spatial dimensions
height, width = input_dimensions
# Unpack the batch size, sequence length and number of channels of the hidden states
batch_size, _, channels = hidden_states.size()
# Keep a shortcut of the hidden states for the residual connection
shortcut = hidden_states
# Reshape the hidden states to (batch_size, height, width, channels)
hidden_states = hidden_states.view(batch_size, height, width, channels)
# Pad the hidden states to a multiple of the window size and keep the padding values
hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
# Padded height and width
_, height_pad, width_pad, _ = hidden_states.shape
# If a shift size is set, apply the cyclic shift
if self.shift_size > 0:
shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
else:
shifted_hidden_states = hidden_states
# Partition the (possibly shifted) hidden states into windows
hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
# Flatten each window to (-1, window_size * window_size, channels)
hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
# Build the attention mask
attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype)
# If an attention mask exists, move it to the device of hidden_states_windows
if attn_mask is not None:
attn_mask = attn_mask.to(hidden_states_windows.device)
# Apply windowed attention
attention_outputs = self.attention(
hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
)
# Attention output
attention_output = attention_outputs[0]
# Reshape the attention output back to (-1, window_size, window_size, channels)
attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
# Merge the windows back into the full feature map
shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)
# If a shift was applied, reverse the cyclic shift
if self.shift_size > 0:
attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
else:
attention_windows = shifted_windows
# If padding was applied, crop back to the original height and width
was_padded = pad_values[3] > 0 or pad_values[5] > 0
if was_padded:
attention_windows = attention_windows[:, :height, :width, :].contiguous()
# Reshape back to (batch_size, height * width, channels)
attention_windows = attention_windows.view(batch_size, height * width, channels)
# Normalize the attention branch (Swin v2 post-normalization)
hidden_states = self.layernorm_before(attention_windows)
# Residual connection with stochastic depth
hidden_states = shortcut + self.drop_path(hidden_states)
# MLP expansion
layer_output = self.intermediate(hidden_states)
# MLP projection
layer_output = self.output(layer_output)
# Normalize the MLP branch, apply stochastic depth and add the residual
layer_output = hidden_states + self.drop_path(self.layernorm_after(layer_output))
# Include the attention weights in the outputs if requested
layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
# Return the layer outputs
return layer_outputs
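# Illustrative sketch: the partition -> attention -> reverse round trip used in forward above,
# written with minimal local helpers that are assumed to behave like window_partition and
# window_reverse. The tensor sizes are arbitrary demonstration values.
import torch

def _partition(x, ws):
    b, h, w, c = x.shape
    x = x.view(b, h // ws, ws, w // ws, ws, c)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(-1, ws, ws, c)

def _reverse(windows, ws, h, w):
    b = windows.shape[0] // ((h // ws) * (w // ws))
    x = windows.view(b, h // ws, w // ws, ws, ws, -1)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(b, h, w, -1)

x = torch.randn(2, 8, 8, 96)        # (batch, height, width, channels)
windows = _partition(x, 4)          # (2 * 2 * 2, 4, 4, 96) = (8, 4, 4, 96)
restored = _reverse(windows, 4, 8, 8)
print(torch.allclose(x, restored))  # True: partition and reverse are exact inverses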
# Swinv2Stage: one stage of the Swin Transformer V2 model
class Swinv2Stage(nn.Module):
# Constructor
def __init__(
self, config, dim, input_resolution, depth, num_heads, drop_path, downsample, pretrained_window_size=0
):
super().__init__()
self.config = config # Keep the configuration
self.dim = dim # Feature dimension of this stage
blocks = []
# Create the requested number of Swinv2Layer blocks
for i in range(depth):
# Build a Swinv2Layer block; even blocks use regular windows, odd blocks use shifted windows
block = Swinv2Layer(
config=config,
dim=dim,
input_resolution=input_resolution,
num_heads=num_heads,
shift_size=0 if (i % 2 == 0) else config.window_size // 2,
pretrained_window_size=pretrained_window_size,
)
blocks.append(block)
self.blocks = nn.ModuleList(blocks) # 将 blocks 转为 nn.ModuleList
# If a downsampling layer is given, instantiate it; otherwise no downsampling is applied
if downsample is not None:
self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
else:
self.downsample = None
self.pointing = False # Initialize the pointing flag to False
# Forward pass
def forward(
self,
hidden_states: torch.Tensor,
input_dimensions: Tuple[int, int],
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
height, width = input_dimensions # Spatial dimensions of the input feature map
for i, layer_module in enumerate(self.blocks):
layer_head_mask = head_mask[i] if head_mask is not None else None # Head mask for the current block
# Run the forward pass of each Swinv2Layer block
layer_outputs = layer_module(
hidden_states,
input_dimensions,
layer_head_mask,
output_attentions,
)
hidden_states = layer_outputs[0] # Update the hidden states with the block output
hidden_states_before_downsampling = hidden_states # Keep the hidden states before downsampling
if self.downsample is not None:
# Spatial dimensions after downsampling
height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
output_dimensions = (height, width, height_downsampled, width_downsampled) # (input height, input width, output height, output width)
# Apply patch merging to downsample the hidden states
hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
else:
output_dimensions = (height, width, height, width) # Without downsampling the dimensions stay the same
stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions) # Stage outputs
if output_attentions:
stage_outputs += layer_outputs[1:] # Append the attention weights if requested
return stage_outputs # Return the stage outputs
class Swinv2Encoder(nn.Module):
# Constructor of the Swin Transformer V2 encoder, which stacks the stages defined above
def __init__(self, config, grid_size, pretrained_window_sizes=(0, 0, 0, 0)):
# Call the parent constructor
super().__init__()
# Number of stages in the model
self.num_layers = len(config.depths)
# Keep the configuration
self.config = config
# If the config specifies pretrained window sizes, use those values
if self.config.pretrained_window_sizes is not None:
pretrained_window_sizes = config.pretrained_window_sizes
# Linearly spaced stochastic depth rates from 0 to config.drop_path_rate, one per block
dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
# Collect the stages of the encoder
layers = []
# Build each stage
for i_layer in range(self.num_layers):
# Create one Swin Transformer V2 stage
stage = Swinv2Stage(
config=config,
# Feature dimension of this stage: config.embed_dim * 2**i_layer
dim=int(config.embed_dim * 2**i_layer),
# Input resolution of this stage: the original grid size divided by 2**i_layer
input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
# Number of blocks in this stage
depth=config.depths[i_layer],
# Number of attention heads in this stage
num_heads=config.num_heads[i_layer],
# Slice of stochastic depth rates assigned to this stage
drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
# Downsample with patch merging after every stage except the last
downsample=Swinv2PatchMerging if (i_layer < self.num_layers - 1) else None,
# Pretrained window size for this stage
pretrained_window_size=pretrained_window_sizes[i_layer],
)
# Append the stage
layers.append(stage)
# Register the stages as an nn.ModuleList
self.layers = nn.ModuleList(layers)
# Gradient checkpointing is disabled by default
self.gradient_checkpointing = False
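# Illustrative sketch (assumed swinv2-tiny-like values): how the loop above splits the
# stochastic depth rates across stages while the feature dimension doubles and the
# resolution halves at every stage.
import torch

embed_dim, depths, drop_path_rate, grid_size = 96, [2, 2, 6, 2], 0.1, (56, 56)
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
for i in range(len(depths)):
    dim = int(embed_dim * 2**i)
    resolution = (grid_size[0] // (2**i), grid_size[1] // (2**i))
    rates = dpr[sum(depths[:i]) : sum(depths[: i + 1])]
    print(i, dim, resolution, [round(r, 3) for r in rates])
# 0 96 (56, 56) [0.0, 0.009]
# 1 192 (28, 28) [0.018, 0.027]
# 2 384 (14, 14) [0.036, 0.045, 0.055, 0.064, 0.073, 0.082]
# 3 768 (7, 7) [0.091, 0.1]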
# Forward pass of the encoder
def forward(
self,
hidden_states: torch.Tensor,
input_dimensions: Tuple[int, int],
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
output_hidden_states_before_downsampling: Optional[bool] = False,
return_dict: Optional[bool] = True,
# Copied from transformers.models.swin.modeling_swin.SwinPreTrainedModel with Swin->Swinv2, swin->swinv2
class Swinv2PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# Configuration class used by this model
config_class = Swinv2Config
# Prefix of the base model
base_model_prefix = "swinv2"
# Name of the main input
main_input_name = "pixel_values"
# The model supports gradient checkpointing
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""Initialize the weights"""
# Linear and convolutional layers
if isinstance(module, (nn.Linear, nn.Conv2d)):
# Initialize the weights from a normal distribution with mean 0 and std self.config.initializer_range
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
# Initialize the bias to zero if present
if module.bias is not None:
module.bias.data.zero_()
# LayerNorm layers
elif isinstance(module, nn.LayerNorm):
# Initialize the bias to zero
module.bias.data.zero_()
# Initialize the weight to one
module.weight.data.fill_(1.0)
# SWINV2_START_DOCSTRING docstring
SWINV2_START_DOCSTRING = r"""
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
behavior.
Parameters:
config ([`Swinv2Config`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
# SWINV2_INPUTS_DOCSTRING docstring
SWINV2_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
for details.
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Swinv2 Model transformer outputting raw hidden-states without any specific head on top.",
SWINV2_START_DOCSTRING,
)
# Copied from transformers.models.swin.modeling_swin.SwinModel with SWIN->SWINV2, Swin->Swinv2
class Swinv2Model(Swinv2PreTrainedModel):
def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
super().__init__(config)
self.config = config
# Number of stages and final feature dimension
self.num_layers = len(config.depths)
self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))
# Patch embeddings and encoder
self.embeddings = Swinv2Embeddings(config, use_mask_token=use_mask_token)
self.encoder = Swinv2Encoder(config, self.embeddings.patch_grid)
# Final LayerNorm and, optionally, an adaptive average pooling layer
self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# Return the patch embedding layer as the input embeddings
return self.embeddings.patch_embeddings
def _prune_heads(self, heads_to_prune):
"""
对模型的注意力头进行剪枝。
heads_to_prune: {layer_num: 需要在该层剪枝的头列表} 参见基类PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
@add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Swinv2ModelOutput,
config_class=_CONFIG_FOR_DOC,
modality="vision",
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
bool_masked_pos: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# See SWINV2_INPUTS_DOCSTRING for details on the inputs (pixel values, boolean masked positions, head mask, ...)
# Returns a Swinv2ModelOutput
) -> Union[Tuple, Swinv2ModelOutput]:
r"""
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
"""
# Resolve whether to return attention weights
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Resolve whether to return hidden states
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Resolve whether to return a ModelOutput dictionary
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if pixel_values is None:
# Raise an error when no pixel values are provided
raise ValueError("You have to specify pixel_values")
# Prepare the head mask if needed
# 1.0 in head_mask means the head is kept
# attention_probs has shape bsz x n_heads x N x N
# the input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
head_mask = self.get_head_mask(head_mask, len(self.config.depths))
# Embedding output and spatial input dimensions
embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
# Encoder outputs
encoder_outputs = self.encoder(
embedding_output,
input_dimensions,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Sequence output
sequence_output = encoder_outputs[0]
# Apply the final LayerNorm
sequence_output = self.layernorm(sequence_output)
pooled_output = None
if self.pooler is not None:
# If a pooler is present, compute the pooled output
pooled_output = self.pooler(sequence_output.transpose(1, 2))
pooled_output = torch.flatten(pooled_output, 1)
if not return_dict:
# Return a plain tuple when return_dict is disabled
output = (sequence_output, pooled_output) + encoder_outputs[1:]
return output
# Otherwise wrap everything in a Swinv2ModelOutput
return Swinv2ModelOutput(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
)
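# Illustrative usage sketch for Swinv2Model. The checkpoint name is the one used in the
# backbone example further below; treat the exact output shapes as assumptions for a
# 256x256 input (64 tokens of dimension 768 after the last stage).
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, Swinv2Model

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
model = Swinv2Model.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
inputs = processor(image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # expected: torch.Size([1, 64, 768])
print(outputs.pooler_output.shape)      # expected: torch.Size([1, 768])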
@add_start_docstrings(
"""
Swinv2 Model with a decoder on top for masked image modeling, as proposed in
[SimMIM](https://arxiv.org/abs/2111.09886).
<Tip>
Note that we provide a script to pre-train this model on custom data in our [examples
directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).
</Tip>
""",
SWINV2_START_DOCSTRING,
)
# Swinv2ForMaskedImageModeling: Swinv2 model with a decoder on top for masked image modeling
# It builds on Swinv2PreTrainedModel and combines a Swinv2Model with a lightweight pixel decoder
class Swinv2ForMaskedImageModeling(Swinv2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Swinv2 backbone without a pooling layer and with a learnable mask token
self.swinv2 = Swinv2Model(config, add_pooling_layer=False, use_mask_token=True)
# Feature dimension fed to the decoder
num_features = int(config.embed_dim * 2 ** (config.num_layers - 1))
# Decoder: a 1x1 convolution followed by pixel shuffle to reconstruct the image
self.decoder = nn.Sequential(
nn.Conv2d(
in_channels=num_features, out_channels=config.encoder_stride**2 * config.num_channels, kernel_size=1
),
nn.PixelShuffle(config.encoder_stride),
)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Swinv2MaskedImageModelingOutput, config_class=_CONFIG_FOR_DOC)
# Forward pass returning a Swinv2MaskedImageModelingOutput
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None,
bool_masked_pos: Optional[torch.BoolTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
# See SWINV2_INPUTS_DOCSTRING for details on the inputs
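# Illustrative sketch for the `bool_masked_pos` input of the masked image modeling head
# (SimMIM-style). The random mask, random pixels, and checkpoint are assumptions for
# demonstration; any boolean mask over the patch grid works.
import torch
from transformers import Swinv2ForMaskedImageModeling

model = Swinv2ForMaskedImageModeling.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
pixel_values = torch.randn(1, 3, 256, 256)  # stand-in for a processed image
num_patches = (model.config.image_size // model.config.patch_size) ** 2
bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()  # True/1 = masked patch
with torch.no_grad():
    outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
print(outputs.reconstruction.shape)  # expected: torch.Size([1, 3, 256, 256]), the reconstructed pixels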
@add_start_docstrings(
"""
Swinv2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
of the [CLS] token) e.g. for ImageNet.
""",
SWINV2_START_DOCSTRING,
)
# Swinv2ForImageClassification: Swinv2 model with an image classification head on top
# It builds on Swinv2PreTrainedModel and combines a Swinv2Model with a linear classifier
class Swinv2ForImageClassification(Swinv2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Number of labels
self.num_labels = config.num_labels
# Swinv2 backbone
self.swinv2 = Swinv2Model(config)
# Classification head
self.classifier = (
nn.Linear(self.swinv2.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_IMAGE_CLASS_CHECKPOINT,
output_type=Swinv2ImageClassifierOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
)
# Forward pass returning a Swinv2ImageClassifierOutput
def forward(
self,
pixel_values: Optional[torch.FloatTensor] = None, # Input pixel values, optional
head_mask: Optional[torch.FloatTensor] = None, # Head mask, optional
labels: Optional[torch.LongTensor] = None, # Labels for image classification/regression, optional
output_attentions: Optional[bool] = None, # Whether to return attention tensors, optional
output_hidden_states: Optional[bool] = None, # Whether to return hidden states, optional
return_dict: Optional[bool] = None, # Whether to return a dict-style output, optional
) -> Union[Tuple, Swinv2ImageClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# Resolve whether to return a ModelOutput dictionary
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the Swinv2 backbone
outputs = self.swinv2(
pixel_values,
head_mask=head_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Pooled output
pooled_output = outputs[1]
# Classify the pooled output to obtain the logits
logits = self.classifier(pooled_output)
# Loss defaults to None
loss = None
# Compute a loss only when labels are provided
if labels is not None:
# If the problem type is not set, infer it from the number of labels and the label dtype
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# Pick the loss function according to the problem type
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# Return a plain tuple when return_dict is disabled
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Otherwise wrap everything in a Swinv2ImageClassifierOutput
return Swinv2ImageClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
reshaped_hidden_states=outputs.reshaped_hidden_states,
)
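# Illustrative sketch: how the problem_type branches above select the loss. The tiny random
# config and the label values are assumptions for demonstration; integer labels together with
# num_labels > 1 fall into the single-label cross-entropy branch.
import torch
from transformers import Swinv2Config, Swinv2ForImageClassification

config = Swinv2Config(
    image_size=32, patch_size=4, embed_dim=16, depths=[1, 1], num_heads=[2, 2], window_size=4, num_labels=3
)
model = Swinv2ForImageClassification(config)  # randomly initialized, small on purpose
pixel_values = torch.randn(2, 3, 32, 32)
labels = torch.tensor([0, 2])  # dtype long -> single_label_classification
outputs = model(pixel_values, labels=labels)
print(model.config.problem_type)           # "single_label_classification"
print(outputs.loss, outputs.logits.shape)  # scalar loss, torch.Size([2, 3])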
@add_start_docstrings(
"""
Swinv2 backbone, to be used with frameworks like DETR and MaskFormer.
""",
SWINV2_START_DOCSTRING,
)
class Swinv2Backbone(Swinv2PreTrainedModel, BackboneMixin):
def __init__(self, config):
# Call the parent constructor with the configuration
super().__init__(config)
# Initialize the backbone-specific attributes
super()._init_backbone(config)
# Feature dimensions: the stem dimension followed by one entry per stage, doubling each time
self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]
# Swinv2 patch embeddings
self.embeddings = Swinv2Embeddings(config)
# Swinv2 encoder built on the patch grid of the embeddings
self.encoder = Swinv2Encoder(config, self.embeddings.patch_grid)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
# Return the patch embeddings as the input embeddings
return self.embeddings.patch_embeddings
@add_start_docstrings_to_model_forward(SWINV2_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
pixel_values: Tensor,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
"""
根据给定的参数返回 BackboneOutput 对象。
参数:
return_dict (bool, optional): 是否返回字典形式的输出,默认为使用配置中的设定。
output_hidden_states (bool, optional): 是否输出隐藏状态,默认为使用配置中的设定。
output_attentions (bool, optional): 是否输出注意力权重,默认为使用配置中的设定。
返回:
BackboneOutput: 包含特征图、隐藏状态和注意力权重的对象。
示例:
```
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
>>> model = AutoBackbone.from_pretrained(
... "microsoft/swinv2-tiny-patch4-window8-256", out_features=["stage1", "stage2", "stage3", "stage4"]
... )
>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 2048, 7, 7]
```
"""
# Fall back to the config value when return_dict is None
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Fall back to the config value when output_hidden_states is None
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Fall back to the config value when output_attentions is None
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Embed the pixel values and keep the spatial input dimensions
embedding_output, input_dimensions = self.embeddings(pixel_values)
# Run the encoder; hidden states before downsampling are needed for the feature maps
outputs = self.encoder(
embedding_output,
input_dimensions,
head_mask=None,
output_attentions=output_attentions,
output_hidden_states=True,
output_hidden_states_before_downsampling=True,
return_dict=return_dict,
)
# Pick the reshaped hidden states, depending on the output format
hidden_states = outputs.reshaped_hidden_states if return_dict else outputs[-1]
# Collect the requested feature maps
feature_maps = ()
# Keep the hidden state of every stage listed in out_features
for stage, hidden_state in zip(self.stage_names, hidden_states):
if stage in self.out_features:
feature_maps += (hidden_state,)
# Assemble a plain tuple when return_dict is disabled
if not return_dict:
output = (feature_maps,)
if output_hidden_states:
output += (outputs[1],)
if output_attentions:
output += (outputs[2],)
return output
# Return a BackboneOutput with the feature maps, hidden states and attention weights
return BackboneOutput(
feature_maps=feature_maps,
hidden_states=outputs.hidden_states if output_hidden_states else None,
attentions=outputs.attentions,
)