Reading Report
I. Basic Video Information
Title: 《超强动画,一步一步深入浅出解释 Transformer 原理!》 (roughly: "Great animations: a step-by-step, accessible explanation of how the Transformer works!")
Link: https://www.bilibili.com/video/BV1fGeAz6Eie?spm_id_from=333.788.videopod.episodes&vd_source=13c380961d8b8dbc19f4aa3cb11925c9&p=10
Summary: Episode 10 of the latest 2025 tutorial series (32 episodes in total), which explains the Transformer in an accessible way through animation, covering both theory and hands-on practice.
II. Core Knowledge Points
Overall architecture: an encoder-decoder structure built from input embeddings, positional encoding, multi-head self-attention, feed-forward networks, and related modules.
Key principles:
Multi-head self-attention: Q/K/V are split across heads and computed in parallel, capturing semantics along multiple dimensions.
Positional encoding: word-order information is injected via sine/cosine functions.
Encoder-decoder interaction: the decoder combines its own outputs with the encoder's features (see the cross-attention sketch after this list).
Strengths of the video: animation visualizes the abstract principles, and the step-by-step pacing makes them easy to follow.
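The encoder-decoder interaction noted above is not covered by the code in Section IV, which implements an encoder-only classifier. Below is a minimal, illustrative sketch of a decoder layer whose cross-attention takes queries from the decoder and keys/values from the encoder output. It assumes the imports and the MultiHeadAttention / FeedForwardNetwork modules defined in Section IV; names such as DecoderLayer and enc_output are my own, not from the video.
class DecoderLayer(nn.Module):
    """Illustrative decoder layer: masked self-attention + cross-attention over the encoder output."""
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)   # masked self-attention
        self.cross_attn = MultiHeadAttention(d_model, num_heads)  # attends to encoder features
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, tgt_mask=None, src_mask=None):
        # Masked self-attention over the decoder's own (shifted) outputs
        attn_out, _ = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_out))
        # Cross-attention: queries from the decoder, keys/values from the encoder
        cross_out, _ = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(cross_out))
        # Position-wise feed-forward network
        ffn_out = self.ffn(x)
        x = self.norm3(x + self.dropout(ffn_out))
        return x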
III. Learning Gains
The animations made the mathematical logic of the attention mechanism clear and resolved my confusion about the theory.
Picked up engineering implementation tricks (such as pre-normalization and dimension scaling) that help when turning the theory into code; a pre-norm sketch follows this list.
Next steps: work through the remaining episodes and build out practical applications such as text generation.
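Note that the EncoderLayer in Section IV actually uses the original post-norm arrangement (residual connection first, LayerNorm after). A pre-norm variant only changes where LayerNorm sits relative to each sub-layer; the sketch below is an assumed illustration reusing the MultiHeadAttention and FeedForwardNetwork modules from Section IV, not code from the video.
class PreNormEncoderLayer(nn.Module):
    """Illustrative pre-norm variant: LayerNorm is applied before each sub-layer."""
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Normalize first; the residual uses the un-normalized input
        h = self.norm1(x)
        attn_out, _ = self.self_attn(h, h, h, mask)
        x = x + self.dropout(attn_out)
        # Same pattern for the feed-forward sub-layer
        x = x + self.dropout(self.ffn(self.norm2(x)))
        return x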
IV. Code
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
1. Positional encoding module (corresponds to the video's positional-encoding topic)
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Build the positional-encoding matrix
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # Sine on even dimensions, cosine on odd dimensions (the core formula from the video)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)  # [max_len, 1, d_model]
        self.register_buffer('pe', pe)  # not updated by gradients

    def forward(self, x):
        # x: [seq_len, batch_size, d_model]
        x = x + self.pe[:x.size(0), :]
        return x
2. Multi-head self-attention module (corresponds to the video's multi-head attention topic)
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_k = d_model // num_heads  # dimension per head
        self.num_heads = num_heads
        # Separate linear projections for Q, K, V, plus an output projection
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)  # fuses the concatenated heads

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        # Attention scores: [batch_size, num_heads, seq_len_q, seq_len_k]
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        # Apply the mask (used for the decoder's masked attention)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        # Attention weights
        attn_weights = F.softmax(scores, dim=-1)
        # Weighted sum of the values
        output = torch.matmul(attn_weights, v)
        return output, attn_weights

    def forward(self, q, k, v, mask=None):
        batch_size = q.size(0)
        # Project and split into heads:
        # [batch_size, seq_len, d_model] -> [batch_size, num_heads, seq_len, d_k]
        q = self.w_q(q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.w_k(k).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.w_v(v).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        # Scaled dot-product attention
        attn_output, attn_weights = self.scaled_dot_product_attention(q, k, v, mask)
        # Concatenate the heads: [batch_size, num_heads, seq_len, d_k] -> [batch_size, seq_len, d_model]
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)
        # Output projection
        output = self.w_o(attn_output)
        return output, attn_weights
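The mask argument above is only exercised by the decoder's masked attention and is never built in the test code in part 6. Below is a minimal sketch (my own, not from the video) of how a causal look-ahead mask could be constructed so that it broadcasts against the [batch_size, num_heads, seq_len, seq_len] score tensor used by this implementation:
def make_causal_mask(seq_len):
    # Lower-triangular matrix: position i may only attend to positions <= i
    mask = torch.tril(torch.ones(seq_len, seq_len))
    # Add batch and head dimensions so it broadcasts against the score tensor
    return mask.unsqueeze(0).unsqueeze(0)  # [1, 1, seq_len, seq_len]

# Hypothetical usage with the module above:
# mha = MultiHeadAttention(d_model=512, num_heads=8)
# out, _ = mha(x, x, x, mask=make_causal_mask(x.size(1)))  # x: [batch_size, seq_len, d_model]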
3. Feed-forward network module (corresponds to the feed-forward networks in the video's encoder/decoder)
class FeedForwardNetwork(nn.Module):
    def __init__(self, d_model, d_ff=2048):
        super(FeedForwardNetwork, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        # Expand to d_ff, apply ReLU and dropout, then project back to d_model
        return self.fc2(self.dropout(F.relu(self.fc1(x))))
4. Encoder layer (a single encoder block, corresponding to the encoder structure in the video)
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Self-attention sub-layer (post-norm: residual connection first, then LayerNorm)
        attn_output, _ = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout1(attn_output))
        # Feed-forward sub-layer
        ffn_output = self.ffn(x)
        x = self.norm2(x + self.dropout2(ffn_output))
        return x
5. Simplified Transformer encoder (for text classification, adapted from the core architecture in the video)
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, d_model=512, num_heads=8, num_layers=6, d_ff=2048, num_classes=10, max_len=512, dropout=0.1):
        super(TransformerClassifier, self).__init__()
        self.d_model = d_model
        # Input embedding layer
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Positional encoding layer (a core module in the video)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        # Stacked encoder layers (6 by default, matching the architecture in the video)
        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        ])
        # Classification head
        self.fc = nn.Linear(d_model, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # x: [batch_size, seq_len]
        # Embedding + positional encoding: [batch_size, seq_len, d_model]
        x = self.embedding(x) * math.sqrt(self.d_model)  # scale the embeddings
        x = self.pos_encoding(x.transpose(0, 1)).transpose(0, 1)  # PositionalEncoding expects [seq_len, batch_size, d_model]
        x = self.dropout(x)
        # Pass through the stacked encoder layers
        for layer in self.encoder_layers:
            x = layer(x, mask)
        # Use the output at the first position as the classification feature (CLS-style pooling)
        cls_output = x[:, 0, :]
        # Classification logits
        logits = self.fc(cls_output)
        return logits
6. Model test code
if __name__ == "__main__":
    # Hyperparameters
    vocab_size = 10000   # vocabulary size
    num_classes = 10     # number of classes
    batch_size = 32      # batch size
    seq_len = 50         # sequence length
    # Build the model
    model = TransformerClassifier(
        vocab_size=vocab_size,
        d_model=512,
        num_heads=8,
        num_layers=6,
        d_ff=2048,
        num_classes=num_classes,
        max_len=seq_len
    )
    # Dummy input of shape (batch_size, seq_len)
    dummy_input = torch.randint(0, vocab_size, (batch_size, seq_len))
    # Forward pass
    output = model(dummy_input)
    # Verify the shapes
    print(f"Input shape: {dummy_input.shape}")
    print(f"Output shape: {output.shape}")
    print(f"Total number of parameters: {sum(p.numel() for p in model.parameters()):,}")
    # Print the core modules
    print("\nTransformer core module check:")
    print(f"Positional encoding module: {model.pos_encoding}")
    print(f"Multi-head attention module: {model.encoder_layers[0].self_attn}")
    print(f"Feed-forward network module: {model.encoder_layers[0].ffn}")
V. Summary
The video is clearly explained and well focused. Episode 10 lays the groundwork for studying the core modules and is a good fit for moving from Transformer basics toward more advanced material.
Last four digits of student ID: 3029