
https://github.com/Enzo-MiMan/cv_related_collections/blob/main/deep_learning_basic/self-attention/self_attention.py
import torch.nn as nn
import torch
import matplotlib.pyplot as plt


# Single-head scaled dot-product self-attention.
class Self_Attention(nn.Module):
    def __init__(self, dim, dk, dv):
        super(Self_Attention, self).__init__()
        self.scale = dk ** -0.5          # 1 / sqrt(dk)
        self.q = nn.Linear(dim, dk)
        self.k = nn.Linear(dim, dk)
        self.v = nn.Linear(dim, dv)

    def forward(self, x):
        q = self.q(x)                    # (B, N, dk)
        k = self.k(x)                    # (B, N, dk)
        v = self.v(x)                    # (B, N, dv)
        attn = (q @ k.transpose(-2, -1)) * self.scale   # (B, N, N)
        attn = attn.softmax(dim=-1)      # each row sums to 1
        x = attn @ v                     # (B, N, dv)
        return x


att = Self_Attention(dim=2, dk=2, dv=3)
x = torch.rand((1, 4, 2))
output = att(x)
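# Quick sanity check (not from the linked repo, just a sketch of what to expect): with an
# input of shape (batch, n, dim) the output has shape (batch, n, dv), and each row of the
# softmax-ed attention matrix sums to 1.
print(output.shape)                                   # torch.Size([1, 4, 3])
attn_check = (att.q(x) @ att.k(x).transpose(-2, -1)) * att.scale
print(attn_check.softmax(dim=-1).sum(dim=-1))         # every entry is (numerically) 1.0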
# class MultiHead_Attention(nn.Module):
#     def __init__(self, dim, num_heads):
#         super(MultiHead_Attention, self).__init__()
#         self.num_heads = num_heads           # e.g. 12
#         head_dim = dim // num_heads          # e.g. 768 // 12 = 64
#         self.scale = head_dim ** -0.5
#         self.qkv = nn.Linear(dim, dim * 3)
#         self.proj = nn.Linear(dim, dim)
#         self.proj_drop = nn.Dropout(0.)      # dropout on the output projection (p=0, a no-op)
#
#     def forward(self, x):
#         B, N, C = x.shape
#         qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
#
#         q, k, v = qkv[0], qkv[1], qkv[2]     # each (B, num_heads, N, head_dim)
#
#         attn = (q @ k.transpose(-2, -1)) * self.scale
#         attn = attn.softmax(dim=-1)
#
#         x = (attn @ v).transpose(1, 2).reshape(B, N, C)
#         x = self.proj(x)
#         x = self.proj_drop(x)
#         return x
#
# att = MultiHead_Attention(dim=768, num_heads=12)
# x = torch.rand((1, 197, 768))
# output = att(x)
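# A minimal shape sketch (my own, assuming dim is divisible by num_heads) of the qkv
# reshape used above: with B=1, N=197, C=768 and 12 heads, each head attends over
# 64-dimensional q, k, v slices.
B, N, C, num_heads = 1, 197, 768, 12
qkv_demo = torch.rand(B, N, 3 * C)
qkv_demo = qkv_demo.reshape(B, N, 3, num_heads, C // num_heads).permute(2, 0, 3, 1, 4)
print(qkv_demo.shape)   # torch.Size([3, 1, 12, 197, 64])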
https://zh.d2l.ai/chapter_attention-mechanisms/nadaraya-waston.html
from d2l import torch as d2l
import torch
from torch import nn


#@save
def show_heatmaps(matrices, xlabel, ylabel, titles=None, figsize=(2.5, 2.5),
                  cmap='Reds'):
    """Show heatmaps of matrices."""
    d2l.use_svg_display()
    num_rows, num_cols = matrices.shape[0], matrices.shape[1]
    fig, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize,
                                 sharex=True, sharey=True, squeeze=False)
    for i, (row_axes, row_matrices) in enumerate(zip(axes, matrices)):
        for j, (ax, matrix) in enumerate(zip(row_axes, row_matrices)):
            pcm = ax.imshow(matrix.detach().numpy(), cmap=cmap)
            if i == num_rows - 1:
                ax.set_xlabel(xlabel)
            if j == 0:
                ax.set_ylabel(ylabel)
            if titles:
                ax.set_title(titles[j])
    fig.colorbar(pcm, ax=axes, shrink=0.6)
    d2l.plt.show()
# attention_weights = torch.eye(10).reshape((1, 1, 10, 10))
# show_heatmaps(attention_weights, xlabel='Keys', ylabel='Queries')
#==================== 1 Training data ====================
n_train = 50                         # number of training samples
x_source = torch.rand(n_train) * 5   # 50 inputs drawn uniformly from [0, 1), scaled to [0, 5)
x_train, _ = torch.sort(x_source)    # sorted training inputs
print('x_train', x_train)


def f(x):
    return 2 * torch.sin(x) + x**0.8


y_train = f(x_train) + torch.normal(0.0, 0.5, (n_train,))   # training outputs with Gaussian noise
#==================== 2 Test data ====================
x_test = torch.arange(0, 5, 0.1)     # 50 test inputs: 0, 0.1, ..., 4.9
y_truth = f(x_test)                  # noise-free ground truth at the test inputs
# n_test = len(x_test)               # number of test samples
# n_test
# 3 The function below plots the training samples (circles), the noise-free
# data-generating function ("Truth"), and the learned prediction ("Pred").
def plot_kernel_reg(y_hat):
    d2l.plot(x_test, [y_truth, y_hat], 'x', 'y', legend=['Truth', 'Pred'],
             xlim=[0, 5], ylim=[-1, 5])
    d2l.plt.plot(x_train, y_train, 'o', alpha=0.5)
    d2l.plt.show()
#==================== 4-1 Average pooling ====================
# No training here: the prediction is computed directly, i.e. only the final
# average-pooling step is applied.
run = 0
if run:
    y_out = y_train.mean()                           # the prediction is simply the mean of y
    y_hat = torch.repeat_interleave(y_out, n_train)  # repeat the mean so every query gets the same value
    # print('y_hat', y_hat)
    # torch.repeat_interleave flattens multi-dimensional input by default, e.g.:
    # >>> y = torch.tensor([[1, 2], [3, 4]])
    # >>> torch.repeat_interleave(y, 2)
    # tensor([1, 1, 2, 2, 3, 3, 4, 4])
    # x-axis: x_test; curves: [ground truth y_truth, average-pooling prediction y_hat]
    plot_kernel_reg(y_hat)
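# In formula form (summary of the step above): average pooling predicts
# f(x) = (1/n) * sum_i y_i for every query x, i.e. the mean of y_train repeated n_train
# times, which is why the "Pred" curve is a flat line.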
#==================== 4-2 Non-parametric attention pooling ====================
run = 0
if run:
    # Weight the outputs y_i by how close each training input is to the query.
    # X_repeat has shape (n_test, n_train): every row repeats one test input (the same query).
    print('x_test', x_test.shape, x_test)
    # torch.Size([50])
    '''
    x_test torch.Size([50])
    tensor([0.0000, 0.1000, 0.2000, 0.3000, 0.4000, 0.5000, 0.6000, 0.7000, 0.8000,
            0.9000, 1.0000, 1.1000, 1.2000, 1.3000, 1.4000, 1.5000, 1.6000, 1.7000,
            1.8000, 1.9000, 2.0000, 2.1000, 2.2000, 2.3000, 2.4000, 2.5000, 2.6000,
            2.7000, 2.8000, 2.9000, 3.0000, 3.1000, 3.2000, 3.3000, 3.4000, 3.5000,
            3.6000, 3.7000, 3.8000, 3.9000, 4.0000, 4.1000, 4.2000, 4.3000, 4.4000,
            4.5000, 4.6000, 4.7000, 4.8000, 4.9000])
    '''
    # print('x_test.repeat_interleave(n_train)', x_test.repeat_interleave(n_train).shape,
    #       x_test.repeat_interleave(n_train))
    # torch.Size([2500])
    # Build the query table.
    X_repeat = x_test.repeat_interleave(n_train).reshape((-1, n_train))
    print('X_repeat', X_repeat.shape, X_repeat)
    # torch.Size([50, 50])
    '''
    X_repeat
    torch.Size([50, 50])
    tensor([[0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],
            [0.1000, 0.1000, 0.1000, ..., 0.1000, 0.1000, 0.1000],
            [0.2000, 0.2000, 0.2000, ..., 0.2000, 0.2000, 0.2000],
            ...,
            [4.7000, 4.7000, 4.7000, ..., 4.7000, 4.7000, 4.7000],
            [4.8000, 4.8000, 4.8000, ..., 4.8000, 4.8000, 4.8000],
            [4.9000, 4.9000, 4.9000, ..., 4.9000, 4.9000, 4.9000]])
    x_train
    torch.Size([1, 50])
    tensor([0.1249, 0.2723, 0.3242, 0.3747, 0.6435, 0.7526, 0.7749, 0.9694, 0.9709,
            1.1660, 1.3965, 1.4592, 1.5059, 1.6240, 1.6567, 1.9198, 1.9289, 1.9650,
            1.9665, 2.0000, 2.0822, 2.1460, 2.2586, 2.2702, 2.3153, 2.4764, 2.6111,
            2.6732, 2.9376, 3.1270, 3.2933, 3.3839, 3.3909, 3.4030, 3.4695, 3.6524,
            3.6915, 3.7456, 3.8196, 3.8434, 3.8556, 3.9236, 4.2003, 4.2841, 4.2882,
            4.5061, 4.5877, 4.6141, 4.7991, 4.8649])
    '''
    # x_train holds the keys. attention_weights has shape (n_test, n_train): each row holds
    # the attention weights that distribute over the values (y_train) for one query.
    print((X_repeat - x_train).shape)   # torch.Size([50, 50])
    # X_repeat is the query table, x_train the keys; the weights depend on their distance.
    attention_weights = nn.functional.softmax(-(X_repeat - x_train)**2 / 2, dim=1)
    # Each element of y_hat is a weighted average of the values, weighted by the attention weights.
    y_hat = torch.matmul(attention_weights, y_train)
    plot_kernel_reg(y_hat)
    print('y_hat', y_hat.shape, y_hat)
    # y_hat torch.Size([50])
    show_heatmaps(attention_weights.unsqueeze(0).unsqueeze(0),
                  xlabel='Sorted training inputs',
                  ylabel='Sorted testing inputs')
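# A small numeric check (my own addition, not part of the d2l tutorial): the softmax over
# -(x - x_i)**2 / 2 equals the normalized Gaussian kernel K(x - x_i) / sum_j K(x - x_j)
# with K(u) = exp(-u**2 / 2), i.e. exactly the Nadaraya-Watson weights in
# f(x) = sum_i [K(x - x_i) / sum_j K(x - x_j)] * y_i.
x_demo = torch.tensor(1.0)
kernel = torch.exp(-(x_demo - x_train) ** 2 / 2)          # unnormalized Gaussian kernel
nw_weights = kernel / kernel.sum()                        # normalized kernel weights
sm_weights = torch.softmax(-(x_demo - x_train) ** 2 / 2, dim=0)
print(torch.allclose(nw_weights, sm_weights))             # True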
#==================== 4-3 Parametric attention pooling ====================
class NWKernelRegression(nn.Module):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Learnable scalar of the Gaussian kernel.
        self.w = nn.Parameter(torch.rand((1,), requires_grad=True))

    def forward(self, queries, keys, values):
        '''
        During training: queries = x_train, shape (n_train,)
                         keys    shape (n_train, n_train - 1)
                         values  shape (n_train, n_train - 1)
        '''
        # queries and attention_weights end up with shape (#queries, #key-value pairs).
        # Original queries: (n_queries,); after the transform: (n_queries, #key-value pairs).
        queries = queries.repeat_interleave(keys.shape[1]).reshape((-1, keys.shape[1]))
        # print('transformed queries.shape', queries.shape, queries)
        '''
        transformed queries.shape
        torch.Size([50, 50])
        tensor([[0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],
                [0.1000, 0.1000, 0.1000, ..., 0.1000, 0.1000, 0.1000],
                [0.2000, 0.2000, 0.2000, ..., 0.2000, 0.2000, 0.2000],
                ...,
                [4.7000, 4.7000, 4.7000, ..., 4.7000, 4.7000, 4.7000],
                [4.8000, 4.8000, 4.8000, ..., 4.8000, 4.8000, 4.8000],
                [4.9000, 4.9000, 4.9000, ..., 4.9000, 4.9000, 4.9000]])
        keys.shape
        torch.Size([50, 50])
        tensor([[0.0456, 0.1142, 0.2446, ..., 4.7166, 4.8794, 4.9157],
                [0.0456, 0.1142, 0.2446, ..., 4.7166, 4.8794, 4.9157],
                [0.0456, 0.1142, 0.2446, ..., 4.7166, 4.8794, 4.9157],
                ...,
                [0.0456, 0.1142, 0.2446, ..., 4.7166, 4.8794, 4.9157],
                [0.0456, 0.1142, 0.2446, ..., 4.7166, 4.8794, 4.9157],
                [0.0456, 0.1142, 0.2446, ..., 4.7166, 4.8794, 4.9157]])
        '''
        # Gaussian kernel with learnable width: softmax over -((query - key) * w)^2 / 2.
        self.attention_weights = nn.functional.softmax(
            -((queries - keys) * self.w)**2 / 2, dim=1)
        # print("self.attention_weights", self.attention_weights.shape, self.attention_weights)
        '''
        self.attention_weights
        torch.Size([50, 50])
        tensor([[4.1580e-01, 3.9295e-01, 1.5210e-01, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00],
                [1.3102e-01, 1.7392e-01, 3.4801e-01, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00],
                [4.8966e-03, 9.1298e-03, 9.4441e-02, ..., 0.0000e+00, 0.0000e+00, 0.0000e+00],
                ...,
                [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 3.3589e-01, 1.2297e-01, 3.2375e-05],
                [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 4.2061e-01, 4.8211e-01, 8.1415e-03],
                [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 1.1759e-01, 4.2199e-01, 4.5708e-01]],
               grad_fn=<SoftmaxBackward>)
        '''
        # values has shape (#queries, #key-value pairs).
        # torch.bmm(a, b) is a batched matrix product: a has size (b, h, w), b has size
        # (b, w, m); the batch dims must match and the result has size (b, h, m).
        # Here: (n, 1, m) @ (n, m, 1) -> (n, 1, 1), and reshape(-1) flattens to (n,).
        y_all = torch.bmm(self.attention_weights.unsqueeze(1),
                          values.unsqueeze(-1)).reshape(-1)
        print("predicted y", y_all.shape, y_all)
        '''
        y_all
        torch.Size([50])
        tensor([0.7118, 0.7288, 0.7530, 0.7947, 0.8750, 1.0298, 1.2897, 1.6270, 1.9577,
                2.2429, 2.4977, 2.7139, 2.8639, 2.9507, 3.0031, 3.0505, 3.1153, 3.2099,
                3.3269, 3.4348, 3.4992, 3.5058, 3.4644, 3.3975, 3.3216, 3.2334, 3.1148,  <- peak of the prediction
                2.9579, 2.7886, 2.6518, 2.5684, 2.5253, 2.4968, 2.4572, 2.3833, 2.2582,
                2.0832, 1.8848, 1.6988, 1.5471, 1.4331, 1.3567, 1.3277, 1.3641, 1.4558,
                1.5410, 1.5699, 1.5445, 1.4873, 1.4159], grad_fn=<ViewBackward>)
        '''
        return y_all
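# A tiny sketch (my own illustration, not from the tutorial) of what the learnable w does:
# it acts as an inverse bandwidth, so a larger w concentrates the attention on the nearest keys.
with torch.no_grad():
    diffs = x_test[25] - x_train                      # distances from one query to all training inputs
    for w_demo in (0.1, 5.0):
        weights = torch.softmax(-((diffs * w_demo) ** 2) / 2, dim=0)
        print(w_demo, weights.max().item())           # the largest weight grows as w grows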
# 1 Data initialization
# X_tile has shape (n_train, n_train): every row repeats the full set of training inputs.
X_tile = x_train.repeat((n_train, 1))
print('X_tile', X_tile.shape, X_tile)
'''
X_tile
torch.Size([50, 50])
tensor([[0.0618, 0.0979, 0.2568, ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2568, ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2568, ..., 4.8935, 4.9268, 4.9565],
        ...,
        [0.0618, 0.0979, 0.2568, ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2568, ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2568, ..., 4.8935, 4.9268, 4.9565]])
'''
# Y_tile has shape (n_train, n_train): every row repeats the full set of training outputs.
Y_tile = y_train.repeat((n_train, 1))
'''
torch.eye(3)
tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])
'''
# torch.eye returns a 2-D tensor with ones on the diagonal and zeros elsewhere,
# so (1 - eye) masks out each sample's own position.
x_f = (1 - torch.eye(n_train)).type(torch.bool)
'''
x_f
torch.Size([50, 50])
tensor([[False,  True,  True, ...,  True,  True,  True],
        [ True, False,  True, ...,  True,  True,  True],
        [ True,  True, False, ...,  True,  True,  True],
        ...,
        [ True,  True,  True, ..., False,  True,  True],
        [ True,  True,  True, ...,  True, False,  True],
        [ True,  True,  True, ...,  True,  True, False]])
'''
print('x_f', x_f.shape, x_f)
# keys has shape (n_train, n_train - 1): every training input is matched against the
# key-value pairs of all training samples except itself, and that gives its prediction.
keys = X_tile[x_f].reshape((n_train, -1))
print('keys', keys.shape, keys)
'''
X_tile (before masking, for comparison)
torch.Size([50, 50])
tensor([[0.0618, 0.0979, 0.2568, ..., 4.8935, 4.9268, 4.9565],
        ...,
        [0.0618, 0.0979, 0.2568, ..., 4.8935, 4.9268, 4.9565]])
keys (diagonal removed)
torch.Size([50, 49])
tensor([[0.0979, 0.2568, 0.2891, ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.2568, 0.2891, ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2891, ..., 4.8935, 4.9268, 4.9565],
        ...,
        [0.0618, 0.0979, 0.2568, ..., 4.7557, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2568, ..., 4.7557, 4.8935, 4.9565],
        [0.0618, 0.0979, 0.2568, ..., 4.7557, 4.8935, 4.9268]])
'''
# values has shape (n_train, n_train - 1)
values = Y_tile[x_f].reshape((n_train, -1))
print('values', values.shape, values)
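# A tiny sketch (n=3, my own illustration) of the masking trick used above: removing the
# diagonal from an (n, n) tile leaves, for each row, the other n - 1 entries as keys.
demo = torch.arange(9.).reshape(3, 3)                 # [[0,1,2],[3,4,5],[6,7,8]]
mask = (1 - torch.eye(3)).type(torch.bool)
print(demo[mask].reshape(3, -1))                      # tensor([[1., 2.], [3., 5.], [6., 7.]])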
# 2 Create the model
net = NWKernelRegression()
# 3 Create the loss
loss = nn.MSELoss(reduction='none')
# 4 Create the optimizer
trainer = torch.optim.SGD(net.parameters(), lr=0.5)
# Plotting helper
animator = d2l.Animator(xlabel='epoch', ylabel='loss', xlim=[1, 5])
for epoch in range(5):
    trainer.zero_grad()
    '''
    x_train  (n_train,)
    keys     (n_train, n_train - 1)
    values   (n_train, n_train - 1)
    '''
    y_predict = net(x_train, keys, values)
    l = loss(y_predict, y_train)
    l.sum().backward()
    trainer.step()
    print(f'epoch {epoch + 1}, loss {float(l.sum()):.6f}')
    animator.add(epoch + 1, float(l.sum()))
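# Note on the construction above: because keys and values exclude each sample's own
# key-value pair, every training prediction is a leave-one-out estimate, so the model
# cannot fit a point simply by attending to itself.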
#==================== Testing ====================
# 0-1 Test inputs
x_test = torch.arange(0, 5, 0.1)     # 50 test inputs: 0, 0.1, ..., 4.9
# 0-2 Test ground truth
y_truth = f(x_test)                  # noise-free outputs at the test inputs
# 1 Build keys and values from the training data
n_test = n_train
# keys has shape (n_test, n_train): every row repeats the full set of training inputs (the keys).
print('x_train', x_train.shape, x_train)
'''
x_train torch.Size([50])
tensor([0.1321, 0.4468, 0.4907, 0.5023, 0.5911, 0.6308, 0.7619, 0.9824, 0.9841,
        1.1427, 1.1590, 1.2575, 1.2678, 1.2939, 1.8171, 1.9704, 1.9845, 2.0308,
        2.1194, 2.1260, 2.4450, 2.4946, 2.5947, 2.6076, 2.8287, 2.8463, 3.0713,
        3.0994, 3.1098, 3.3187, 3.5441, 3.5758, 3.6766, 3.7267, 3.8284, 4.0710,
        4.0790, 4.1060, 4.1062, 4.2637, 4.3664, 4.4939, 4.5054, 4.6789, 4.7355,
        4.7434, 4.8369, 4.9438, 4.9527, 4.9534])
'''
# 1-2 Keys built from the training inputs: copy the 50 training inputs into every row.
keys = x_train.repeat((n_test, 1))
print('keys', keys.shape, keys)
'''
x_train  1x50  [x1, x2, ..., x50]
keys     50x50 [
    [x1, x2, ..., x50]
    [x1, x2, ..., x50]
    ... 50 rows
    [x1, x2, ..., x50]
]
tensor([[0.1321, 0.4468, 0.4907, ..., 4.9438, 4.9527, 4.9534],
        [0.1321, 0.4468, 0.4907, ..., 4.9438, 4.9527, 4.9534],
        [0.1321, 0.4468, 0.4907, ..., 4.9438, 4.9527, 4.9534],
        ...,
        [0.1321, 0.4468, 0.4907, ..., 4.9438, 4.9527, 4.9534],
        [0.1321, 0.4468, 0.4907, ..., 4.9438, 4.9527, 4.9534],
        [0.1321, 0.4468, 0.4907, ..., 4.9438, 4.9527, 4.9534]])
'''
# 1-3 Values built from the training outputs
# values has shape (n_test, n_train)
values = y_train.repeat((n_test, 1))
print('values', values.shape, values)
'''
values = [
    [y1, y2, ..., y50]
    [y1, y2, ..., y50]
    ... 50 rows
    [y1, y2, ..., y50]
]
values torch.Size([50, 50])
tensor([[1.2841, 0.6112, 0.5891, ..., 1.5115, 1.7727, 1.9353],
        [1.2841, 0.6112, 0.5891, ..., 1.5115, 1.7727, 1.9353],
        [1.2841, 0.6112, 0.5891, ..., 1.5115, 1.7727, 1.9353],
        ...,
        [1.2841, 0.6112, 0.5891, ..., 1.5115, 1.7727, 1.9353],
        [1.2841, 0.6112, 0.5891, ..., 1.5115, 1.7727, 1.9353],
        [1.2841, 0.6112, 0.5891, ..., 1.5115, 1.7727, 1.9353]])
'''
y_t = net(x_test, keys, values)
print('y_t', y_t.shape, y_t)
'''
y_t   torch.Size([50])    tensor([0.4223, 0.4960, ..., 1.6023])
y_hat torch.Size([50, 1]) tensor([[0.4223], [0.4960], ..., [1.6023]])
'''
y_hat = y_t.unsqueeze(1).detach()
print('y_hat', y_hat.shape, y_hat)
plot_kernel_reg(y_hat)
# show_heatmaps(net.attention_weights.unsqueeze(0).unsqueeze(0),
#               xlabel='Sorted training inputs', ylabel='Sorted testing inputs')