pytorch(10.2.2) Attention pooling: theory and code tests

Reference implementation (single-head self-attention):
https://github.com/Enzo-MiMan/cv_related_collections/blob/main/deep_learning_basic/self-attention/self_attention.py

import torch.nn as nn
import torch
import matplotlib.pyplot as plt


class Self_Attention(nn.Module):
    def __init__(self, dim, dk, dv):
        super(Self_Attention, self).__init__()
        self.scale = dk ** -0.5          # 1/sqrt(dk), scaling for the dot products
        self.q = nn.Linear(dim, dk)      # query projection: dim -> dk
        self.k = nn.Linear(dim, dk)      # key projection:   dim -> dk
        self.v = nn.Linear(dim, dv)      # value projection: dim -> dv

    def forward(self, x):
        # x: (batch, seq_len, dim)
        q = self.q(x)                    # (batch, seq_len, dk)
        k = self.k(x)                    # (batch, seq_len, dk)
        v = self.v(x)                    # (batch, seq_len, dv)

        # scaled dot-product attention: softmax(q k^T / sqrt(dk)) v
        attn = (q @ k.transpose(-2, -1)) * self.scale   # (batch, seq_len, seq_len)
        attn = attn.softmax(dim=-1)

        x = attn @ v                     # (batch, seq_len, dv)
        return x


att = Self_Attention(dim=2, dk=2, dv=3)
x = torch.rand((1, 4, 2))      # batch of 1, sequence of 4 tokens, feature dim 2
output = att(x)                # shape (1, 4, 3)
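
# Optional cross-check (not in the original post): torch.nn.functional.scaled_dot_product_attention,
# available in PyTorch 2.0+, computes the same softmax(q k^T / sqrt(dk)) v, so reusing this
# module's projections should reproduce `output` up to floating-point error.
import torch.nn.functional as F

with torch.no_grad():
    q, k, v = att.q(x), att.k(x), att.v(x)
    ref = F.scaled_dot_product_attention(q, k, v)
print(output.shape)                                # torch.Size([1, 4, 3])
print(torch.allclose(output, ref, atol=1e-6))      # expected: True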


# Multi-head attention (ViT-style), kept commented out as in the original post. Note: the
# original snippet called self.proj_drop without defining it; a Dropout layer (assumed) is
# added in __init__ so the code would run if uncommented.
# class MultiHead_Attention(nn.Module):
#     def __init__(self, dim, num_heads):
#
#         super(MultiHead_Attention, self).__init__()
#         self.num_heads = num_heads              # e.g. 12
#         head_dim = dim // num_heads             # e.g. 768 // 12 = 64
#         self.scale = head_dim ** -0.5           # 1/sqrt(head_dim)
#         self.qkv = nn.Linear(dim, dim * 3)      # fused q/k/v projection
#         self.proj = nn.Linear(dim, dim)
#         self.proj_drop = nn.Dropout(0.)         # missing in the original snippet (assumed dropout)
#
#     def forward(self, x):
#         B, N, C = x.shape
#         # (B, N, 3C) -> (B, N, 3, heads, C/heads) -> (3, B, heads, N, C/heads)
#         qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
#         q, k, v = qkv[0], qkv[1], qkv[2]
#
#         attn = (q @ k.transpose(-2, -1)) * self.scale
#         attn = attn.softmax(dim=-1)
#
#         x = (attn @ v).transpose(1, 2).reshape(B, N, C)
#         x = self.proj(x)
#         x = self.proj_drop(x)
#         return x
#
# att = MultiHead_Attention(dim=768, num_heads=12)
# x = torch.rand((1, 197, 768))
# output = att(x)
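
# A minimal shape walk-through (not in the original post) of the fused qkv reshape/permute
# used in the commented multi-head block above (dim=768, num_heads=12 => head_dim=64);
# the names below are illustrative only.
B, N, C, H = 1, 197, 768, 12
qkv_demo = torch.rand(B, N, 3 * C)
qkv_demo = qkv_demo.reshape(B, N, 3, H, C // H).permute(2, 0, 3, 1, 4)
print(qkv_demo.shape)        # torch.Size([3, 1, 12, 197, 64])
q_d, k_d, v_d = qkv_demo[0], qkv_demo[1], qkv_demo[2]
attn_d = ((q_d @ k_d.transpose(-2, -1)) * (C // H) ** -0.5).softmax(dim=-1)
print(attn_d.shape)          # torch.Size([1, 12, 197, 197])
out_d = (attn_d @ v_d).transpose(1, 2).reshape(B, N, C)
print(out_d.shape)           # torch.Size([1, 197, 768])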

  

Nadaraya-Watson kernel regression (attention pooling) example, following:
https://zh.d2l.ai/chapter_attention-mechanisms/nadaraya-waston.html


from d2l import torch as d2l
import torch
from torch import nn


#@save
def show_heatmaps(matrices, xlabel, ylabel, titles=None, figsize=(2.5, 2.5),
                  cmap='Reds'):
    """显示矩阵热图"""
    d2l.use_svg_display()
    num_rows, num_cols = matrices.shape[0], matrices.shape[1]
    fig, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize,
                                 sharex=True, sharey=True, squeeze=False)
    for i, (row_axes, row_matrices) in enumerate(zip(axes, matrices)):
        for j, (ax, matrix) in enumerate(zip(row_axes, row_matrices)):
            pcm = ax.imshow(matrix.detach().numpy(), cmap=cmap)
            if i == num_rows - 1:
                ax.set_xlabel(xlabel)
            if j == 0:
                ax.set_ylabel(ylabel)
            if titles:
                ax.set_title(titles[j])
    fig.colorbar(pcm, ax=axes, shrink=0.6)
    d2l.plt.show()


# attention_weights = torch.eye(10).reshape((1, 1, 10, 10))
# show_heatmaps(attention_weights, xlabel='Keys', ylabel='Queries')


#==================== 1. Training data ====================

n_train = 50  # number of training samples
# torch.rand draws from the uniform distribution on [0, 1); scaling by 5 gives 50 random inputs in [0, 5)
x_source = torch.rand(n_train) * 5
x_train, _ = torch.sort(x_source)   # training inputs, sorted
print('x_train', x_train)

def f(x):
    return 2 * torch.sin(x) + x**0.8

y_train = f(x_train) + torch.normal(0.0, 0.5, (n_train,))  # training outputs: f(x) plus Gaussian noise

#==================== 2. Test data ====================
x_test = torch.arange(0, 5, 0.1)  # test inputs: 0 to 5 (exclusive) in steps of 0.1, i.e. 50 points
y_truth = f(x_test)     # noise-free outputs at the test inputs
# n_test = len(x_test)  # number of test samples
# n_test


# 3. The following function plots all training samples (circles), the noise-free
#    data-generating function ("Truth"), and the learned prediction ("Pred").
def plot_kernel_reg(y_hat):
    d2l.plot(x_test, [y_truth, y_hat], 'x', 'y', legend=['Truth', 'Pred'],
             xlim=[0, 5], ylim=[-1, 5])
    d2l.plt.plot(x_train, y_train, 'o', alpha=0.5)
    d2l.plt.show()

#======================= 4-1 Average pooling
# The training step is skipped here; the prediction is computed directly, using only
# the final "pooling" step: every prediction is the mean of the training outputs.

run=0
if run:
    y_out = y_train.mean()   # the pooled output: a single scalar, the mean of y_train

    y_hat = torch.repeat_interleave(y_out, n_train)
    # repeat the scalar mean n_train times so there is one prediction per test input

    #print('y_hat',y_hat)
    # repeat_interleave flattens multi-dimensional input by default:
    # >>> y = torch.tensor([[1, 2], [3, 4]])
    # >>> torch.repeat_interleave(y, 2)
    # tensor([1, 1, 2, 2, 3, 3, 4, 4])

    # x axis: x_test; curves: [ground truth y_truth, average-pooling prediction y_hat]
    plot_kernel_reg(y_hat)
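
# For reference (as in the linked d2l chapter), this baseline is plain average pooling,
# which ignores the query x entirely:
#     f(x) = (1/n) * sum_{i=1}^{n} y_i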

#==================== 4-2 Non-parametric attention pooling ==========================
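
# The code below implements Nadaraya-Watson attention pooling with a Gaussian kernel:
#     f(x) = sum_{i=1}^{n} softmax_i( -(x - x_i)^2 / 2 ) * y_i
# each query x attends to the training inputs x_i (the keys), and the training outputs
# y_i (the values) are averaged with those attention weights.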

run=0
if run:
    # Weight the outputs y_i according to the position of the query input:
    # X_repeat has shape (n_test, n_train); every row repeats the same test input
    # (i.e. the same query).
    print('x_test',x_test.shape,x_test)
    # torch.Size([50])
    '''
    x_test torch.Size([50]) 
    tensor([0.0000, 0.1000, 0.2000, 0.3000, 0.4000, 0.5000, 0.6000, 0.7000, 0.8000,
            0.9000, 1.0000, 1.1000, 1.2000, 1.3000, 1.4000, 1.5000, 1.6000, 1.7000,
            1.8000, 1.9000, 2.0000, 2.1000, 2.2000, 2.3000, 2.4000, 2.5000, 2.6000,
            2.7000, 2.8000, 2.9000, 3.0000, 3.1000, 3.2000, 3.3000, 3.4000, 3.5000,
            3.6000, 3.7000, 3.8000, 3.9000, 4.0000, 4.1000, 4.2000, 4.3000, 4.4000,
            4.5000, 4.6000, 4.7000, 4.8000, 4.9000])
    '''
    #print('x_test.repeat_interleave(n_train)',x_test.repeat_interleave(n_train).shape,x_test.repeat_interleave(n_train))
    # torch.Size([2500])
    # build the query table
    X_repeat = x_test.repeat_interleave(n_train).reshape((-1, n_train))
    print('X_repeat',X_repeat.shape,X_repeat)
    # torch.Size([50, 50])
    '''
    X_repeat
    torch.Size([50, 50])
    tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
            [0.1000, 0.1000, 0.1000,  ..., 0.1000, 0.1000, 0.1000],
            [0.2000, 0.2000, 0.2000,  ..., 0.2000, 0.2000, 0.2000],
            ...,
            [4.7000, 4.7000, 4.7000,  ..., 4.7000, 4.7000, 4.7000],
            [4.8000, 4.8000, 4.8000,  ..., 4.8000, 4.8000, 4.8000],
            [4.9000, 4.9000, 4.9000,  ..., 4.9000, 4.9000, 4.9000]])

    x_train
    torch.Size([1, 50])
    tensor([0.1249, 0.2723, 0.3242, 0.3747, 0.6435, 0.7526, 0.7749, 0.9694, 0.9709,
            1.1660, 1.3965, 1.4592, 1.5059, 1.6240, 1.6567, 1.9198, 1.9289, 1.9650,
            1.9665, 2.0000, 2.0822, 2.1460, 2.2586, 2.2702, 2.3153, 2.4764, 2.6111,
            2.6732, 2.9376, 3.1270, 3.2933, 3.3839, 3.3909, 3.4030, 3.4695, 3.6524,
            3.6915, 3.7456, 3.8196, 3.8434, 3.8556, 3.9236, 4.2003, 4.2841, 4.2882,
            4.5061, 4.5877, 4.6141, 4.7991, 4.8649])

    '''

    # x_train holds the keys. attention_weights has shape (n_test, n_train):
    # each row holds the attention weights used to combine the values (y_train)
    # for the corresponding query.
    print((X_repeat - x_train).shape)   # torch.Size([50, 50])
    # X_repeat: query table; x_train: training inputs (the keys)
    # positional weights from a Gaussian kernel, normalized row-wise
    attention_weights = nn.functional.softmax(-(X_repeat - x_train)**2 / 2, dim=1)

    # each element of y_hat is a weighted average of the values y_train,
    # with the attention weights as the weights
    y_hat = torch.matmul(attention_weights, y_train)
    plot_kernel_reg(y_hat)
    print('y_hat',y_hat.shape,y_hat)
    #y_hat torch.Size([50])

    show_heatmaps(attention_weights.unsqueeze(0).unsqueeze(0),
                    xlabel='Sorted training inputs',
                    ylabel='Sorted testing inputs')



#==================== 4-3 Parametric attention pooling ==========================
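
# Parametric attention pooling uses the same Gaussian-kernel form, but with a learnable
# scalar w that controls the kernel width (as in the d2l chapter):
#     f(x) = sum_{i=1}^{n} softmax_i( -((x - x_i) * w)^2 / 2 ) * y_i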

i = 0   # debug flag read inside forward() below (leftover from the original notes)

class NWKernelRegression(nn.Module):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # learnable scalar w controlling the kernel width
        self.w = nn.Parameter(torch.rand((1,), requires_grad=True))

    '''
    During training:
      queries = x_train, shape (n_train,)        = (50,)
      keys    shape (n_train, n_train - 1)       = (50, 49)
      values  shape (n_train, n_train - 1)       = (50, 49)
    '''
    def forward(self, queries, keys, values):
        if i == 0:
            pass
            # i = i + 1
            # print('original queries.shape', queries.shape, queries)
            # print('keys.shape', keys.shape, keys)
            # training:  keys has shape (50, 49)
        else:
            pass
            # testing:   keys has shape (50, 50)

        # queries and attention_weights have shape (#queries, #key-value pairs)
        '''
        before the transform: queries has shape (n_queries,)
        after the transform:  queries has shape (n_queries, keys.shape[1])
        '''
        queries = queries.repeat_interleave(keys.shape[1]).reshape((-1, keys.shape[1]))

        #print('transformed queries.shape', queries.shape, queries)
        '''
        transformed queries.shape
        torch.Size([50, 50]) 
        tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
                [0.1000, 0.1000, 0.1000,  ..., 0.1000, 0.1000, 0.1000],
                [0.2000, 0.2000, 0.2000,  ..., 0.2000, 0.2000, 0.2000],
                ...,
                [4.7000, 4.7000, 4.7000,  ..., 4.7000, 4.7000, 4.7000],
                [4.8000, 4.8000, 4.8000,  ..., 4.8000, 4.8000, 4.8000],
                [4.9000, 4.9000, 4.9000,  ..., 4.9000, 4.9000, 4.9000]])

        keys.shape 
        torch.Size([50, 50]) 
        tensor([[0.0456, 0.1142, 0.2446,  ..., 4.7166, 4.8794, 4.9157],
                [0.0456, 0.1142, 0.2446,  ..., 4.7166, 4.8794, 4.9157],
                [0.0456, 0.1142, 0.2446,  ..., 4.7166, 4.8794, 4.9157],
                ...,
                [0.0456, 0.1142, 0.2446,  ..., 4.7166, 4.8794, 4.9157],
                [0.0456, 0.1142, 0.2446,  ..., 4.7166, 4.8794, 4.9157],
                [0.0456, 0.1142, 0.2446,  ..., 4.7166, 4.8794, 4.9157]])
        '''


        
        # Gaussian kernel with learnable width w, normalized row-wise via softmax
        self.attention_weights = nn.functional.softmax(-((queries - keys) * self.w)**2 / 2, dim=1)
        #print("self.attention_weights ",self.attention_weights.shape,self.attention_weights)
        '''
        self.attention_weights  
        torch.Size([50, 50]) 
        tensor([[4.1580e-01, 3.9295e-01, 1.5210e-01,  ..., 0.0000e+00, 0.0000e+00, 0.0000e+00],
                [1.3102e-01, 1.7392e-01, 3.4801e-01,  ..., 0.0000e+00, 0.0000e+00, 0.0000e+00],
                [4.8966e-03, 9.1298e-03, 9.4441e-02,  ..., 0.0000e+00, 0.0000e+00, 0.0000e+00],
                ...,
                [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 3.3589e-01, 1.2297e-01, 3.2375e-05],
                [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 4.2061e-01, 4.8211e-01, 8.1415e-03],
                [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.1759e-01, 4.2199e-01, 4.5708e-01]], 
                grad_fn=<SoftmaxBackward>)
        '''
        # values has shape (#queries, #key-value pairs)
        '''
        torch.bmm(a, b) is a batched matrix multiplication:
        a has shape (b, h, w) and b has shape (b, w, m);
        the batch dimensions must match and a's last dim must equal b's middle dim,
        and the output has shape (b, h, m).

        Here values has shape (50, keys.shape[1]).
        '''
        # attention_weights: (50, k) -> unsqueeze(1)  -> (50, 1, k)
        # values:            (50, k) -> unsqueeze(-1) -> (50, k, 1)
        # bmm gives (50, 1, 1); reshape(-1) flattens it to (50,)
        y_all = torch.bmm(self.attention_weights.unsqueeze(1), values.unsqueeze(-1)).reshape(-1)
        print("y prediction", y_all.shape, y_all)

        '''
        y_all
        torch.Size([50])
        y_all   tensor([0.7118, 0.7288, 0.7530, 0.7947, 0.8750, 1.0298, 1.2897, 1.6270, 1.9577,
                        2.2429, 2.4977, 2.7139, 2.8639, 2.9507, 3.0031, 3.0505, 3.1153, 3.2099,
                        3.3269, 3.4348, 3.4992, 3.5058, 3.4644, 3.3975, 3.3216, 3.2334, 3.1148,   <- peak of the predictions
                        2.9579, 2.7886, 2.6518, 2.5684, 2.5253, 2.4968, 2.4572, 2.3833, 2.2582,
                        2.0832, 1.8848, 1.6988, 1.5471, 1.4331, 1.3567, 1.3277, 1.3641, 1.4558,
                        1.5410, 1.5699, 1.5445, 1.4873, 1.4159], grad_fn=<ViewBackward>)
        '''
        return y_all
    
# 1. Data preparation
# X_tile has shape (n_train, n_train); every row repeats the same (sorted) training inputs
X_tile = x_train.repeat((n_train, 1))
print('X_tile',X_tile.shape,X_tile)
'''
X_tile 
torch.Size([50, 50]) 
tensor([[0.0618, 0.0979, 0.2568,  ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2568,  ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2568,  ..., 4.8935, 4.9268, 4.9565],
        ...,
        [0.0618, 0.0979, 0.2568,  ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2568,  ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2568,  ..., 4.8935, 4.9268, 4.9565]])
'''
# Y_tile has shape (n_train, n_train); every row repeats the same training outputs
Y_tile = y_train.repeat((n_train, 1))
'''
torch.eye(3)
tensor([[ 1.,  0.,  0.],
        [ 0.,  1.,  0.],
        [ 0.,  0.,  1.]])
'''
x_f = (1 - torch.eye(n_train)).type(torch.bool)   # boolean mask: False on the diagonal, True elsewhere
'''
x_f 
torch.Size([50, 50]) 
tensor([[False,  True,  True,  ...,  True,  True,  True],
        [ True, False,  True,  ...,  True,  True,  True],
        [ True,  True, False,  ...,  True,  True,  True],
        ...,
        [ True,  True,  True,  ..., False,  True,  True],
        [ True,  True,  True,  ...,  True, False,  True],
        [ True,  True,  True,  ...,  True,  True, False]])
'''
print('x_f',x_f.shape,x_f)
# keys has shape (n_train, n_train - 1). torch.eye(n) returns a 2-D tensor with ones on the
# diagonal and zeros elsewhere, so the boolean mask above drops each sample's own entry:
# every training input is matched against the key-value pairs of all *other* training
# samples, giving a leave-one-out prediction for each one.
keys = X_tile[x_f].reshape((n_train, -1))
print('keys', keys.shape, keys)
'''
X_tile 
torch.Size([50, 50]) 
tensor([[0.0618, 0.0979, 0.2568,  ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2568,  ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2568,  ..., 4.8935, 4.9268, 4.9565],
        ...,
        [0.0618, 0.0979, 0.2568,  ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2568,  ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2568,  ..., 4.8935, 4.9268, 4.9565]])
'''
'''
torch.Size([50, 49])
tensor([[0.0979, 0.2568, 0.2891,  ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.2568, 0.2891,  ..., 4.8935, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2891,  ..., 4.8935, 4.9268, 4.9565],
        ...,
        [0.0618, 0.0979, 0.2568,  ..., 4.7557, 4.9268, 4.9565],
        [0.0618, 0.0979, 0.2568,  ..., 4.7557, 4.8935, 4.9565],
        [0.0618, 0.0979, 0.2568,  ..., 4.7557, 4.8935, 4.9268]])

'''

# values has shape (n_train, n_train - 1)
values = Y_tile[x_f].reshape((n_train, -1))   # same leave-one-out mask as for keys
print('values',values.shape,values)
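
# A tiny illustration (not in the original post) of the leave-one-out masking trick used
# above, with 3 toy samples: each row keeps every entry except its own diagonal element.
_demo = torch.tensor([[1., 2., 3.],
                      [1., 2., 3.],
                      [1., 2., 3.]])
_mask = (1 - torch.eye(3)).type(torch.bool)
print(_demo[_mask].reshape((3, -1)))
# tensor([[2., 3.],
#         [1., 3.],
#         [1., 2.]])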

# 2. Build the model
net = NWKernelRegression()
# 3. Loss (per-element squared error; summed below)
loss = nn.MSELoss(reduction='none')
# 4. Optimizer
trainer = torch.optim.SGD(net.parameters(), lr=0.5)
# Plotting helper for the training curve
animator = d2l.Animator(xlabel='epoch', ylabel='loss', xlim=[1, 5])

for epoch in range(5):
    trainer.zero_grad()
    '''
    x_train shape (n_train,)             = (50,)
    keys    shape (n_train, n_train - 1) = (50, 49)
    values  shape (n_train, n_train - 1) = (50, 49)
    '''
    y_predict = net(x_train, keys, values)
    l = loss(y_predict, y_train)
    l.sum().backward()
    trainer.step()
    print(f'epoch {epoch + 1}, loss {float(l.sum()):.6f}')
    animator.add(epoch + 1, float(l.sum()))
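
# Small addition (not in the original post): inspect the learned kernel-width parameter.
print('learned w:', float(net.w))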

########## ======================= Testing =============
# 0-1 Test inputs
x_test = torch.arange(0, 5, 0.1)  # test inputs: 0 to 5 (exclusive) in steps of 0.1, 50 points
'''
x_test has shape (50,): [0.0, 0.1, 0.2, ..., 4.8, 4.9]
'''
# 0-2 Noise-free test targets
y_truth = f(x_test)     # ground-truth outputs at the test inputs

# 1. Build keys and values from the training data
n_test = n_train
# keys has shape (n_test, n_train); every row repeats the same training inputs (i.e. the same keys)
print('x_train', x_train.shape, x_train)

'''
x_train torch.Size([50]) 
tensor([0.1321, 0.4468, 0.4907, 0.5023, 0.5911, 0.6308, 0.7619, 0.9824, 0.9841,
        1.1427, 1.1590, 1.2575, 1.2678, 1.2939, 1.8171, 1.9704, 1.9845, 2.0308,
        2.1194, 2.1260, 2.4450, 2.4946, 2.5947, 2.6076, 2.8287, 2.8463, 3.0713,
        3.0994, 3.1098, 3.3187, 3.5441, 3.5758, 3.6766, 3.7267, 3.8284, 4.0710,
        4.0790, 4.1060, 4.1062, 4.2637, 4.3664, 4.4939, 4.5054, 4.6789, 4.7355,
        4.7434, 4.8369, 4.9438, 4.9527, 4.9534])
'''
# 1-2 Keys built from the training inputs
keys = x_train.repeat((n_test, 1))  # repeat the 50 training inputs as rows -> shape (n_test, n_train)
print('keys', keys.shape, keys)

'''
x_train, shape (50,): [x1, x2, ..., x50]

keys, shape (50, 50):
    [[x1, x2, ..., x50],
     [x1, x2, ..., x50],
     ...  (50 identical rows)
     [x1, x2, ..., x50]]

tensor([[0.1321, 0.4468, 0.4907,  ..., 4.9438, 4.9527, 4.9534],
        [0.1321, 0.4468, 0.4907,  ..., 4.9438, 4.9527, 4.9534],
        [0.1321, 0.4468, 0.4907,  ..., 4.9438, 4.9527, 4.9534],
        ...,
        [0.1321, 0.4468, 0.4907,  ..., 4.9438, 4.9527, 4.9534],
        [0.1321, 0.4468, 0.4907,  ..., 4.9438, 4.9527, 4.9534],
        [0.1321, 0.4468, 0.4907,  ..., 4.9438, 4.9527, 4.9534]])

'''

# 1-3 Values built from the training outputs
# values has shape (n_test, n_train)
values = y_train.repeat((n_test, 1))
print('values', values.shape, values)

'''
values, shape (50, 50):
    [[y1, y2, ..., y50],
     [y1, y2, ..., y50],
     ...  (50 identical rows)
     [y1, y2, ..., y50]]


values torch.Size([50, 50]) 
 tensor([[1.2841, 0.6112, 0.5891,  ..., 1.5115, 1.7727, 1.9353],
        [1.2841, 0.6112, 0.5891,  ..., 1.5115, 1.7727, 1.9353],
        [1.2841, 0.6112, 0.5891,  ..., 1.5115, 1.7727, 1.9353],
        ...,
        [1.2841, 0.6112, 0.5891,  ..., 1.5115, 1.7727, 1.9353],
        [1.2841, 0.6112, 0.5891,  ..., 1.5115, 1.7727, 1.9353],
        [1.2841, 0.6112, 0.5891,  ..., 1.5115, 1.7727, 1.9353]])
'''


# 2. Predict at the test inputs with the learned kernel width w
y_t = net(x_test, keys, values)
print('y_t', y_t.shape, y_t)
'''
y_t   torch.Size([50])    tensor([0.4223, 0.4960,...,1.6023])
y_hat torch.Size([50, 1]) tensor([[0.4223],[0.4960],...,[1.6023]])
'''
y_hat = y_t.unsqueeze(1).detach()   # (50,) -> (50, 1), detached from the graph for plotting
print('y_hat', y_hat.shape, y_hat)
plot_kernel_reg(y_hat)


#show_heatmaps(net.attention_weights.unsqueeze(0).unsqueeze(0),xlabel='Sorted training inputs',ylabel='Sorted testing inputs')

  

posted on 2023-10-17 15:43 by MKT-porter