Transformer


Model Structure

The Transformer model consists of the following parts:

  • Encoder stack (left side): N encoder blocks stacked together; the input is the sum of the source token embedding (src emb) and the positional embedding (the sinusoidal form is given below), and the output is the encoded result, memory
  • Decoder stack (right side): N decoder blocks stacked together; the input is the sum of the target token embedding (tgt emb) and the positional embedding, together with the encoder output memory, and the output is the decoded representation
  • Linear projection + softmax: maps the decoded representation at the last position to the predicted probability distribution over the next token
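
The positional embedding built in __init__ below is the standard sinusoidal encoding from the original paper, which is what the code computes via div_term:

\[ PE_{(pos, 2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{model}}}\right), \qquad PE_{(pos, 2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{model}}}\right) \]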
class Transform(nn.Module):
    def __init__(self, encoder, decoder, src_embed, tgt_embed, d_model, tgt_vocab, N):
        super(Transform, self).__init__()
        self.encoders = nn.ModuleList([copy.deepcopy(encoder) for _ in range(N)])
        self.decoders = nn.ModuleList([copy.deepcopy(decoder) for _ in range(N)])
        self.proj = nn.Linear(d_model, tgt_vocab) # linear projection that maps d_model-dimensional vectors to the target vocabulary size
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.d_model = d_model
        
        self.max_len = 5000 # maximum sequence length supported by the positional encoding
        pe = torch.zeros(self.max_len, d_model) # positional encoding matrix
        position = torch.arange(0, self.max_len).unsqueeze(1) # position indices
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model) # frequencies
        )
        pe[:, 0::2] = torch.sin(position * div_term) # even embedding dimensions use sine
        pe[:, 1::2] = torch.cos(position * div_term) # odd embedding dimensions use cosine
        pe = pe.unsqueeze(0) # add a batch dimension
        self.register_buffer("pe", pe) # register as a buffer: saved with the model but not trained

    def positional_encoding(self, x):
        return x + self.pe[:, : x.size(1)].requires_grad_(False)
    
    def encode(self, src):
        x = self.positional_encoding(self.src_embed(src))
        for encoder in self.encoders:
            x = encoder(x)
        return x

    def decode(self, memory, tgt, mask):
        x = self.positional_encoding(self.tgt_embed(tgt))
        for decoder in self.decoders:
            x = decoder(x, memory, mask)
        return x

    def generator(self, x):
        # map decoder outputs to log-probabilities over the target vocabulary
        return log_softmax(self.proj(x), dim=-1)

    def forward(self, src, tgt, mask):
        memory = self.encode(src)
        x = self.decode(memory, tgt, mask)
        return self.generator(x)

 

Encoder

Encoder input: the sum of the token embedding and the positional embedding, shape (batch, sequence_len, d_model)

Encoder output: the encoded embedding, shape (batch, sequence_len, d_model)

Each encoder block consists of the following parts:

  • multi_self_attention(src, src, src): captures the dependencies between the input tokens
  • add & norm: the residual connection mitigates vanishing gradients, and LayerNorm normalization improves training stability
  • FFN: the position-wise feed-forward network first expands the embedding dimension and then projects it back down, modeling interactions within each embedding and increasing the model's expressive power (see the formula after this list)
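
In the code below the FFN is implemented as Linear → ReLU → Linear, which corresponds to the standard position-wise feed-forward network:

\[ \text{FFN}(x) = \max(0, xW_1 + b_1)W_2 + b_2 \]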
class Encoder(nn.Module):
    def __init__(self, d_model, d_ff, multi_attn):
        super(Encoder, self).__init__()
        self.multi_attn = multi_attn
        self.layer_norm = nn.ModuleList([LayerNorm(d_model) for _ in range(2)])
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )

    def forward(self, x):
        x1 = self.multi_attn(x, x, x)
        x2 = self.layer_norm[0](x + x1)
        x3 = self.ffn(x2)
        x4 = self.layer_norm[1](x2 + x3)
        return x4

 

Decoder

Each decoder block consists of the following parts:

  • masked_multi_attention(tgt, tgt, tgt, mask): captures the dependencies between target tokens; the mask prevents the current position from attending to later positions so no future information leaks in, producing tgt_memory (a quick check of the mask is shown after this list)
  • cross_multi_attention(tgt_memory, memory, memory): uses the target-side representation as the query to read from the encoder output memory and decode the prediction
  • add & norm: the residual connection mitigates vanishing gradients, and LayerNorm normalization improves training stability
  • FFN: the position-wise feed-forward network first expands the embedding dimension and then projects it back down, modeling interactions within each embedding and increasing the model's expressive power
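
As a quick sanity check of the causal mask used by the masked self-attention, subsequent_mask (defined in the full code at the end of the post) only lets each position attend to itself and to earlier positions:

print(subsequent_mask(4))
# tensor([[[ True, False, False, False],
#          [ True,  True, False, False],
#          [ True,  True,  True, False],
#          [ True,  True,  True,  True]]])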
class Decoder(nn.Module):
    def __init__(self, d_model, d_ff, multi_attn):
        super(Decoder, self).__init__()
        self.mask_multi_attn = copy.deepcopy(multi_attn)
        self.cross_multi_attn = copy.deepcopy(multi_attn)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.layer_norm = nn.ModuleList([LayerNorm(d_model) for _ in range(3)])

    def forward(self, x, memory, mask):
        x1 = self.mask_multi_attn(x, x, x, mask)
        x2 = self.layer_norm[0](x + x1)
        x3 = self.cross_multi_attn(x2, memory, memory)
        x4 = self.layer_norm[1](x2 + x3)
        x5 = self.ffn(x4)
        x6 = self.layer_norm[2](x4 + x5)
        return x6

 

Model Training

def train_test(d_model=32, d_ff=64, h=8, src_vocab=11, tgt_vocab=11, N=2):
    encoder = Encoder(d_model,d_ff,MultiHeadedAttention(h,d_model))
    decoder = Decoder(d_model,d_ff,MultiHeadedAttention(h,d_model))
    src_embed = Embeddings(d_model, src_vocab)
    tgt_embed = Embeddings(d_model, tgt_vocab)
    model = Transform(encoder, decoder, src_embed, tgt_embed, d_model, tgt_vocab, N)
    model.train() # set the model to training mode

    # the generator already returns log-probabilities (log_softmax), so use NLLLoss;
    # CrossEntropyLoss would apply log_softmax a second time
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    data_iter = data_gen(11, 3, 10)
    for i, batch in enumerate(data_iter):
        optimizer.zero_grad()  # clear gradients
        outputs = model.forward(batch.src, batch.tgt, batch.tgt_mask)  # forward pass
        loss = criterion(outputs.contiguous().view(-1, outputs.size(-1)), batch.tgt_y.contiguous().view(-1))  # compute the loss
        print('loss: ', loss)
        loss.backward()  # backpropagation
        optimizer.step()  # update parameters

 

Model Inference

def inference_test(d_model=32, d_ff=64, h=8, src_vocab=11, tgt_vocab=11, N=2):
    encoder = Encoder(d_model,d_ff,MultiHeadedAttention(h,d_model))
    decoder = Decoder(d_model,d_ff,MultiHeadedAttention(h,d_model))
    src_embed = Embeddings(d_model, src_vocab)
    tgt_embed = Embeddings(d_model, tgt_vocab)
    test_model = Transform(encoder, decoder, src_embed, tgt_embed, d_model, tgt_vocab, N)
    test_model.eval() # set the model to evaluation mode

    src = torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]) # test input
    memory = test_model.encode(src)
    print('memory shape: ', memory.shape)
    tgt = torch.zeros(1, 1).type_as(src)

    # generate the target sequence step by step (greedy decoding)
    for i in range(9):
        out = test_model.decode(
            memory, tgt, subsequent_mask(tgt.size(1)).type_as(src.data)
        )
        print('out shape: ', out.shape)
        prob = test_model.generator(out[:, -1]) # next-token distribution from the embedding at the last position
        # print('prob: ', prob)
        _, next_word = torch.max(prob, dim=1) # pick the index of the most probable token
        next_word = next_word.data[0]
        # print('next_word: ', next_word)
        # append the generated token to the target sequence
        tgt = torch.cat(
            [tgt, torch.tensor([[next_word]], dtype=torch.long)], dim=1
        )

    print("Example Untrained Model Prediction:", tgt)

 

Full Code

import torch
import torch.nn as nn
from torch.nn.functional import log_softmax
import math
import copy

class Transform(nn.Module):
    def __init__(self, encoder, decoder, src_embed, tgt_embed, d_model, tgt_vocab, N):
        super(Transform, self).__init__()
        self.encoders = nn.ModuleList([copy.deepcopy(encoder) for _ in range(N)])
        self.decoders = nn.ModuleList([copy.deepcopy(decoder) for _ in range(N)])
        self.proj = nn.Linear(d_model, tgt_vocab) # linear projection that maps d_model-dimensional vectors to the target vocabulary size
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.d_model = d_model
        
        self.max_len = 5000 # maximum sequence length supported by the positional encoding
        pe = torch.zeros(self.max_len, d_model) # positional encoding matrix
        position = torch.arange(0, self.max_len).unsqueeze(1) # position indices
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model) # frequencies
        )
        pe[:, 0::2] = torch.sin(position * div_term) # even embedding dimensions use sine
        pe[:, 1::2] = torch.cos(position * div_term) # odd embedding dimensions use cosine
        pe = pe.unsqueeze(0) # add a batch dimension
        self.register_buffer("pe", pe) # register as a buffer: saved with the model but not trained

    def positional_encoding(self, x):
        return x + self.pe[:, : x.size(1)].requires_grad_(False)
    
    def encode(self, src):
        x = self.positional_encoding(self.src_embed(src))
        for encoder in self.encoders:
            x = encoder(x)
        return x

    def decode(self, memory, tgt, mask):
        x = self.positional_encoding(self.tgt_embed(tgt))
        for decoder in self.decoders:
            x = decoder(x, memory, mask)
        return x

    def generator(self, x):
        # map decoder outputs to log-probabilities over the target vocabulary
        return log_softmax(self.proj(x), dim=-1)

    def forward(self, src, tgt, mask):
        memory = self.encode(src)
        x = self.decode(memory, tgt, mask)
        return self.generator(x)


class Encoder(nn.Module):
    def __init__(self, d_model, d_ff, multi_attn):
        super(Encoder, self).__init__()
        self.multi_attn = multi_attn
        self.layer_norm = nn.ModuleList([LayerNorm(d_model) for _ in range(2)])
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )

    def forward(self, x):
        x1 = self.multi_attn(x, x, x)
        x2 = self.layer_norm[0](x + x1)
        x3 = self.ffn(x2)
        x4 = self.layer_norm[1](x2 + x3)
        return x4


class Decoder(nn.Module):
    def __init__(self, d_model, d_ff, multi_attn):
        super(Decoder, self).__init__()
        self.mask_multi_attn = copy.deepcopy(multi_attn)
        self.cross_multi_attn = copy.deepcopy(multi_attn)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.layer_norm = nn.ModuleList([LayerNorm(d_model) for _ in range(3)])

    def forward(self, x, memory, mask):
        x1 = self.mask_multi_attn(x, x, x, mask)
        x2 = self.layer_norm[0](x + x1)
        x3 = self.cross_multi_attn(x2, memory, memory)
        x4 = self.layer_norm[1](x2 + x3)
        x5 = self.ffn(x4)
        x6 = self.layer_norm[2](x4 + x5)
        return x6


class LayerNorm(nn.Module):
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2


class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model):
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(4)]) # four linear layers: projections for query, key, value, and the output
        self.attn = None

    def attention(self, query, key, value, mask=None):
        d_k = query.size(-1)
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9) # where the mask is 0, set the score to a large negative value so it vanishes after softmax
        p_attn = scores.softmax(dim=-1)
        return torch.matmul(p_attn, value), p_attn
    
    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        # project query, key, value and split each into h heads
        query, key, value = [
            lin(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
            for lin, x in zip(self.linears, (query, key, value))
        ]

        # compute scaled dot-product attention
        x, self.attn = self.attention(query, key, value, mask=mask)

        # concatenate the heads and apply the final linear layer
        x = (
            x.transpose(1, 2)
            .contiguous()
            .view(nbatches, -1, self.h * self.d_k)
        )
        return self.linears[-1](x)

class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        # embedding layer mapping vocabulary indices to d_model-dimensional vectors
        self.lut = nn.Embedding(vocab, d_model)
        # model dimension
        self.d_model = d_model

    def forward(self, x):
        # look up the token embeddings and scale by sqrt(d_model)
        return self.lut(x) * math.sqrt(self.d_model)

# mask that prevents the decoder from attending to future positions during prediction
def subsequent_mask(size):
    # shape of the attention mask
    attn_shape = (1, size, size)
    # upper-triangular matrix: 1 strictly above the diagonal, 0 on and below it
    subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(
        torch.uint8
    )
    # convert to a boolean mask: True on and below the diagonal, False above it
    return subsequent_mask == 0


class Batch:
    """Object for holding a batch of data with mask during training."""

    def __init__(self, src, tgt=None, pad=0): # pad: padding token id used to fill short sequences
        self.src = src
        if tgt is not None:
            self.tgt = tgt[:, :-1] # decoder input: target sequence without the last token
            self.tgt_y = tgt[:, 1:] # decoder labels: target sequence without the first token
            self.tgt_mask = self.make_std_mask(self.tgt, pad) # decoder mask (hides padding and future positions)
            self.ntokens = (self.tgt_y != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask to hide padding and future words."
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(
            tgt_mask.data
        )
        return tgt_mask

def data_gen(V, batch_size, nbatches):
    "Generate random data for a src-tgt copy task."
    for i in range(nbatches):
        data = torch.randint(1, V, size=(batch_size, 10))
        data[:, 0] = 1
        src = data.requires_grad_(False).clone().detach()
        tgt = data.requires_grad_(False).clone().detach()
        yield Batch(src, tgt)


def train_test(d_model=32, d_ff=64, h=8, src_vocab=11, tgt_vocab=11, N=2):
    encoder = Encoder(d_model,d_ff,MultiHeadedAttention(h,d_model))
    decoder = Decoder(d_model,d_ff,MultiHeadedAttention(h,d_model))
    src_embed = Embeddings(d_model, src_vocab)
    tgt_embed = Embeddings(d_model, tgt_vocab)
    model = Transform(encoder, decoder, src_embed, tgt_embed, d_model, tgt_vocab, N)
    model.train() # set the model to training mode

    # the generator already returns log-probabilities (log_softmax), so use NLLLoss;
    # CrossEntropyLoss would apply log_softmax a second time
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    data_iter = data_gen(11, 3, 10)
    for i, batch in enumerate(data_iter):
        optimizer.zero_grad()  # clear gradients
        outputs = model.forward(batch.src, batch.tgt, batch.tgt_mask)  # forward pass
        loss = criterion(outputs.contiguous().view(-1, outputs.size(-1)), batch.tgt_y.contiguous().view(-1))  # compute the loss
        print('loss: ', loss)
        loss.backward()  # backpropagation
        optimizer.step()  # update parameters


def inference_test(d_model=32, d_ff=64, h=8, src_vocab=11, tgt_vocab=11, N=2):
    encoder = Encoder(d_model,d_ff,MultiHeadedAttention(h,d_model))
    decoder = Decoder(d_model,d_ff,MultiHeadedAttention(h,d_model))
    src_embed = Embeddings(d_model, src_vocab)
    tgt_embed = Embeddings(d_model, tgt_vocab)
    test_model = Transform(encoder, decoder, src_embed, tgt_embed, d_model, tgt_vocab, N)
    test_model.eval() # set the model to evaluation mode

    src = torch.LongTensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]) # test input
    memory = test_model.encode(src)
    print('memory shape: ', memory.shape)
    tgt = torch.zeros(1, 1).type_as(src)

    # generate the target sequence step by step (greedy decoding)
    for i in range(9):
        out = test_model.decode(
            memory, tgt, subsequent_mask(tgt.size(1)).type_as(src.data)
        )
        print('out shape: ', out.shape)
        prob = test_model.generator(out[:, -1]) # next-token distribution from the embedding at the last position
        # print('prob: ', prob)
        _, next_word = torch.max(prob, dim=1) # pick the index of the most probable token
        next_word = next_word.data[0]
        # print('next_word: ', next_word)
        # append the generated token to the target sequence
        tgt = torch.cat(
            [tgt, torch.tensor([[next_word]], dtype=torch.long)], dim=1
        )

    print("Example Untrained Model Prediction:", tgt)


train_test()
inference_test()

 

 

What is the difference between CNN and self-attention?

Common ground: both CNN and self-attention compute a weighted sum over their inputs.

Difference: a CNN aggregates a local region with fixed, learned kernel weights that do not depend on the content of the input, whereas self-attention computes its weights from the similarity between the query and the other embeddings. In other words, CNN weights are query-independent, while self-attention weights are query-dependent (see the sketch below).
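
A minimal sketch of this contrast (toy shapes, standalone and independent of the classes above): the conv kernel weights are the same for every input, while the attention weights are recomputed from the input itself.

import torch
import torch.nn as nn

x = torch.randn(1, 5, 8)  # (batch, sequence len, dim) — arbitrary toy input

# CNN: aggregation weights are the learned kernel, fixed after training,
# identical for every input — they do not depend on the content of x
conv = nn.Conv1d(8, 8, kernel_size=3, padding=1)
y_cnn = conv(x.transpose(1, 2)).transpose(1, 2)   # (1, 5, 8)

# self-attention: aggregation weights come from query-key similarity,
# so different inputs produce different weight patterns
scores = x @ x.transpose(-2, -1) / 8 ** 0.5       # (1, 5, 5)
weights = scores.softmax(dim=-1)                  # query-dependent weights
y_attn = weights @ x                              # (1, 5, 8)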

 

What are the advantages of multi-head attention?

It can capture information from multiple subspaces, which gives the model stronger representational power.

\[ \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V \]

\[ \text{Head}_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V) \]

\[ \text{MultiHead}(Q, K, V) = \text{Concat}(\text{Head}_1, \text{Head}_2, \cdots, \text{Head}_h)W^O \]

Single-head attention time complexity: \(O(n^2 d)\)

Multi-head attention time complexity: \(h \cdot O(n^2 \cdot d/h) = O(n^2 d)\)

So single-head and multi-head attention have essentially the same time complexity; multi-head attention only adds concatenating the per-head outputs and passing them through the output projection \(W^O\) (see the shape check below).
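
A quick shape check against the MultiHeadedAttention class defined in the full code above (d_model = 32 split into h = 8 heads of d_k = 4): each head computes its own n × n attention map, so the per-head cost is \(O(n^2 \cdot d/h)\) and the h heads together remain \(O(n^2 d)\).

mha = MultiHeadedAttention(h=8, d_model=32)
x = torch.randn(2, 10, 32)   # (batch, sequence len, d_model)
out = mha(x, x, x)           # self-attention: query = key = value = x
print(out.shape)             # torch.Size([2, 10, 32])
print(mha.attn.shape)        # torch.Size([2, 8, 10, 10]) — one 10×10 attention map per head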

 

References

https://luweikxy.gitbook.io/machine-learning-notes/self-attention-and-transformer#Self-Attention%E6%9C%BA%E5%88%B6

详解Transformer (Attention Is All You Need)

https://www.zhihu.com/question/61077555/answer/2914496700

Code implementation: The Annotated Transformer, https://nlp.seas.harvard.edu/annotated-transformer/
