Implementing Seq2Seq with Attention for Machine Translation in PyTorch
This post walks through a PyTorch implementation of Seq2Seq with attention for machine translation, building on my earlier post on implementing a simple Seq2Seq machine translation model in PyTorch. For the underlying theory, see the paper "Neural Machine Translation by Jointly Learning to Align and Translate", or refer to my notes on that paper.
Import the required modules
import torch
import torch.nn as nn
import torch.utils.data as data
import torchsummary
import numpy as np
import sys
import os
import random
import re
import json
from nltk.translate.bleu_score import sentence_bleu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Set the configuration parameters
data_path = "./cmn-eng"        # dataset location
store_model_path = "./ckpt"    # where to save model checkpoints
max_output_len = 45            # maximum length of an output sentence
batch_size = 64                # batch size
emb_dim = 256                  # word embedding dimension
hid_dim = 512                  # RNN hidden state dimension
n_layers = 4                   # number of RNN layers
dropout = 0.5                  # dropout probability p
learning_rate = 0.001          # initial learning rate
# teacher_forcing_ratio = 0.5  # probability of training with the ground truth
summary_steps = 12000          # total number of training batches
# k used by the inverse sigmoid decay for scheduled sampling (see below)
kk = np.argmin([np.abs(summary_steps / 2 - x * np.log(x)) for x in range(1, summary_steps)])
The parameter kk is used for scheduled sampling, a trick that replaces the usual teacher forcing mechanism: it is the k in the inverse sigmoid decay eps_i = k / (k + exp(i / k)) used later. Since eps_i = 0.5 when i = k·ln(k), choosing k such that k·ln(k) ≈ summary_steps / 2 makes the probability of feeding the ground-truth token decay to about 0.5 near the midpoint of training.
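As a quick sanity check (a small sketch of my own, reusing the summary_steps and kk defined above), the decay can be printed at a few steps: the probability of feeding the ground truth starts near 1, crosses roughly 0.5 around the middle of training, and approaches 0 at the end.

# Print the scheduled-sampling probability at a few training steps (sanity check)
for i in (0, summary_steps // 4, summary_steps // 2, 3 * summary_steps // 4, summary_steps):
    eps = kk / (kk + np.exp(i / kk))
    print("step {:>5d}: p(use ground truth) = {:.3f}".format(i, eps))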
The dataset processing and loading were covered in the previous post and are not repeated here.
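Since the code below refers to several objects built there, here is a brief sketch of the interface it assumes (this is my reading of the code that follows, not the original loader implementation):

# Assumed to be provided by the data-preparation code from the previous post:
#   en_vocab_size, cn_vocab_size          -- source / target vocabulary sizes
#   int2word_en, int2word_cn              -- dicts mapping str(index) -> word,
#                                            with index 0 reserved for '<PAD>'
#   train_loader, val_loader, test_loader -- DataLoaders yielding (sources, targets):
#       sources = [batch size, source len]  LongTensor of English token indices
#       targets = [batch size, target len]  LongTensor of Chinese token indices,
#                 starting with '<BOS>', ending with '<EOS>', padded with 0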
Define the Encoder, Attention, and Decoder classes
class Encoder(nn.Module):
    def __init__(self, en_vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.n_layers = n_layers
        self.embedding = nn.Embedding(en_vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hid_dim * 2, hid_dim * 2)

    def forward(self, input):
        # input = [batch size, sequence len] (token indices)
        batch_size = input.shape[0]
        embedding = self.embedding(input)
        outputs, hidden = self.rnn(self.dropout(embedding))
        # outputs = [batch size, sequence len, hid_dim * directions]
        # hidden  = [n_layers * directions, batch size, hid_dim]
        # The Encoder is a bidirectional RNN, so the hidden states of the two
        # directions in each layer are concatenated
        # hidden = [n_layers * directions, batch size, hid dim] --> [n_layers, directions, batch size, hid dim]
        hidden = hidden.view(self.n_layers, 2, batch_size, -1)
        # s = [n_layers, batch size, hid_dim * 2]
        s = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
        s = torch.tanh(self.fc(s))
        return outputs, s


class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2 + hid_dim * 2, hid_dim * 2, bias=False)
        self.v = nn.Linear(hid_dim * 2, 1, bias=False)

    def forward(self, enc_output, s):
        # s = [n_layers, batch_size, hid_dim * 2]
        # enc_output = [batch_size, seq_len, hid_dim * 2]
        batch_size = enc_output.shape[0]
        seq_len = enc_output.shape[1]
        # s_attn = [n_layers, batch_size, seq_len, hid_dim * 2] -> [batch_size, seq_len, hid_dim * 2]
        s_attn = s.unsqueeze(2).repeat(1, 1, seq_len, 1)
        s_attn = torch.mean(s_attn, 0)
        # E = [batch_size, seq_len, hid_dim * 2]
        E = torch.tanh(self.attn(torch.cat((s_attn, enc_output), dim=2)))
        # attention = [batch_size, seq_len]
        attention = self.v(E).squeeze(2)
        return nn.functional.softmax(attention, dim=1)


class Decoder(nn.Module):
    def __init__(self, cn_vocab_size, emb_dim, hid_dim, n_layers, dropout, attention):
        super().__init__()
        self.cn_vocab_size = cn_vocab_size
        self.hid_dim = hid_dim * 2  # because the Encoder is bidirectional
        self.n_layers = n_layers
        self.attention = attention
        self.embedding = nn.Embedding(cn_vocab_size, emb_dim)
        self.input_dim = emb_dim
        self.rnn = nn.GRU(self.input_dim + self.hid_dim, self.hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.embedding2vocab1 = nn.Linear(self.hid_dim + self.hid_dim + emb_dim, cn_vocab_size)
        # self.embedding2vocab2 = nn.Linear((self.hid_dim + self.hid_dim + emb_dim) * 2, cn_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, s, enc_output):
        # input = [batch size] (previous target token indices)
        # s = [n_layers, batch_size, hid_dim * 2]
        # enc_output = [batch_size, seq_len, hid_dim * 2]
        # the Decoder is unidirectional, so directions = 1
        input = input.unsqueeze(1)
        # embedded = [batch size, 1, emb_dim]
        embedded = self.dropout(self.embedding(input))
        # a = [batch_size, 1, seq_len]
        a = self.attention(enc_output, s).unsqueeze(1)
        # c = [batch_size, 1, hid_dim * 2]
        c = torch.bmm(a, enc_output)
        # rnn_input = [batch_size, 1, emb_dim + hid_dim * 2]
        rnn_input = torch.cat((embedded, c), dim=2)
        # dec_output = [batch_size, 1, hid_dim * 2]
        # s = [n_layers, batch_size, hid_dim * 2]
        dec_output, s = self.rnn(rnn_input, s)
        embedded = embedded.squeeze(1)
        dec_output = dec_output.squeeze(1)
        c = c.squeeze(1)
        # project the RNN output to the size of the target-language vocabulary
        # output = [batch size, vocab size]
        output = self.embedding2vocab1(torch.cat((dec_output, c, embedded), dim=1))
        # output = self.embedding2vocab2(output)
        return output, s
As the code shows, the Encoder is a bidirectional four-layer GRU and the Decoder is a unidirectional four-layer GRU. The Encoder's final hidden states are used to initialize the Decoder's RNN, while the Encoder's outputs are used to compute the attention weights, which the Decoder in turn uses to compute the context vector.
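As a quick sanity check of the tensor shapes, here is a small sketch of my own that runs a random batch through the classes defined above (the vocabulary sizes 1000 and 1500 are made up just for this test; the other hyperparameters come from the configuration section):

# Shape check with dummy vocabularies (sizes chosen only for this test)
_enc = Encoder(en_vocab_size=1000, emb_dim=emb_dim, hid_dim=hid_dim, n_layers=n_layers, dropout=dropout)
_att = Attention(hid_dim)
_dec = Decoder(cn_vocab_size=1500, emb_dim=emb_dim, hid_dim=hid_dim, n_layers=n_layers,
               dropout=dropout, attention=_att)

src = torch.randint(0, 1000, (batch_size, 20))   # [batch, src len] of token indices
enc_out, s = _enc(src)
print(enc_out.shape)   # torch.Size([64, 20, 1024]) = [batch, src len, hid_dim * 2]
print(s.shape)         # torch.Size([4, 64, 1024])  = [n_layers, batch, hid_dim * 2]

dec_in = torch.randint(0, 1500, (batch_size,))   # previous target token for each sample
out, s = _dec(dec_in, s, enc_out)
print(out.shape)       # torch.Size([64, 1500])     = [batch, cn_vocab_size]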
Define the Seq2Seq class, then instantiate the model, optimizer, and loss function
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, input, target, teacher_forcing_ratio):
        # input  = [batch size, input len]
        # target = [batch size, target len]
        # teacher_forcing_ratio is the probability of training with the ground truth
        batch_size = target.shape[0]
        target_len = target.shape[1]
        vocab_size = self.decoder.cn_vocab_size
        # tensor to store the outputs
        outputs = torch.zeros(batch_size, target_len, vocab_size).to(self.device)
        # encoder_outputs is used to compute attention; s initializes the Decoder
        encoder_outputs, s = self.encoder(input)
        dec_input = target[:, 0]  # '<BOS>'
        preds = []
        for t in range(1, target_len):
            output, s = self.decoder(dec_input, s, encoder_outputs)
            outputs[:, t] = output
            # decide whether to train with the ground truth
            teacher_force = random.random() <= teacher_forcing_ratio
            # take the token with the highest output probability
            top1 = output.argmax(1)
            # if teacher_force is True, feed the ground truth; otherwise feed the predicted token
            dec_input = target[:, t] if teacher_force and t < target_len else top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds

    def inference(self, input, target):
        # evaluate the model
        # input  = [batch size, input len]
        # target = [batch size, target len]
        batch_size = input.shape[0]
        input_len = input.shape[1]
        vocab_size = self.decoder.cn_vocab_size
        # tensor to store the outputs
        outputs = torch.zeros(batch_size, input_len, vocab_size).to(self.device)
        # encoder_outputs is used to compute attention; s initializes the Decoder
        encoder_outputs, s = self.encoder(input)
        dec_input = target[:, 0]  # '<BOS>'
        preds = []
        for t in range(1, input_len):
            output, s = self.decoder(dec_input, s, encoder_outputs)
            outputs[:, t] = output
            # take the token with the highest output probability
            top1 = output.argmax(1)
            # feed the predicted token to the next step
            dec_input = top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds
encoder = Encoder(en_vocab_size, emb_dim, hid_dim, n_layers, dropout)
attention = Attention(hid_dim)
decoder = Decoder(cn_vocab_size, emb_dim, hid_dim, n_layers, dropout, attention)
model = Seq2Seq(encoder, decoder, device).to(device)
print(model)
loss_function = nn.CrossEntropyLoss(ignore_index=0).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(optimizer)
print('num of parameters: ', sum(p.numel() for p in model.parameters() if p.requires_grad))
As shown above, the Decoder predicts one token at a time. During training, scheduled sampling replaces plain teacher forcing; it is not used during validation or testing. The loss function is created with ignore_index=0, which means positions whose target class is 0 contribute nothing to the loss, since class 0 corresponds to the '<PAD>' token.
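A tiny illustration of the effect of ignore_index (the numbers below are made up): positions whose target is 0 are simply left out of the average, so padding does not dilute the loss.

# Made-up example: two of the four target positions are '<PAD>' (class 0)
logits = torch.randn(4, 10)            # [positions, vocab size]
targets = torch.tensor([5, 0, 7, 0])   # 0 marks '<PAD>'

loss_ignore = nn.CrossEntropyLoss(ignore_index=0)(logits, targets)
loss_plain = nn.CrossEntropyLoss()(logits[[0, 2]], targets[[0, 2]])
print(loss_ignore.item(), loss_plain.item())  # identical: padded positions are skipped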
Define some utility functions: saving and loading network parameters, computing the BLEU score, converting predictions back into text, creating an infinite iterator over training batches, and computing the scheduled sampling probability.
def save_model(model, optimizer, store_model_path, step):
    torch.save(model.state_dict(), '{}/model_attention_{}.ckpt'.format(store_model_path, step))
    return


def load_model(model, load_model_path):
    print('Load model from {}'.format(load_model_path))
    model.load_state_dict(torch.load('{}.ckpt'.format(load_model_path)))
    return model


def computebleu(sentences, targets):
    score = 0
    assert (len(sentences) == len(targets))

    def cut_token(sentence):
        tmp = []
        for token in sentence:
            if token == '<UNK>' or token.isdigit() or len(bytes(token[0], encoding='utf-8')) == 1:
                tmp.append(token)
            else:
                tmp += [word for word in token]
        return tmp

    for sentence, target in zip(sentences, targets):
        sentence = cut_token(sentence)
        target = cut_token(target)
        score += sentence_bleu([target], sentence, weights=(1, 0, 0, 0))
    return score


def tokens2sentence(outputs, int2word):
    sentences = []
    for tokens in outputs:
        sentence = []
        for token in tokens:
            word = int2word[str(int(token))]
            if word == '<EOS>':
                break
            sentence.append(word)
        sentences.append(sentence)
    return sentences


def infinite_iter(data_loader):
    it = iter(data_loader)
    while True:
        try:
            ret = next(it)
            yield ret
        except StopIteration:
            it = iter(data_loader)


def schedule_sampling(step, summary_steps, c, k):
    if c == 0:
        # Inverse sigmoid decay: eps_i = k / (k + exp(i / k))
        # k = np.argmin([np.abs(summary_steps / 2 - x * np.log(x)) for x in range(1, summary_steps)])
        e = k / (k + np.exp(step / k))
    elif c == 1:
        # Linear decay: eps_i = -1/k * i + 1
        e = -1 / summary_steps * step + 1
    elif c == 2:
        # Exponential decay: eps_i = k^i
        e = np.power(0.999, step)
    return e
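For reference, here is a small usage sketch of tokens2sentence and computebleu on a toy vocabulary (the dictionary and index sequences below are invented purely for illustration):

# Toy vocabulary and "predictions", invented for illustration only
toy_int2word = {'0': '<PAD>', '1': '<BOS>', '2': '<EOS>', '3': '我', '4': '爱', '5': '你'}
toy_preds = torch.tensor([[3, 4, 5, 2, 0],
                          [3, 5, 2, 0, 0]])

sentences = tokens2sentence(toy_preds, toy_int2word)
print(sentences)  # [['我', '爱', '你'], ['我', '你']] -- decoding stops at '<EOS>'

references = [['我', '爱', '你'], ['我', '爱', '你']]
print(computebleu(sentences, references))  # 1-gram BLEU summed over the batch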
Training and validation: validate and save the model every 600 training batches
model.train()
model.zero_grad()
train_losses, val_losses, val_bleu_scores = [], [], []
loss_sum = 0.0
train_iter = infinite_iter(train_loader)

for step in range(summary_steps):
    model.train()
    sources, targets = next(train_iter)
    sources, targets = sources.to(device), targets.to(device)
    outputs, preds = model(sources, targets, schedule_sampling(step, summary_steps, c=0, k=kk))
    # the first token of targets is '<BOS>', so it is ignored
    outputs = outputs[:, 1:].reshape(-1, outputs.size(2))
    targets = targets[:, 1:].reshape(-1)
    loss = loss_function(outputs, targets)

    optimizer.zero_grad()
    loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()

    loss_sum += loss.item()
    if (step + 1) % 10 == 0:
        loss_sum = loss_sum / 10
        print("\r", "train [{}] loss: {:.3f}, Perplexity: {:.3f}".format(step + 1, loss_sum, np.exp(loss_sum)), end=" ")
        train_losses.append(loss_sum)
        loss_sum = 0.0

    if (step + 1) % 600 == 0:
        # validate every 600 training batches and save the model
        model.eval()
        loss_val, bleu_val = 0.0, 0.0
        n = 0
        for sources_val, targets_val in val_loader:
            sources_val, targets_val = sources_val.to(device), targets_val.to(device)
            batch_size = sources_val.size(0)
            outputs_val, preds_val = model.inference(sources_val, targets_val)
            # the first token of targets is '<BOS>', so it is ignored
            outputs_val = outputs_val[:, 1:].reshape(-1, outputs_val.size(2))
            targets_val = targets_val[:, 1:].reshape(-1)
            loss = loss_function(outputs_val, targets_val)
            loss_val += loss.item()
            # convert predictions to text
            targets_val = targets_val.view(sources_val.size(0), -1)
            preds_val = tokens2sentence(preds_val, int2word_cn)
            sources_val = tokens2sentence(sources_val, int2word_en)
            targets_val = tokens2sentence(targets_val, int2word_cn)
            # compute the BLEU score
            bleu_val += computebleu(preds_val, targets_val)
            n += batch_size
        loss_val = loss_val / len(val_loader)
        bleu_val = bleu_val / n
        val_losses.append(loss_val)
        val_bleu_scores.append(bleu_val)
        print("\n", "val [{}] loss: {:.3f}, Perplexity: {:.3f}, bleu score: {:.3f} ".format(step + 1, loss_val, np.exp(loss_val), bleu_val))
        # save the model
        save_model(model, optimizer, store_model_path, step + 1)
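The loop above only keeps train_losses, val_losses and val_bleu_scores in memory. As an optional addition of mine (not part of the original script), they can be dumped to disk with the json module imported earlier, e.g. for plotting later:

# Optional: persist the recorded training curves (my addition, not in the original script)
with open('./train_attention_metrics.json', 'w') as f:
    json.dump({'train_losses': train_losses,
               'val_losses': val_losses,
               'val_bleu_scores': val_bleu_scores}, f, indent=2)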
Test the model and save the results
load_model_path = "./ckpt/model_attention_12000"  # checkpoint to load
model = load_model(model, load_model_path)        # load the model
model.to(device)
model.eval()

# evaluate the model on the test set
loss_test, bleu_test = 0.0, 0.0
n = 0
result = []
for sources_test, targets_test in test_loader:
    sources_test, targets_test = sources_test.to(device), targets_test.to(device)
    batch_size = sources_test.size(0)
    outputs_test, preds_test = model.inference(sources_test, targets_test)
    # the first token of targets is '<BOS>', so it is ignored
    outputs_test = outputs_test[:, 1:].reshape(-1, outputs_test.size(2))
    targets_test = targets_test[:, 1:].reshape(-1)
    loss = loss_function(outputs_test, targets_test)
    loss_test += loss.item()
    # convert predictions to text
    targets_test = targets_test.view(sources_test.size(0), -1)
    preds_test = tokens2sentence(preds_test, int2word_cn)
    sources_test = tokens2sentence(sources_test, int2word_en)
    targets_test = tokens2sentence(targets_test, int2word_cn)
    for source, pred, target in zip(sources_test, preds_test, targets_test):
        result.append((source, pred, target))
    # compute the BLEU score
    bleu_test += computebleu(preds_test, targets_test)
    n += batch_size
loss_test = loss_test / len(test_loader)
bleu_test = bleu_test / n
print('test loss: {}, bleu_score: {}'.format(loss_test, bleu_test))

# save the results
with open('./test_attention_output.txt', 'w') as f:
    for line in result:
        print(line, file=f)