Implementing Seq2Seq with Attention for Machine Translation in PyTorch
This post walks through a PyTorch implementation of Seq2Seq with attention for machine translation, building on my earlier post on implementing a simple Seq2Seq machine translation model in PyTorch. For the underlying theory, see the paper "Neural Machine Translation by Jointly Learning to Align and Translate", or refer to my notes on that paper.
Import the required modules
import torch
import torch.nn as nn
import torch.utils.data as data
import torchsummary
import numpy as np
import sys
import os
import random
import re
import json
from nltk.translate.bleu_score import sentence_bleu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Set the configuration parameters
data_path = "./cmn-eng"        # dataset location
store_model_path = "./ckpt"    # where to save model checkpoints
max_output_len = 45            # maximum length of an output sentence
batch_size = 64                # batch size
emb_dim = 256                  # word embedding dimension
hid_dim = 512                  # RNN hidden state dimension
n_layers = 4                   # number of RNN layers
dropout = 0.5                  # dropout probability p
learning_rate = 0.001          # initial learning rate
# teacher_forcing_ratio = 0.5  # probability of training with the ground truth
summary_steps = 12000          # total number of training batches
# k used by the inverse sigmoid decay for scheduled sampling (see below)
kk = np.argmin([np.abs(summary_steps / 2 - x * np.log(x)) for x in range(1, summary_steps)])
The parameter kk is used for scheduled sampling, a trick that replaces the usual teacher forcing mechanism: it is the k in the inverse sigmoid decay eps_i = k / (k + exp(i / k)) used later. Since eps_i = 0.5 when i = k·ln(k), choosing k such that k·ln(k) ≈ summary_steps / 2 makes the probability of feeding the ground-truth token decay to about 0.5 near the midpoint of training.
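As a quick sanity check (a small sketch of my own, reusing the summary_steps and kk defined above), the decay can be printed at a few steps: the probability of feeding the ground truth starts near 1, crosses roughly 0.5 around the middle of training, and approaches 0 at the end.

# Print the scheduled-sampling probability at a few training steps (sanity check)
for i in (0, summary_steps // 4, summary_steps // 2, 3 * summary_steps // 4, summary_steps):
    eps = kk / (kk + np.exp(i / kk))
    print("step {:>5d}: p(use ground truth) = {:.3f}".format(i, eps))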
The dataset processing and loading were covered in the previous post and are not repeated here.
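Since the code below refers to several objects built there, here is a brief sketch of the interface it assumes (this is my reading of the code that follows, not the original loader implementation):

# Assumed to be provided by the data-preparation code from the previous post:
#   en_vocab_size, cn_vocab_size          -- source / target vocabulary sizes
#   int2word_en, int2word_cn              -- dicts mapping str(index) -> word,
#                                            with index 0 reserved for '<PAD>'
#   train_loader, val_loader, test_loader -- DataLoaders yielding (sources, targets):
#       sources = [batch size, source len]  LongTensor of English token indices
#       targets = [batch size, target len]  LongTensor of Chinese token indices,
#                 starting with '<BOS>', ending with '<EOS>', padded with 0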
Define the Encoder, Attention, and Decoder classes
class Encoder(nn.Module):
    def __init__(self, en_vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.n_layers = n_layers
        self.embedding = nn.Embedding(en_vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hid_dim * 2, hid_dim * 2)

    def forward(self, input):
        # input = [batch size, sequence len] (token indices)
        batch_size = input.shape[0]
        embedding = self.embedding(input)
        outputs, hidden = self.rnn(self.dropout(embedding))
        # outputs = [batch size, sequence len, hid_dim * directions]
        # hidden  = [n_layers * directions, batch size, hid_dim]
        # The Encoder is a bidirectional RNN, so the hidden states of the two
        # directions in each layer are concatenated
        # hidden = [n_layers * directions, batch size, hid dim] --> [n_layers, directions, batch size, hid dim]
        hidden = hidden.view(self.n_layers, 2, batch_size, -1)
        # s = [n_layers, batch size, hid_dim * 2]
        s = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
        s = torch.tanh(self.fc(s))
        return outputs, s


class Attention(nn.Module):
    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2 + hid_dim * 2, hid_dim * 2, bias=False)
        self.v = nn.Linear(hid_dim * 2, 1, bias=False)

    def forward(self, enc_output, s):
        # s = [n_layers, batch_size, hid_dim * 2]
        # enc_output = [batch_size, seq_len, hid_dim * 2]
        batch_size = enc_output.shape[0]
        seq_len = enc_output.shape[1]
        # s_attn = [n_layers, batch_size, seq_len, hid_dim * 2] -> [batch_size, seq_len, hid_dim * 2]
        s_attn = s.unsqueeze(2).repeat(1, 1, seq_len, 1)
        s_attn = torch.mean(s_attn, 0)
        # E = [batch_size, seq_len, hid_dim * 2]
        E = torch.tanh(self.attn(torch.cat((s_attn, enc_output), dim=2)))
        # attention = [batch_size, seq_len]
        attention = self.v(E).squeeze(2)
        return nn.functional.softmax(attention, dim=1)


class Decoder(nn.Module):
    def __init__(self, cn_vocab_size, emb_dim, hid_dim, n_layers, dropout, attention):
        super().__init__()
        self.cn_vocab_size = cn_vocab_size
        self.hid_dim = hid_dim * 2  # because the Encoder is bidirectional
        self.n_layers = n_layers
        self.attention = attention
        self.embedding = nn.Embedding(cn_vocab_size, emb_dim)
        self.input_dim = emb_dim
        self.rnn = nn.GRU(self.input_dim + self.hid_dim, self.hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.embedding2vocab1 = nn.Linear(self.hid_dim + self.hid_dim + emb_dim, cn_vocab_size)
        # self.embedding2vocab2 = nn.Linear((self.hid_dim + self.hid_dim + emb_dim) * 2, cn_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, s, enc_output):
        # input = [batch size] (previous target token indices)
        # s = [n_layers, batch_size, hid_dim * 2]
        # enc_output = [batch_size, seq_len, hid_dim * 2]
        # the Decoder is unidirectional, so directions = 1
        input = input.unsqueeze(1)
        # embedded = [batch size, 1, emb_dim]
        embedded = self.dropout(self.embedding(input))
        # a = [batch_size, 1, seq_len]
        a = self.attention(enc_output, s).unsqueeze(1)
        # c = [batch_size, 1, hid_dim * 2]
        c = torch.bmm(a, enc_output)
        # rnn_input = [batch_size, 1, emb_dim + hid_dim * 2]
        rnn_input = torch.cat((embedded, c), dim=2)
        # dec_output = [batch_size, 1, hid_dim * 2]
        # s = [n_layers, batch_size, hid_dim * 2]
        dec_output, s = self.rnn(rnn_input, s)
        embedded = embedded.squeeze(1)
        dec_output = dec_output.squeeze(1)
        c = c.squeeze(1)
        # project the RNN output to the size of the target-language vocabulary
        # output = [batch size, vocab size]
        output = self.embedding2vocab1(torch.cat((dec_output, c, embedded), dim=1))
        # output = self.embedding2vocab2(output)
        return output, s
As the code shows, the Encoder is a bidirectional four-layer GRU and the Decoder is a unidirectional four-layer GRU. The Encoder's final hidden states are used to initialize the Decoder's RNN, while the Encoder's outputs are used to compute the attention weights, which the Decoder in turn uses to compute the context vector.
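As a quick sanity check of the tensor shapes, here is a small sketch of my own that runs a random batch through the classes defined above (the vocabulary sizes 1000 and 1500 are made up just for this test; the other hyperparameters come from the configuration section):

# Shape check with dummy vocabularies (sizes chosen only for this test)
_enc = Encoder(en_vocab_size=1000, emb_dim=emb_dim, hid_dim=hid_dim, n_layers=n_layers, dropout=dropout)
_att = Attention(hid_dim)
_dec = Decoder(cn_vocab_size=1500, emb_dim=emb_dim, hid_dim=hid_dim, n_layers=n_layers,
               dropout=dropout, attention=_att)

src = torch.randint(0, 1000, (batch_size, 20))   # [batch, src len] of token indices
enc_out, s = _enc(src)
print(enc_out.shape)   # torch.Size([64, 20, 1024]) = [batch, src len, hid_dim * 2]
print(s.shape)         # torch.Size([4, 64, 1024])  = [n_layers, batch, hid_dim * 2]

dec_in = torch.randint(0, 1500, (batch_size,))   # previous target token for each sample
out, s = _dec(dec_in, s, enc_out)
print(out.shape)       # torch.Size([64, 1500])     = [batch, cn_vocab_size]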
Define the Seq2Seq class, then instantiate the model, optimizer, and loss function
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, input, target, teacher_forcing_ratio):
        # input  = [batch size, input len]
        # target = [batch size, target len]
        # teacher_forcing_ratio is the probability of training with the ground truth
        batch_size = target.shape[0]
        target_len = target.shape[1]
        vocab_size = self.decoder.cn_vocab_size
        # tensor to store the outputs
        outputs = torch.zeros(batch_size, target_len, vocab_size).to(self.device)
        # encoder_outputs is used to compute attention; s initializes the Decoder
        encoder_outputs, s = self.encoder(input)
        dec_input = target[:, 0]  # '<BOS>'
        preds = []
        for t in range(1, target_len):
            output, s = self.decoder(dec_input, s, encoder_outputs)
            outputs[:, t] = output
            # decide whether to train with the ground truth
            teacher_force = random.random() <= teacher_forcing_ratio
            # take the token with the highest output probability
            top1 = output.argmax(1)
            # if teacher_force is True, feed the ground truth; otherwise feed the predicted token
            dec_input = target[:, t] if teacher_force and t < target_len else top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds

    def inference(self, input, target):
        # evaluate the model
        # input  = [batch size, input len]
        # target = [batch size, target len]
        batch_size = input.shape[0]
        input_len = input.shape[1]
        vocab_size = self.decoder.cn_vocab_size
        # tensor to store the outputs
        outputs = torch.zeros(batch_size, input_len, vocab_size).to(self.device)
        # encoder_outputs is used to compute attention; s initializes the Decoder
        encoder_outputs, s = self.encoder(input)
        dec_input = target[:, 0]  # '<BOS>'
        preds = []
        for t in range(1, input_len):
            output, s = self.decoder(dec_input, s, encoder_outputs)
            outputs[:, t] = output
            # take the token with the highest output probability
            top1 = output.argmax(1)
            # feed the predicted token to the next step
            dec_input = top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds
encoder = Encoder(en_vocab_size, emb_dim, hid_dim, n_layers, dropout)
attention = Attention(hid_dim)
decoder = Decoder(cn_vocab_size, emb_dim, hid_dim, n_layers, dropout, attention)
model = Seq2Seq(encoder, decoder, device).to(device)
print(model)
loss_function = nn.CrossEntropyLoss(ignore_index=0).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(optimizer)
print('num of parameters: ', sum(p.numel() for p in model.parameters() if p.requires_grad))
As shown above, the Decoder predicts one token at a time. During training, scheduled sampling replaces plain teacher forcing; it is not used during validation or testing. The loss function is created with ignore_index=0, which means positions whose target class is 0 contribute nothing to the loss, since class 0 corresponds to the '<PAD>' token.
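A tiny illustration of the effect of ignore_index (the numbers below are made up): positions whose target is 0 are simply left out of the average, so padding does not dilute the loss.

# Made-up example: two of the four target positions are '<PAD>' (class 0)
logits = torch.randn(4, 10)            # [positions, vocab size]
targets = torch.tensor([5, 0, 7, 0])   # 0 marks '<PAD>'

loss_ignore = nn.CrossEntropyLoss(ignore_index=0)(logits, targets)
loss_plain = nn.CrossEntropyLoss()(logits[[0, 2]], targets[[0, 2]])
print(loss_ignore.item(), loss_plain.item())  # identical: padded positions are skipped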
Define some utility functions: saving and loading network parameters, computing the BLEU score, converting predictions back into text, creating an infinite iterator over training batches, and computing the scheduled sampling probability.
def save_model(model, optimizer, store_model_path, step):
    torch.save(model.state_dict(), '{}/model_attention_{}.ckpt'.format(store_model_path, step))
    return


def load_model(model, load_model_path):
    print('Load model from {}'.format(load_model_path))
    model.load_state_dict(torch.load('{}.ckpt'.format(load_model_path)))
    return model


def computebleu(sentences, targets):
    score = 0
    assert (len(sentences) == len(targets))

    def cut_token(sentence):
        tmp = []
        for token in sentence:
            if token == '<UNK>' or token.isdigit() or len(bytes(token[0], encoding='utf-8')) == 1:
                tmp.append(token)
            else:
                tmp += [word for word in token]
        return tmp

    for sentence, target in zip(sentences, targets):
        sentence = cut_token(sentence)
        target = cut_token(target)
        score += sentence_bleu([target], sentence, weights=(1, 0, 0, 0))
    return score


def tokens2sentence(outputs, int2word):
    sentences = []
    for tokens in outputs:
        sentence = []
        for token in tokens:
            word = int2word[str(int(token))]
            if word == '<EOS>':
                break
            sentence.append(word)
        sentences.append(sentence)
    return sentences


def infinite_iter(data_loader):
    it = iter(data_loader)
    while True:
        try:
            ret = next(it)
            yield ret
        except StopIteration:
            it = iter(data_loader)


def schedule_sampling(step, summary_steps, c, k):
    if c == 0:
        # Inverse sigmoid decay: eps_i = k / (k + exp(i / k))
        # k = np.argmin([np.abs(summary_steps / 2 - x * np.log(x)) for x in range(1, summary_steps)])
        e = k / (k + np.exp(step / k))
    elif c == 1:
        # Linear decay: eps_i = -1/k * i + 1
        e = -1 / summary_steps * step + 1
    elif c == 2:
        # Exponential decay: eps_i = k^i
        e = np.power(0.999, step)
    return e
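For reference, here is a small usage sketch of tokens2sentence and computebleu on a toy vocabulary (the dictionary and index sequences below are invented purely for illustration):

# Toy vocabulary and "predictions", invented for illustration only
toy_int2word = {'0': '<PAD>', '1': '<BOS>', '2': '<EOS>', '3': '我', '4': '爱', '5': '你'}
toy_preds = torch.tensor([[3, 4, 5, 2, 0],
                          [3, 5, 2, 0, 0]])

sentences = tokens2sentence(toy_preds, toy_int2word)
print(sentences)  # [['我', '爱', '你'], ['我', '你']] -- decoding stops at '<EOS>'

references = [['我', '爱', '你'], ['我', '爱', '你']]
print(computebleu(sentences, references))  # 1-gram BLEU summed over the batch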
Training and validation: validate and save the model every 600 training batches
model.train()
model.zero_grad()
train_losses, val_losses, val_bleu_scores = [], [], []
loss_sum = 0.0
train_iter = infinite_iter(train_loader)

for step in range(summary_steps):
    model.train()
    sources, targets = next(train_iter)
    sources, targets = sources.to(device), targets.to(device)
    outputs, preds = model(sources, targets, schedule_sampling(step, summary_steps, c=0, k=kk))
    # the first token of targets is '<BOS>', so it is ignored
    outputs = outputs[:, 1:].reshape(-1, outputs.size(2))
    targets = targets[:, 1:].reshape(-1)
    loss = loss_function(outputs, targets)

    optimizer.zero_grad()
    loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()

    loss_sum += loss.item()
    if (step + 1) % 10 == 0:
        loss_sum = loss_sum / 10
        print("\r", "train [{}] loss: {:.3f}, Perplexity: {:.3f}".format(step + 1, loss_sum, np.exp(loss_sum)), end=" ")
        train_losses.append(loss_sum)
        loss_sum = 0.0

    if (step + 1) % 600 == 0:
        # validate every 600 training batches and save the model
        model.eval()
        loss_val, bleu_val = 0.0, 0.0
        n = 0
        for sources_val, targets_val in val_loader:
            sources_val, targets_val = sources_val.to(device), targets_val.to(device)
            batch_size = sources_val.size(0)
            outputs_val, preds_val = model.inference(sources_val, targets_val)
            # the first token of targets is '<BOS>', so it is ignored
            outputs_val = outputs_val[:, 1:].reshape(-1, outputs_val.size(2))
            targets_val = targets_val[:, 1:].reshape(-1)
            loss = loss_function(outputs_val, targets_val)
            loss_val += loss.item()
            # convert predictions to text
            targets_val = targets_val.view(sources_val.size(0), -1)
            preds_val = tokens2sentence(preds_val, int2word_cn)
            sources_val = tokens2sentence(sources_val, int2word_en)
            targets_val = tokens2sentence(targets_val, int2word_cn)
            # compute the BLEU score
            bleu_val += computebleu(preds_val, targets_val)
            n += batch_size
        loss_val = loss_val / len(val_loader)
        bleu_val = bleu_val / n
        val_losses.append(loss_val)
        val_bleu_scores.append(bleu_val)
        print("\n", "val [{}] loss: {:.3f}, Perplexity: {:.3f}, bleu score: {:.3f} ".format(step + 1, loss_val, np.exp(loss_val), bleu_val))
        # save the model
        save_model(model, optimizer, store_model_path, step + 1)
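The loop above only keeps train_losses, val_losses and val_bleu_scores in memory. As an optional addition of mine (not part of the original script), they can be dumped to disk with the json module imported earlier, e.g. for plotting later:

# Optional: persist the recorded training curves (my addition, not in the original script)
with open('./train_attention_metrics.json', 'w') as f:
    json.dump({'train_losses': train_losses,
               'val_losses': val_losses,
               'val_bleu_scores': val_bleu_scores}, f, indent=2)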
Test the model and save the results
load_model_path = "./ckpt/model_attention_12000"  # checkpoint to load
model = load_model(model, load_model_path)        # load the model
model.to(device)
model.eval()

# evaluate the model on the test set
loss_test, bleu_test = 0.0, 0.0
n = 0
result = []
for sources_test, targets_test in test_loader:
    sources_test, targets_test = sources_test.to(device), targets_test.to(device)
    batch_size = sources_test.size(0)
    outputs_test, preds_test = model.inference(sources_test, targets_test)
    # the first token of targets is '<BOS>', so it is ignored
    outputs_test = outputs_test[:, 1:].reshape(-1, outputs_test.size(2))
    targets_test = targets_test[:, 1:].reshape(-1)
    loss = loss_function(outputs_test, targets_test)
    loss_test += loss.item()
    # convert predictions to text
    targets_test = targets_test.view(sources_test.size(0), -1)
    preds_test = tokens2sentence(preds_test, int2word_cn)
    sources_test = tokens2sentence(sources_test, int2word_en)
    targets_test = tokens2sentence(targets_test, int2word_cn)
    for source, pred, target in zip(sources_test, preds_test, targets_test):
        result.append((source, pred, target))
    # compute the BLEU score
    bleu_test += computebleu(preds_test, targets_test)
    n += batch_size
loss_test = loss_test / len(test_loader)
bleu_test = bleu_test / n
print('test loss: {}, bleu_score: {}'.format(loss_test, bleu_test))

# save the results
with open('./test_attention_output.txt', 'w') as f:
    for line in result:
        print(line, file=f)