Seq2Seq Machine Translation in PyTorch
This post walks through a simple Seq2Seq machine translation task implemented in PyTorch. It follows Homework 8 of Prof. Hung-yi Lee's deep learning course, which is also the source of the dataset; the video lectures are available on Bilibili (https://www.bilibili.com/video/BV1JE411g7XF?p=53). For the underlying theory, see the paper "Sequence to Sequence Learning with Neural Networks", or my reading notes on that paper.
Import the required modules
import torch
import torch.nn as nn
import torch.utils.data as data
import torchsummary
from torchvision import datasets
import numpy as np
import sys
import os
import random
import re
import json
from nltk.translate.bleu_score import sentence_bleu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Set the configuration parameters
data_path = "./cmn-eng"          # dataset location
store_model_path = "./ckpt"      # where to save model checkpoints
max_output_len = 45              # maximum output sentence length
batch_size = 64                  # batch size
emb_dim = 256                    # word embedding dimension
hid_dim = 512                    # RNN hidden state dimension
n_layers = 4                     # number of RNN layers
dropout = 0.5                    # dropout probability p
learning_rate = 0.0001           # initial learning rate
teacher_forcing_ratio = 0.5      # probability of training with the ground truth (teacher forcing)
summary_steps = 6000             # total number of training batches
Load the dataset and the Chinese and English dictionaries from the downloaded data files
# Load the dictionaries
def get_dictionary(root, language):
    with open(os.path.join(root, 'word2int_{}.json'.format(language)), "r") as f:
        word2int = json.load(f)
    with open(os.path.join(root, 'int2word_{}.json'.format(language)), "r") as f:
        int2word = json.load(f)
    print('{} vocab size: {}'.format(language, len(word2int)))
    return word2int, int2word, len(word2int)

word2int_cn, int2word_cn, cn_vocab_size = get_dictionary(data_path, 'cn')  # Chinese dictionary
word2int_en, int2word_en, en_vocab_size = get_dictionary(data_path, 'en')  # English dictionary
vocab = [word2int_cn, int2word_cn, word2int_en, int2word_en]

# Load the data (training/validation/testing)
def load_data(root, set_name):
    data = []
    with open(os.path.join(root, '{}.txt'.format(set_name)), "r") as f:
        for line in f:
            data.append(line)
    print('{} dataset size: {}'.format(set_name, len(data)))
    return data

training_data = load_data(data_path, 'training')
val_data = load_data(data_path, 'validation')
testing_data = load_data(data_path, 'testing')

'''
Printed output:
cn vocab size: 3805
en vocab size: 3922
training dataset size: 18000
validation dataset size: 500
testing dataset size: 2636
'''
As the output shows, the Chinese and English vocabularies contain 3805 and 3922 words respectively, and the training, validation, and test sets contain 18000, 500, and 2636 sentence pairs.
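As a quick sanity check (illustrative only; the exact words and indices depend on the dataset files), you can look up a word in the loaded dictionaries and inspect one raw line of the training data:

# Unknown words fall back to the '<UNK>' index; int2word keys are strings because they come from JSON
idx = word2int_en.get('i', word2int_en['<UNK>'])
print(idx, int2word_en[str(idx)])
# One raw line: an English sentence and its Chinese translation, separated by a tab
print(training_data[0])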
Preprocess the data by subclassing torch.utils.data.Dataset and implementing its __len__ and __getitem__ methods
class EN2CNDataset(data.Dataset):
    def __init__(self, data, max_output_len, vocab):
        self.max_output_len = max_output_len
        self.word2int_cn, self.int2word_cn = vocab[0], vocab[1]  # Chinese dictionary
        self.word2int_en, self.int2word_en = vocab[2], vocab[3]  # English dictionary
        self.data = data

        self.cn_vocab_size = len(self.word2int_cn)
        self.en_vocab_size = len(self.word2int_en)

    def seq_pad(self, label, pad_token):
        # Pad sentences of different lengths to the same length for training
        label = np.pad(label, (0, (self.max_output_len - label.shape[0])), mode='constant', constant_values=pad_token)
        return label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, Index):
        # Split the line into the English sentence and the Chinese sentence
        sentences = self.data[Index]
        sentences = re.split('[\t\n]', sentences)
        sentences = list(filter(None, sentences))
        #print(sentences)
        assert len(sentences) == 2

        # Special tokens
        BOS = self.word2int_en['<BOS>']
        EOS = self.word2int_en['<EOS>']
        UNK = self.word2int_en['<UNK>']

        # Add '<BOS>' at the start and '<EOS>' at the end of each sentence; words not in the dictionary become '<UNK>'
        en, cn = [BOS], [BOS]
        # Tokenize the English sentence and convert it to a vector of dictionary indices
        sentence = re.split(' ', sentences[0])
        sentence = list(filter(None, sentence))
        for word in sentence:
            en.append(self.word2int_en.get(word, UNK))
        en.append(EOS)

        # Tokenize the Chinese sentence and convert it to a vector of dictionary indices
        # e.g. <BOS>, we, are, friends, <EOS> --> 1, 28, 29, 205, 2
        sentence = re.split(' ', sentences[1])
        sentence = list(filter(None, sentence))
        for word in sentence:
            cn.append(self.word2int_cn.get(word, UNK))
        cn.append(EOS)

        en, cn = np.asarray(en), np.asarray(cn)
        #if len(en) > 30 or len(cn) > 30:
        #    print(len(en), len(cn))

        # Pad the sentences to the same length with '<PAD>'
        en = self.seq_pad(en, self.word2int_en['<PAD>'])
        cn = self.seq_pad(cn, self.word2int_cn['<PAD>'])
        en, cn = torch.LongTensor(en), torch.LongTensor(cn)

        # Return the English and Chinese index vectors
        return en, cn

train_dataset = EN2CNDataset(training_data, max_output_len, vocab)
val_dataset = EN2CNDataset(val_data, max_output_len, vocab)
test_dataset = EN2CNDataset(testing_data, max_output_len, vocab)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = data.DataLoader(val_dataset, batch_size=1)
test_loader = data.DataLoader(test_dataset, batch_size=1)
The main preprocessing steps are: add the start and end tokens ('<BOS>' and '<EOS>') to each sentence; replace words that are not in the dictionary with '<UNK>'; pad every sentence to the same length max_output_len with '<PAD>'; convert each word to its dictionary index so that every sentence is stored as an index sequence; and store the Chinese and English sequences separately.
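The snippet below is a small illustration (not part of the original code) that fetches one processed sample and maps the first few indices back to tokens; the exact words depend on the first line of the training file:

en_idx, cn_idx = train_dataset[0]
print(en_idx.shape, cn_idx.shape)  # both padded to max_output_len, i.e. torch.Size([45])
# Map the first few English indices back to tokens, e.g. ['<BOS>', ..., '<EOS>', '<PAD>', ...]
print([int2word_en[str(int(i))] for i in en_idx[:10]])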
Define the Encoder and Decoder classes
class Encoder(nn.Module):
    def __init__(self, en_vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(en_vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        # input = [batch size, sequence len] (token indices)
        embedding = self.embedding(input)
        outputs, hidden = self.rnn(self.dropout(embedding))
        # outputs = [batch size, sequence len, hid_dim * directions]
        # hidden = [n_layers * directions, batch size, hid_dim]
        return outputs, hidden

class Decoder(nn.Module):
    def __init__(self, cn_vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.cn_vocab_size = cn_vocab_size
        self.hid_dim = hid_dim * 2  # because the Encoder is bidirectional
        self.n_layers = n_layers
        self.embedding = nn.Embedding(cn_vocab_size, emb_dim)
        self.input_dim = emb_dim
        self.rnn = nn.GRU(self.input_dim, self.hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.embedding2vocab1 = nn.Linear(self.hid_dim, self.hid_dim * 2)
        self.embedding2vocab2 = nn.Linear(self.hid_dim * 2, self.hid_dim * 4)
        self.embedding2vocab3 = nn.Linear(self.hid_dim * 4, self.cn_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        # input = [batch size] (token indices for the current step)
        # hidden = [n_layers, batch size, hid_dim]
        # The Decoder is unidirectional, so directions = 1
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        # embedded = [batch size, 1, emb_dim]
        output, hidden = self.rnn(embedded, hidden)
        # output = [batch size, 1, hid_dim]
        # hidden = [n_layers, batch size, hid_dim]

        # Project the RNN output to the size of the target-language vocabulary
        output = self.embedding2vocab1(output.squeeze(1))
        output = self.embedding2vocab2(output)
        prediction = self.embedding2vocab3(output)
        # prediction = [batch size, vocab size]
        return prediction, hidden
As the code shows, the Encoder is a bidirectional four-layer GRU and the Decoder is a unidirectional four-layer GRU. The Encoder's final hidden state is used to initialize the Decoder's RNN, and the Decoder's outputs are used to compute the loss.
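A throwaway shape check (a sketch with random token indices, not part of the training code) makes the bidirectional output and hidden-state shapes concrete:

enc = Encoder(en_vocab_size, emb_dim, hid_dim, n_layers, dropout)
dummy = torch.randint(0, en_vocab_size, (batch_size, max_output_len))  # fake batch of token indices
with torch.no_grad():
    enc_out, enc_hidden = enc(dummy)
print(enc_out.shape)     # torch.Size([64, 45, 1024]) = [batch, seq len, hid_dim * 2 directions]
print(enc_hidden.shape)  # torch.Size([8, 64, 512])   = [n_layers * 2 directions, batch, hid_dim]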
Define the Seq2Seq class, then build the model, the optimizer, and the loss function
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, input, target, teacher_forcing_ratio):
        # input = [batch size, input len] (token indices)
        # target = [batch size, target len] (token indices)
        # teacher_forcing_ratio is the probability of feeding the ground truth to the decoder
        batch_size = target.shape[0]
        target_len = target.shape[1]
        vocab_size = self.decoder.cn_vocab_size

        # A tensor to hold the decoder outputs
        outputs = torch.zeros(batch_size, target_len, vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(input)
        # The Encoder's final hidden state is used to initialize the Decoder.
        # Because the Encoder is a bidirectional RNN, the hidden states of the two directions of each layer are concatenated.
        # hidden = [n_layers * directions, batch size, hid_dim] --> [n_layers, directions, batch size, hid_dim]
        hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
        hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
        # hidden = [n_layers, batch size, hid_dim * 2]

        dec_input = target[:, 0]  # '<BOS>'
        preds = []
        for t in range(1, target_len):
            output, hidden = self.decoder(dec_input, hidden)
            outputs[:, t] = output
            # Decide whether to use teacher forcing for the next step
            teacher_force = random.random() <= teacher_forcing_ratio
            # Take the word with the highest output probability
            top1 = output.argmax(1)
            # If teacher_force is True, feed the ground truth; otherwise feed the predicted word
            dec_input = target[:, t] if teacher_force and t < target_len else top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds

    def inference(self, input, target):
        # Evaluate the model (no teacher forcing)
        # input = [batch size, input len] (token indices)
        # target = [batch size, target len] (token indices)
        batch_size = input.shape[0]
        input_len = input.shape[1]
        vocab_size = self.decoder.cn_vocab_size

        outputs = torch.zeros(batch_size, input_len, vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(input)
        # The Encoder's final hidden state is used to initialize the Decoder.
        # Because the Encoder is a bidirectional RNN, the hidden states of the two directions of each layer are concatenated.
        # hidden = [n_layers * directions, batch size, hid_dim] --> [n_layers, directions, batch size, hid_dim]
        hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
        hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)

        dec_input = target[:, 0]  # '<BOS>'
        preds = []
        for t in range(1, input_len):
            output, hidden = self.decoder(dec_input, hidden)
            outputs[:, t] = output
            # Take the word with the highest output probability
            top1 = output.argmax(1)
            # Always feed the predicted word as the next input
            dec_input = top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds

encoder = Encoder(en_vocab_size, emb_dim, hid_dim, n_layers, dropout)
decoder = Decoder(cn_vocab_size, emb_dim, hid_dim, n_layers, dropout)
model = Seq2Seq(encoder, decoder, device).to(device)
print(model)
loss_function = nn.CrossEntropyLoss(ignore_index=0).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(optimizer)
print('num of parameters: ', sum(p.numel() for p in model.parameters() if p.requires_grad))
As the code shows, the Decoder predicts one word at a time; teacher forcing is used during training but not during validation or testing. The loss function takes the argument ignore_index=0, which means positions whose target class is 0 contribute no loss, because class 0 corresponds to the '<PAD>' token.
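The effect of ignore_index can be illustrated with a tiny example (the logits and labels below are made up): positions whose target is 0, i.e. '<PAD>', simply do not contribute to the averaged loss.

logits = torch.randn(3, cn_vocab_size)  # predictions for 3 positions
labels = torch.tensor([5, 0, 7])        # the middle position is padding
criterion = nn.CrossEntropyLoss(ignore_index=0)
print(criterion(logits, labels))        # averaged over the 2 non-padding positions only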
Define some utility functions for saving and loading model parameters, computing the BLEU score, converting predictions back to text, and creating an infinite iterator over training batches
def save_model(model, optimizer, store_model_path, step):
    torch.save(model.state_dict(), '{}/model_{}.ckpt'.format(store_model_path, step))
    return

def load_model(model, load_model_path):
    print('Load model from {}'.format(load_model_path))
    model.load_state_dict(torch.load('{}.ckpt'.format(load_model_path)))
    return model

def computebleu(sentences, targets):
    score = 0
    assert (len(sentences) == len(targets))

    def cut_token(sentence):
        tmp = []
        for token in sentence:
            if token == '<UNK>' or token.isdigit() or len(bytes(token[0], encoding='utf-8')) == 1:
                tmp.append(token)
            else:
                tmp += [word for word in token]
        return tmp

    for sentence, target in zip(sentences, targets):
        sentence = cut_token(sentence)
        target = cut_token(target)
        score += sentence_bleu([target], sentence, weights=(1, 0, 0, 0))
    return score

def tokens2sentence(outputs, int2word):
    sentences = []
    for tokens in outputs:
        sentence = []
        for token in tokens:
            word = int2word[str(int(token))]
            if word == '<EOS>':
                break
            sentence.append(word)
        sentences.append(sentence)
    return sentences

def infinite_iter(data_loader):
    it = iter(data_loader)
    while True:
        try:
            ret = next(it)
            yield ret
        except StopIteration:
            it = iter(data_loader)
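A small illustration of how tokens2sentence and computebleu behave. The indices and characters below are made up for the example, assuming (as the code above implies) that index 0 is '<PAD>' and index 2 is '<EOS>':

# tokens2sentence cuts each sentence at the first '<EOS>' and drops everything after it
fake_preds = torch.tensor([[28, 29, 2, 0, 0]])   # two word indices, then '<EOS>', then padding
print(tokens2sentence(fake_preds, int2word_cn))

# computebleu splits Chinese tokens into characters and sums the 1-gram BLEU over the batch
print(computebleu([['我', '很', '好']], [['我', '很', '好']]))  # identical sentences give 1.0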
Training and validation: every 300 training batches, run validation and save the model
model.train()
model.zero_grad()
train_losses, val_losses, val_bleu_scores = [], [], []
loss_sum = 0.0
train_iter = infinite_iter(train_loader)

for step in range(summary_steps):
    model.train()
    sources, targets = next(train_iter)
    sources, targets = sources.to(device), targets.to(device)
    outputs, preds = model(sources, targets, teacher_forcing_ratio)
    # The first token of targets is '<BOS>', so it is ignored
    outputs = outputs[:, 1:].reshape(-1, outputs.size(2))
    targets = targets[:, 1:].reshape(-1)
    loss = loss_function(outputs, targets)

    optimizer.zero_grad()
    loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()

    loss_sum += loss.item()
    if (step + 1) % 10 == 0:
        loss_sum = loss_sum / 10
        print("\r", "train [{}] loss: {:.3f}, Perplexity: {:.3f}".format(step + 1, loss_sum, np.exp(loss_sum)), end=" ")
        train_losses.append(loss_sum)
        loss_sum = 0.0

    if (step + 1) % 300 == 0:
        # Every 300 training batches, run validation and save the model
        model.eval()
        loss_val, bleu_val = 0.0, 0.0
        n = 0
        for sources_val, targets_val in val_loader:
            sources_val, targets_val = sources_val.to(device), targets_val.to(device)
            batch_size = sources_val.size(0)
            #print(batch_size)
            outputs_val, preds_val = model.inference(sources_val, targets_val)
            # The first token of targets is '<BOS>', so it is ignored
            outputs_val = outputs_val[:, 1:].reshape(-1, outputs_val.size(2))
            targets_val = targets_val[:, 1:].reshape(-1)
            loss = loss_function(outputs_val, targets_val)
            loss_val += loss.item()

            # Convert the predictions back to text
            targets_val = targets_val.view(sources_val.size(0), -1)
            preds_val = tokens2sentence(preds_val, int2word_cn)
            sources_val = tokens2sentence(sources_val, int2word_en)
            targets_val = tokens2sentence(targets_val, int2word_cn)
            # Compute the BLEU score
            bleu_val += computebleu(preds_val, targets_val)
            n += batch_size
        loss_val = loss_val / len(val_loader)
        bleu_val = bleu_val / n
        val_losses.append(loss_val)
        val_bleu_scores.append(bleu_val)
        print("\n", "val [{}] loss: {:.3f}, Perplexity: {:.3f}, bleu score: {:.3f} ".format(step + 1, loss_val, np.exp(loss_val), bleu_val))

        # Save the model
        save_model(model, optimizer, store_model_path, step + 1)
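Optionally, the recorded curves can be plotted after training. This sketch assumes matplotlib is available (it is not imported above); train_losses is appended every 10 steps and val_losses every 300 steps, which fixes the x-axis scaling.

import matplotlib.pyplot as plt

steps_train = np.arange(1, len(train_losses) + 1) * 10
steps_val = np.arange(1, len(val_losses) + 1) * 300
plt.plot(steps_train, train_losses, label='train loss')
plt.plot(steps_val, val_losses, label='val loss')
plt.xlabel('training step')
plt.ylabel('cross-entropy loss')
plt.legend()
plt.show()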
Test and save the results
load_model_path = "./ckpt/model_6000"  # checkpoint to load

model = load_model(model, load_model_path)  # load the model
model.to(device)
model.eval()
# Test the model
loss_test, bleu_test = 0.0, 0.0
n = 0
result = []
for sources_test, targets_test in test_loader:
    sources_test, targets_test = sources_test.to(device), targets_test.to(device)
    batch_size = sources_test.size(0)
    #print(batch_size)
    outputs_test, preds_test = model.inference(sources_test, targets_test)
    # The first token of targets is '<BOS>', so it is ignored
    outputs_test = outputs_test[:, 1:].reshape(-1, outputs_test.size(2))
    targets_test = targets_test[:, 1:].reshape(-1)
    loss = loss_function(outputs_test, targets_test)
    loss_test += loss.item()

    # Convert the predictions back to text
    targets_test = targets_test.view(sources_test.size(0), -1)
    preds_test = tokens2sentence(preds_test, int2word_cn)
    sources_test = tokens2sentence(sources_test, int2word_en)
    targets_test = tokens2sentence(targets_test, int2word_cn)
    for source, pred, target in zip(sources_test, preds_test, targets_test):
        result.append((source, pred, target))
    # Compute the BLEU score
    bleu_test += computebleu(preds_test, targets_test)
    n += batch_size
loss_test = loss_test / len(test_loader)
bleu_test = bleu_test / n
print('test loss: {}, bleu_score: {}'.format(loss_test, bleu_test))
# Save the results
with open('./test_output.txt', 'w') as f:
    for line in result:
        print(line, file=f)