Implementing Seq2Seq Machine Translation in PyTorch

This post walks through a simple Seq2Seq machine translation task implemented in PyTorch. It is based on Homework 8 of Hung-yi Lee's (李宏毅) deep learning course, which is also where the dataset comes from; the lecture videos can be watched on Bilibili (https://www.bilibili.com/video/BV1JE411g7XF?p=53). For the theory behind the algorithm, see the paper "Sequence to Sequence Learning with Neural Networks", or my reading notes on that paper.

Import the required modules

import torch
import torch.nn as nn
import torch.utils.data as data
import torchsummary
from torchvision import datasets
import numpy as np
import sys
import os
import random
import re
import json
from nltk.translate.bleu_score import sentence_bleu

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Set up the configuration parameters

data_path = "./cmn-eng"          # location of the dataset
store_model_path = "./ckpt"      # where to save model checkpoints
max_output_len = 45              # maximum length of an output sentence
batch_size = 64                  # batch size
emb_dim = 256                    # dimension of the word embedding vectors
hid_dim = 512                    # dimension of the RNN hidden state
n_layers = 4                     # number of RNN layers
dropout = 0.5                    # dropout probability p
learning_rate = 0.0001           # initial learning rate
teacher_forcing_ratio = 0.5      # probability of training on the ground-truth token (teacher forcing)
summary_steps = 6000             # total number of training batches

Load the dataset and the Chinese/English dictionaries from the downloaded data files

# Load the dictionaries
def get_dictionary(root, language):
    with open(os.path.join(root, 'word2int_{}.json'.format(language)), "r") as f:
        word2int = json.load(f)
    with open(os.path.join(root, 'int2word_{}.json'.format(language)), "r") as f:
        int2word = json.load(f)
    print('{} vocab size: {}'.format(language, len(word2int)))
    return word2int, int2word, len(word2int)

word2int_cn, int2word_cn, cn_vocab_size = get_dictionary(data_path, 'cn') # Chinese dictionary
word2int_en, int2word_en, en_vocab_size = get_dictionary(data_path, 'en') # English dictionary
vocab = [word2int_cn, int2word_cn, word2int_en, int2word_en]

# Load the data (training/validation/testing)
def load_data(root, set_name):
    data = []
    with open(os.path.join(root, '{}.txt'.format(set_name)), "r") as f:
        for line in f:
            data.append(line)
    print('{} dataset size: {}'.format(set_name, len(data)))

    return data

training_data = load_data(data_path, 'training')
val_data = load_data(data_path, 'validation')
testing_data = load_data(data_path, 'testing')

'''
Printed output:
cn vocab size: 3805
en vocab size: 3922
training dataset size: 18000
validation dataset size: 500
testing dataset size: 2636
'''

As the printed output shows, the Chinese and English dictionaries contain 3805 and 3922 words respectively, and the training, validation, and test sets contain 18000, 500, and 2636 sentences respectively.
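
The dictionaries are plain word-to-index (and index-to-word) mappings loaded from JSON. A quick sanity check like the sketch below confirms that the special tokens used later ('<PAD>', '<BOS>', '<EOS>', '<UNK>') are present; the index values it prints are whatever the dataset's JSON files define, not values guaranteed here.

# A minimal sanity check of the loaded dictionaries (illustrative only;
# the actual index values depend on how the JSON files were built)
for token in ['<PAD>', '<BOS>', '<EOS>', '<UNK>']:
    print('en {:>5} -> {}'.format(token, word2int_en.get(token)))
    print('cn {:>5} -> {}'.format(token, word2int_cn.get(token)))

# int2word_* uses string keys (as loaded from JSON), e.g. int2word_en['1']
print(int2word_en[str(word2int_en['<BOS>'])])   # should print '<BOS>'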

Prepare the dataset by subclassing torch.utils.data.Dataset and implementing its __len__ and __getitem__ methods

class EN2CNDataset(data.Dataset):
    def __init__(self, data, max_output_len, vocab):
        self.max_output_len = max_output_len
        self.word2int_cn, self.int2word_cn = vocab[0], vocab[1] # Chinese dictionary
        self.word2int_en, self.int2word_en = vocab[2], vocab[3] # English dictionary
        self.data = data

        self.cn_vocab_size = len(self.word2int_cn)
        self.en_vocab_size = len(self.word2int_en)

    def seq_pad(self, label, pad_token):
        # Pad sentences of different lengths to the same length for training
        label = np.pad(label, (0, (self.max_output_len - label.shape[0])), mode='constant', constant_values=pad_token)
        return label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Split the line into the English sentence and the Chinese sentence
        sentences = self.data[index]
        sentences = re.split('[\t\n]', sentences)
        sentences = list(filter(None, sentences))
        assert len(sentences) == 2

        # Special tokens (assumed to share the same indices in both dictionaries)
        BOS = self.word2int_en['<BOS>']
        EOS = self.word2int_en['<EOS>']
        UNK = self.word2int_en['<UNK>']

        # Add '<BOS>' at the start and '<EOS>' at the end of each sentence;
        # words not found in the dictionary are mapped to '<UNK>'
        en, cn = [BOS], [BOS]
        # Tokenize the English sentence and convert it to a vector of dictionary indices
        sentence = re.split(' ', sentences[0])
        sentence = list(filter(None, sentence))
        for word in sentence:
            en.append(self.word2int_en.get(word, UNK))
        en.append(EOS)

        # Tokenize the Chinese sentence and convert it to a vector of dictionary indices
        # e.g. <BOS>, we, are, friends, <EOS> --> 1, 28, 29, 205, 2
        sentence = re.split(' ', sentences[1])
        sentence = list(filter(None, sentence))
        for word in sentence:
            cn.append(self.word2int_cn.get(word, UNK))
        cn.append(EOS)

        en, cn = np.asarray(en), np.asarray(cn)

        # Pad both sentences to the same length with '<PAD>'
        en = self.seq_pad(en, self.word2int_en['<PAD>'])
        cn = self.seq_pad(cn, self.word2int_cn['<PAD>'])
        en, cn = torch.LongTensor(en), torch.LongTensor(cn)

        # Return the English and Chinese index vectors
        return en, cn

train_dataset = EN2CNDataset(training_data, max_output_len, vocab)
val_dataset = EN2CNDataset(val_data, max_output_len, vocab)
test_dataset = EN2CNDataset(testing_data, max_output_len, vocab)
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = data.DataLoader(val_dataset, batch_size=1)
test_loader = data.DataLoader(test_dataset, batch_size=1)

The main processing steps: add start and end tokens ('<BOS>' and '<EOS>') around each sentence; replace any word not found in the dictionary with '<UNK>'; pad every sentence to the same length max_output_len with '<PAD>'; convert each word to its dictionary index, so each sentence is stored as a sequence of indices; and store the English and Chinese sequences separately.
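
To make these steps concrete, here is a tiny hand-worked sketch of the same conversion. The toy dictionary and its index values are made up purely for illustration; the real mappings are the word2int_en / word2int_cn loaded from the JSON files above.

# Toy dictionary (hypothetical indices, for illustration only)
toy_word2int_en = {'<PAD>': 0, '<BOS>': 1, '<EOS>': 2, '<UNK>': 3,
                   'we': 4, 'are': 5, 'friends': 6}

line = 'we are friends .\t我们 是 朋友 。\n'
en_text, cn_text = [s for s in re.split('[\t\n]', line) if s]

en_ids = [toy_word2int_en['<BOS>']]
for w in en_text.split():
    en_ids.append(toy_word2int_en.get(w, toy_word2int_en['<UNK>']))   # '.' is out of vocabulary -> <UNK>
en_ids.append(toy_word2int_en['<EOS>'])

max_len = 10
en_ids = en_ids + [toy_word2int_en['<PAD>']] * (max_len - len(en_ids))
print(en_ids)   # [1, 4, 5, 6, 3, 2, 0, 0, 0, 0]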

Define the Encoder and Decoder classes

class Encoder(nn.Module):
    def __init__(self, en_vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(en_vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input):
        # input = [batch size, sequence len] (token indices)
        embedding = self.embedding(input)
        outputs, hidden = self.rnn(self.dropout(embedding))
        # outputs = [batch size, sequence len, hid_dim * directions]
        # hidden  = [n_layers * directions, batch size, hid_dim]

        return outputs, hidden

class Decoder(nn.Module):
    def __init__(self, cn_vocab_size, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.cn_vocab_size = cn_vocab_size
        self.hid_dim = hid_dim * 2 # because the Encoder is bidirectional
        self.n_layers = n_layers
        self.embedding = nn.Embedding(cn_vocab_size, emb_dim)
        self.input_dim = emb_dim
        self.rnn = nn.GRU(self.input_dim, self.hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.embedding2vocab1 = nn.Linear(self.hid_dim, self.hid_dim * 2)
        self.embedding2vocab2 = nn.Linear(self.hid_dim * 2, self.hid_dim * 4)
        self.embedding2vocab3 = nn.Linear(self.hid_dim * 4, self.cn_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        # input  = [batch size] (index of the previous output token)
        # hidden = [n_layers, batch size, hid_dim * 2]
        # The Decoder is unidirectional, so directions = 1
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        # embedded = [batch size, 1, emb_dim]
        output, hidden = self.rnn(embedded, hidden)
        # output = [batch size, 1, hid_dim * 2]
        # hidden = [n_layers, batch size, hid_dim * 2]

        # Project the RNN output up to the size of the target-language vocabulary
        output = self.embedding2vocab1(output.squeeze(1))
        output = self.embedding2vocab2(output)
        prediction = self.embedding2vocab3(output)
        # prediction = [batch size, vocab size]
        return prediction, hidden

As the code shows, the Encoder is a bidirectional four-layer GRU and the Decoder is a unidirectional four-layer GRU. The Encoder's final hidden states are used to initialize the Decoder's RNN, and the Decoder's outputs are used to compute the loss.
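
The shapes are easiest to see with a quick check on a dummy batch. This is only a sketch using the classes and configuration values defined above (hid_dim=512, n_layers=4); the printed sizes follow directly from those settings. Note how the bidirectional Encoder hidden state is reshaped so the two directions of each layer can be concatenated before being handed to the Decoder.

# Sanity-check the tensor shapes with a dummy batch (illustrative sketch)
enc = Encoder(en_vocab_size, emb_dim, hid_dim, n_layers, dropout)
dec = Decoder(cn_vocab_size, emb_dim, hid_dim, n_layers, dropout)

dummy = torch.randint(0, en_vocab_size, (2, 45))      # [batch=2, seq len=45]
enc_outputs, hidden = enc(dummy)
print(enc_outputs.shape)   # torch.Size([2, 45, 1024])  = [batch, seq len, hid_dim * 2]
print(hidden.shape)        # torch.Size([8, 2, 512])    = [n_layers * 2, batch, hid_dim]

# Merge the forward/backward hidden states of each layer for the Decoder
hidden = hidden.view(n_layers, 2, 2, -1)
hidden = torch.cat((hidden[:, 0], hidden[:, 1]), dim=2)
print(hidden.shape)        # torch.Size([4, 2, 1024])   = [n_layers, batch, hid_dim * 2]

dec_input = torch.randint(0, cn_vocab_size, (2,))      # previous token for each batch element
prediction, hidden = dec(dec_input, hidden)
print(prediction.shape)    # torch.Size([2, 3805])      = [batch, cn_vocab_size]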

Define the Seq2Seq class, then create the model, optimizer, and loss function

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, input, target, teacher_forcing_ratio):
        # input  = [batch size, input len]  (token indices)
        # target = [batch size, target len] (token indices)
        # teacher_forcing_ratio is the probability of feeding the ground-truth token
        batch_size = target.shape[0]
        target_len = target.shape[1]
        vocab_size = self.decoder.cn_vocab_size

        # Tensor to store the Decoder outputs
        outputs = torch.zeros(batch_size, target_len, vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(input)
        # The Encoder's final hidden state is used to initialize the Decoder.
        # Because the Encoder is a bidirectional RNN, the hidden states of the two
        # directions of each layer need to be concatenated.
        # hidden = [n_layers * directions, batch size, hid dim] --> [n_layers, directions, batch size, hid dim]
        hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
        hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)
        # hidden = [n_layers, batch size, hid dim * 2]

        dec_input = target[:, 0] # '<BOS>'
        preds = []
        for t in range(1, target_len):
            output, hidden = self.decoder(dec_input, hidden)
            outputs[:, t] = output
            # Decide whether to use teacher forcing for the next step
            teacher_force = random.random() <= teacher_forcing_ratio
            # Take the most probable token
            top1 = output.argmax(1)
            # If teacher_force is True, feed the ground-truth token; otherwise feed the predicted token
            dec_input = target[:, t] if teacher_force else top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds

    def inference(self, input, target):
        # Evaluate the model (no teacher forcing)
        # input  = [batch size, input len]  (token indices)
        # target = [batch size, target len] (token indices)
        batch_size = input.shape[0]
        input_len = input.shape[1]
        vocab_size = self.decoder.cn_vocab_size

        outputs = torch.zeros(batch_size, input_len, vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(input)
        # As in forward(), the Encoder's final hidden state initializes the Decoder,
        # with the two directions of each layer concatenated.
        # hidden = [n_layers * directions, batch size, hid dim] --> [n_layers, directions, batch size, hid dim]
        hidden = hidden.view(self.encoder.n_layers, 2, batch_size, -1)
        hidden = torch.cat((hidden[:, -2, :, :], hidden[:, -1, :, :]), dim=2)

        dec_input = target[:, 0] # '<BOS>'
        preds = []
        for t in range(1, input_len):
            output, hidden = self.decoder(dec_input, hidden)
            outputs[:, t] = output
            # Take the most probable token
            top1 = output.argmax(1)
            # Always feed the predicted token back in
            dec_input = top1
            preds.append(top1.unsqueeze(1))
        preds = torch.cat(preds, 1)
        return outputs, preds

encoder = Encoder(en_vocab_size, emb_dim, hid_dim, n_layers, dropout)
decoder = Decoder(cn_vocab_size, emb_dim, hid_dim, n_layers, dropout)
model = Seq2Seq(encoder, decoder, device).to(device)
print(model)
loss_function = nn.CrossEntropyLoss(ignore_index=0).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(optimizer)
print('num of parameters: ', sum(p.numel() for p in model.parameters() if p.requires_grad))

As the code shows, the Decoder predicts one token at a time. Teacher forcing is used during training but not during validation or testing. The loss function is constructed with ignore_index=0, which excludes positions whose target class is 0 from the loss, because class 0 here corresponds to the '<PAD>' token.
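
As a quick illustration of ignore_index, here is a standalone sketch with made-up logits and labels (not taken from the model above): positions labeled 0 contribute nothing to the loss, so the padding added by seq_pad does not distort training.

# Tiny demonstration of ignore_index: the second position has label 0 ('<PAD>')
# and is excluded from the averaged loss
criterion = nn.CrossEntropyLoss(ignore_index=0)
logits = torch.tensor([[2.0, 0.1, 0.3],     # predicted scores for a 3-word toy vocabulary
                       [0.5, 1.5, 0.2],
                       [0.1, 0.2, 3.0]])
labels = torch.tensor([1, 0, 2])            # the middle target is '<PAD>' (index 0)
print(criterion(logits, labels))            # averaged over the two non-PAD positions only

# Same idea as the teacher-forcing coin flip used in Seq2Seq.forward:
use_ground_truth = random.random() <= teacher_forcing_ratio   # True with probability 0.5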

Define some utility functions: saving and loading model parameters, computing the BLEU score, converting predicted indices back to text, and creating an infinite iterator over training batches (a small usage example follows the code below).

def save_model(model, optimizer, store_model_path, step):
    # Create the checkpoint directory if it does not exist yet
    os.makedirs(store_model_path, exist_ok=True)
    torch.save(model.state_dict(), '{}/model_{}.ckpt'.format(store_model_path, step))
    return

def load_model(model, load_model_path):
    print('Load model from {}'.format(load_model_path))
    model.load_state_dict(torch.load('{}.ckpt'.format(load_model_path)))
    return model

def computebleu(sentences, targets):
    score = 0
    assert (len(sentences) == len(targets))

    def cut_token(sentence):
        # Split tokens into characters, except '<UNK>', digits, and single-byte (ASCII) tokens
        tmp = []
        for token in sentence:
            if token == '<UNK>' or token.isdigit() or len(bytes(token[0], encoding='utf-8')) == 1:
                tmp.append(token)
            else:
                tmp += [word for word in token]
        return tmp

    for sentence, target in zip(sentences, targets):
        sentence = cut_token(sentence)
        target = cut_token(target)
        score += sentence_bleu([target], sentence, weights=(1, 0, 0, 0))
    return score

def tokens2sentence(outputs, int2word):
    # Convert index sequences back to words, stopping at '<EOS>'
    sentences = []
    for tokens in outputs:
        sentence = []
        for token in tokens:
            word = int2word[str(int(token))]
            if word == '<EOS>':
                break
            sentence.append(word)
        sentences.append(sentence)
    return sentences

def infinite_iter(data_loader):
    # Loop over the DataLoader forever, restarting it whenever it is exhausted
    it = iter(data_loader)
    while True:
        try:
            ret = next(it)
            yield ret
        except StopIteration:
            it = iter(data_loader)
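
As a quick illustration of how the last three helpers fit together, here is a sketch with hypothetical token indices (the real indices come from int2word_cn):

# Hypothetical decoded index sequences (values made up for illustration)
fake_int2word = {'1': '<BOS>', '2': '<EOS>', '10': '我', '11': '们', '12': '是', '13': '朋', '14': '友'}
pred_ids   = [[10, 11, 12, 13, 14, 2, 0, 0]]    # model prediction: tokens, then '<EOS>', then padding
target_ids = [[10, 11, 12, 13, 14, 2, 0, 0]]    # ground truth

preds   = tokens2sentence(pred_ids, fake_int2word)    # [['我', '们', '是', '朋', '友']]
targets = tokens2sentence(target_ids, fake_int2word)
print(computebleu(preds, targets))                    # 1.0 for a perfect match

# infinite_iter lets the training loop draw batches without worrying about epochs
train_iter = infinite_iter(train_loader)
sources, targets_batch = next(train_iter)             # one [batch_size, max_output_len] pair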

Training and validation: every 300 training batches, run validation and save the model

model.train()
model.zero_grad()
train_losses, val_losses, val_bleu_scores = [], [], []
loss_sum = 0.0
train_iter = infinite_iter(train_loader)

for step in range(summary_steps):
    model.train()
    sources, targets = next(train_iter)
    sources, targets = sources.to(device), targets.to(device)
    outputs, preds = model(sources, targets, teacher_forcing_ratio)
    # The first token of targets is '<BOS>', so it is ignored
    outputs = outputs[:, 1:].reshape(-1, outputs.size(2))
    targets = targets[:, 1:].reshape(-1)
    loss = loss_function(outputs, targets)

    optimizer.zero_grad()
    loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    optimizer.step()

    loss_sum += loss.item()
    if (step + 1) % 10 == 0:
        loss_sum = loss_sum / 10
        print ("\r", "train [{}] loss: {:.3f}, Perplexity: {:.3f}".format(step+1, loss_sum, np.exp(loss_sum)), end=" ")
        train_losses.append(loss_sum)
        loss_sum = 0.0

        if (step + 1) % 300 == 0:
            # Every 300 training batches, run validation and save the model
            model.eval()
            loss_val, bleu_val = 0.0, 0.0
            n = 0
            for sources_val, targets_val in val_loader:
                sources_val, targets_val = sources_val.to(device), targets_val.to(device)
                batch_size = sources_val.size(0)
                outputs_val, preds_val = model.inference(sources_val, targets_val)
                # The first token of targets is '<BOS>', so it is ignored
                outputs_val = outputs_val[:, 1:].reshape(-1, outputs_val.size(2))
                targets_val = targets_val[:, 1:].reshape(-1)
                loss = loss_function(outputs_val, targets_val)
                loss_val += loss.item()

                # Convert the predictions back to text
                targets_val = targets_val.view(sources_val.size(0), -1)
                preds_val = tokens2sentence(preds_val, int2word_cn)
                sources_val = tokens2sentence(sources_val, int2word_en)
                targets_val = tokens2sentence(targets_val, int2word_cn)
                # Compute the BLEU score
                bleu_val += computebleu(preds_val, targets_val)
                n += batch_size
            loss_val = loss_val / len(val_loader)
            bleu_val = bleu_val / n
            val_losses.append(loss_val)
            val_bleu_scores.append(bleu_val)
            print ("\n", "val [{}] loss: {:.3f}, Perplexity: {:.3f}, bleu score: {:.3f} ".format(step+1, loss_val, np.exp(loss_val), bleu_val))

            # Save the model
            save_model(model, optimizer, store_model_path, step+1)

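The training loop records train_losses, val_losses, and val_bleu_scores but does not plot them. If matplotlib is available (it is not used anywhere in the original code, so this is purely an optional sketch), the curves can be visualized like this:

# Plot the recorded training/validation curves (assumes matplotlib is installed)
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 4))
plt.subplot(1, 2, 1)
plt.plot(np.arange(10, summary_steps + 1, 10), train_losses, label='train loss')
plt.plot(np.arange(300, summary_steps + 1, 300), val_losses, label='val loss')
plt.xlabel('training batches')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(np.arange(300, summary_steps + 1, 300), val_bleu_scores, label='val BLEU')
plt.xlabel('training batches')
plt.legend()
plt.savefig('curves.png')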

Test the model and save the results

load_model_path = "./ckpt/model_6000"      # path of the checkpoint to load

model = load_model(model, load_model_path) # load the model
model.to(device)
model.eval()
# Test the model
loss_test, bleu_test = 0.0, 0.0
n = 0
result = []
for sources_test, targets_test in test_loader:
    sources_test, targets_test = sources_test.to(device), targets_test.to(device)
    batch_size = sources_test.size(0)
    outputs_test, preds_test = model.inference(sources_test, targets_test)
    # The first token of targets is '<BOS>', so it is ignored
    outputs_test = outputs_test[:, 1:].reshape(-1, outputs_test.size(2))
    targets_test = targets_test[:, 1:].reshape(-1)
    loss = loss_function(outputs_test, targets_test)
    loss_test += loss.item()

    # Convert the predictions back to text
    targets_test = targets_test.view(sources_test.size(0), -1)
    preds_test = tokens2sentence(preds_test, int2word_cn)
    sources_test = tokens2sentence(sources_test, int2word_en)
    targets_test = tokens2sentence(targets_test, int2word_cn)
    for source, pred, target in zip(sources_test, preds_test, targets_test):
        result.append((source, pred, target))
    # Compute the BLEU score
    bleu_test += computebleu(preds_test, targets_test)
    n += batch_size
loss_test = loss_test / len(test_loader)
bleu_test = bleu_test / n
print('test loss: {}, bleu_score: {}'.format(loss_test, bleu_test))
# Save the results
with open('./test_output.txt', 'w') as f:
    for line in result:
        print(line, file=f)
