NLP-tutorial Study Notes 1.1
nlp-tutorial is a hands-on NLP beginner project on GitHub that implements a number of classic NLP models in PyTorch. As an NLP newcomer, I am recording my takeaways while working through it. These notes look at things purely from the code's point of view and do not go through the original papers. 😃
1.1 NNLM
Overall model: split each sentence into words, map each word to its embedding vector, and use the first n-1 words of a sentence to predict the last word. For example, for "i like dog" the model takes ["i", "like"] as input and is trained to predict "dog".
Model definition:
class NNLM(nn.Module):
    def __init__(self):
        super(NNLM, self).__init__()
        self.C = nn.Embedding(n_class, m)
        self.H = nn.Linear(n_step * m, n_hidden, bias=False)
        self.d = nn.Parameter(torch.ones(n_hidden))
        self.U = nn.Linear(n_hidden, n_class, bias=False)
        self.W = nn.Linear(n_step * m, n_class, bias=False)
        self.b = nn.Parameter(torch.ones(n_class))

    def forward(self, X):
        # Embedding layer
        X = self.C(X)  # X : [batch_size, n_step, m]
        # Concatenate the embeddings of the first n-1 words of each sentence;
        # n_step is the sentence length minus 1, m is the embedding size
        X = X.view(-1, n_step * m)  # [batch_size, n_step * m]
        # Hidden fully connected layer plus bias
        tanh = torch.tanh(self.d + self.H(X))  # [batch_size, n_hidden]
        # Output = bias + direct input-to-output connection + hidden-to-output connection
        output = self.b + self.W(X) + self.U(tanh)  # [batch_size, n_class]
        # A softmax still has to be applied to this output to turn it into class probabilities
        return output
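To make the tensor shapes concrete, here is a minimal sketch of my own (not part of the original repo) that assumes the NNLM class above is defined, sets the same toy hyperparameters used in the full script further down (n_step=2, m=2, n_hidden=2, and n_class=7 for the 7-word vocabulary), and runs a single forward pass:

import torch
import torch.nn as nn

# Toy hyperparameters, matching the full script below (assumed values)
n_step, m, n_hidden, n_class = 2, 2, 2, 7

model = NNLM()
# 3 "sentences", each represented by n-1 = 2 word indices
dummy_input = torch.randint(0, n_class, (3, n_step))
out = model(dummy_input)
print(dummy_input.shape, out.shape)  # torch.Size([3, 2]) torch.Size([3, 7])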
Data processing:
sentences = ["i like dog", "i love coffee", "i hate milk"]

# Build word <-> index dictionaries
word_list = " ".join(sentences).split()
word_list = list(set(word_list))
word_dict = {w: i for i, w in enumerate(word_list)}    # word -> index
number_dict = {i: w for i, w in enumerate(word_list)}  # index -> word
n_class = len(word_dict)  # vocabulary size

# Function that builds the input batch
def make_batch():
    input_batch = []
    target_batch = []
    # For each sentence: split it into words, convert them to indices,
    # use the first n-1 indices as input and the last one as target
    for sen in sentences:
        word = sen.split()  # space tokenizer
        input = [word_dict[n] for n in word[:-1]]  # words 1..n-1 as input
        target = word_dict[word[-1]]  # word n as target (a "causal language model")
        input_batch.append(input)
        target_batch.append(target)
    return input_batch, target_batch

# Build input_batch and target_batch
input_batch, target_batch = make_batch()
input_batch = torch.LongTensor(input_batch)
target_batch = torch.LongTensor(target_batch)
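For the three example sentences the batch contents are easy to verify by hand. The exact indices depend on how set() happens to order the vocabulary, so the mapping below is only illustrative:

# Hypothetical mapping, e.g.
# word_dict = {'i': 0, 'like': 1, 'dog': 2, 'love': 3, 'coffee': 4, 'hate': 5, 'milk': 6}
# "i like dog"    -> input [0, 1], target 2
# "i love coffee" -> input [0, 3], target 4
# "i hate milk"   -> input [0, 5], target 6
print(input_batch)   # LongTensor of shape [3, 2]
print(target_batch)  # LongTensor of shape [3]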
Now a closer look at the design of the loss function. The code uses:
criterion = nn.CrossEntropyLoss()
During training the loss is computed as:
loss = criterion(output, target_batch)
# output : [batch_size, n_class], target_batch : [batch_size]
Note that output and target_batch have different shapes, so how is the loss computed?
nn.CrossEntropyLoss() is the combination of nn.LogSoftmax() and nn.NLLLoss(): it first applies softmax to output, then takes the log, and finally picks out the entry at index target and negates it to obtain the loss. For example, if the true class is 0 and output is strongly peaked on class 0, the softmax value for class 0 is close to 1, its log is close to 0, and so the loss is close to 0, meaning the prediction is essentially correct.
nn.LogSoftmax() applies the following operation to its input:

$\text{LogSoftmax}(x_i) = \log\left(\frac{\exp(x_i)}{\sum_j \exp(x_j)}\right)$
nn.NLLLoss(output, c) takes the c-th entry of the (log-probability) vector and negates it; over a batch the per-sample losses are averaged by default.
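As a quick sanity check (a minimal sketch of my own, not from the repo), the following shows numerically that CrossEntropyLoss is the same as LogSoftmax followed by NLLLoss:

import torch
import torch.nn as nn

logits = torch.tensor([[2.0, 0.5, -1.0],
                       [0.1, 1.5,  0.3]])  # [batch_size=2, n_class=3]
targets = torch.tensor([0, 1])             # [batch_size=2]

# One-step version
ce = nn.CrossEntropyLoss()(logits, targets)

# Two-step version: log-softmax, then negate the target entries and average
log_probs = nn.LogSoftmax(dim=1)(logits)
nll = nn.NLLLoss()(log_probs, targets)

print(ce.item(), nll.item())  # the two values are identical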
The full code is at https://github.com/graykode/nlp-tutorial
I also paste it here (it is not mine; I am only sharing the original author's code):
# %%
# code by Tae Hwan Jung @graykode
import torch
import torch.nn as nn
import torch.optim as optim

def make_batch():
    input_batch = []
    target_batch = []

    for sen in sentences:
        word = sen.split()  # space tokenizer
        input = [word_dict[n] for n in word[:-1]]  # create (1~n-1) as input
        target = word_dict[word[-1]]  # create (n) as target, we usually call this 'causal language model'

        input_batch.append(input)
        target_batch.append(target)

    return input_batch, target_batch

# Model
class NNLM(nn.Module):
    def __init__(self):
        super(NNLM, self).__init__()
        self.C = nn.Embedding(n_class, m)
        self.H = nn.Linear(n_step * m, n_hidden, bias=False)
        self.d = nn.Parameter(torch.ones(n_hidden))
        self.U = nn.Linear(n_hidden, n_class, bias=False)
        self.W = nn.Linear(n_step * m, n_class, bias=False)
        self.b = nn.Parameter(torch.ones(n_class))

    def forward(self, X):
        X = self.C(X)  # X : [batch_size, n_step, m]
        X = X.view(-1, n_step * m)  # [batch_size, n_step * m]
        tanh = torch.tanh(self.d + self.H(X))  # [batch_size, n_hidden]
        output = self.b + self.W(X) + self.U(tanh)  # [batch_size, n_class]
        return output

if __name__ == '__main__':
    n_step = 2  # number of steps, n-1 in paper
    n_hidden = 2  # number of hidden size, h in paper
    m = 2  # embedding size, m in paper

    sentences = ["i like dog", "i love coffee", "i hate milk"]

    word_list = " ".join(sentences).split()
    word_list = list(set(word_list))
    word_dict = {w: i for i, w in enumerate(word_list)}
    number_dict = {i: w for i, w in enumerate(word_list)}
    n_class = len(word_dict)  # number of Vocabulary

    model = NNLM()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    input_batch, target_batch = make_batch()
    input_batch = torch.LongTensor(input_batch)
    target_batch = torch.LongTensor(target_batch)

    # Training
    for epoch in range(5000):
        optimizer.zero_grad()
        output = model(input_batch)

        # output : [batch_size, n_class], target_batch : [batch_size]
        loss = criterion(output, target_batch)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        loss.backward()
        optimizer.step()

    # Predict
    predict = model(input_batch).data.max(1, keepdim=True)[1]

    # Test
    print([sen.split()[:2] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze()])
# %%
