nlp-tutorial Study Notes 1.2
nlp-tutorial is a hands-on NLP project on GitHub that implements a number of classic NLP models in PyTorch. As an NLP beginner, I am writing down what I learned while working through it. These notes look at the code only; I do not go into the papers themselves. 😃
1.2 Word2Vec
Overall idea of the model:
A target word is picked at random from the corpus as the input, and the label is one of the words next to it. Concretely, the model takes a randomly chosen word and is trained to output either the word before it or the word after it. (Whether it is the previous or the next word comes out of the random sampling; see the data-processing part below.) For example, in the phrase "apple banana fruit" with banana as the target, the training pairs are (banana, apple) and (banana, fruit).
Model definition:
class Word2Vec(nn.Module):
    def __init__(self):
        super(Word2Vec, self).__init__()
        # W and WT are not transposes of each other; only their shapes look transposed
        self.W = nn.Linear(voc_size, embedding_size, bias=False)   # voc_size -> embedding_size
        self.WT = nn.Linear(embedding_size, voc_size, bias=False)  # embedding_size -> voc_size

    def forward(self, X):
        # X : [batch_size, voc_size]; each input sample is a one-hot vector
        hidden_layer = self.W(X)  # hidden_layer : [batch_size, embedding_size]
        # each sample ends up as a vector of length voc_size, and the loss is then
        # computed against the class label with nn.CrossEntropyLoss()
        output_layer = self.WT(hidden_layer)  # output_layer : [batch_size, voc_size]
        return output_layer
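Since each input is a one-hot vector, multiplying it by self.W simply selects one column of W.weight, i.e. it is an embedding lookup. Here is a minimal sketch of that equivalence (the tiny voc_size and the index k are made up purely for illustration):
import torch
import torch.nn as nn

voc_size, embedding_size = 8, 2            # tiny sizes, just for illustration

W = nn.Linear(voc_size, embedding_size, bias=False)
emb = nn.Embedding(voc_size, embedding_size)
emb.weight.data = W.weight.data.T.clone()  # copy the same weights, shape [voc_size, embedding_size]

k = 3                                      # some word index
one_hot = torch.eye(voc_size)[k]           # one-hot input, shape [voc_size]
via_linear = W(one_hot)                    # matrix-product path, shape [embedding_size]
via_lookup = emb(torch.tensor(k))          # table-lookup path, same vector

print(torch.allclose(via_linear, via_lookup))  # True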
Data processing:
# randomly sample (target, context) pairs from skip_grams to build the inputs and labels
def random_batch():
    random_inputs = []
    random_labels = []
    # pick batch_size pairs at random, without replacement
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False)
    for i in random_index:
        # for a chosen pair such as [2, 1], turn the first index into a one-hot vector: 2 -> (0, 0, 1, 0, ..., 0)
        random_inputs.append(np.eye(voc_size)[skip_grams[i][0]])
        # the label is the second index of the pair, i.e. the word adjacent to the first one
        random_labels.append(skip_grams[i][1])
    return random_inputs, random_labels
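As a quick sanity check (this assumes skip_grams, batch_size and voc_size have already been set up as in the __main__ block below), a sampled batch looks like this:
inputs, labels = random_batch()
print(np.array(inputs).shape)   # (batch_size, voc_size): one one-hot row per sampled target word
print(labels)                   # e.g. [4, 0]: indices of the corresponding context words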
if __name__ == '__main__':
    batch_size = 2       # mini-batch size
    embedding_size = 2   # embedding size

    sentences = ["apple banana fruit", "banana orange fruit", "orange banana fruit",
                 "dog cat animal", "cat monkey animal", "monkey dog animal"]

    # all sentences are joined into one long word sequence, so across sentence boundaries
    # (e.g. the third and fourth sentences) the context of dog includes not only cat but also fruit
    word_sequence = " ".join(sentences).split()
    word_list = " ".join(sentences).split()
    word_list = list(set(word_list))
    word_dict = {w: i for i, w in enumerate(word_list)}
    voc_size = len(word_list)

    # for every word, build (target, neighbor) pairs of word indices
    # e.g. if word_dict maps apple:0, banana:1, fruit:2, then skip_grams contains [1, 0] and [1, 2]
    skip_grams = []
    for i in range(1, len(word_sequence) - 1):
        target = word_dict[word_sequence[i]]
        context = [word_dict[word_sequence[i - 1]], word_dict[word_sequence[i + 1]]]
        for w in context:
            skip_grams.append([target, w])

    # each training step calls random_batch() to randomly sample the inputs and labels
    for epoch in range(5000):
        input_batch, target_batch = random_batch()
        input_batch = torch.Tensor(input_batch)
        target_batch = torch.LongTensor(target_batch)
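To double-check the pair construction above, the first few entries of skip_grams can be decoded back to words (id2word is a small helper added here only for inspection; it is not part of the original script):
id2word = {i: w for w, i in word_dict.items()}
for target, context in skip_grams[:4]:
    print(id2word[target], '->', id2word[context])
# for this word_sequence the first pairs are:
# banana -> apple, banana -> fruit, fruit -> banana, fruit -> banana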
There is not much left to explain after that: the remainder is just the training loop with gradient descent.
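One detail worth noting: nn.CrossEntropyLoss applies log-softmax internally and expects raw scores plus integer class labels, which is why target_batch is a LongTensor of word indices rather than one-hot vectors. A tiny standalone check (the shapes and values below are made up for illustration):
criterion = nn.CrossEntropyLoss()
scores = torch.randn(2, 8)           # fake logits: batch_size=2, voc_size=8
labels = torch.LongTensor([3, 5])    # context-word indices, not one-hot
print(criterion(scores, labels))     # a single scalar loss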
The full code is at https://github.com/graykode/nlp-tutorial
I am pasting it here as well (not my code; it is copied from the repo):
# %%
# code by Tae Hwan Jung @graykode
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

def random_batch():
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False)

    for i in random_index:
        random_inputs.append(np.eye(voc_size)[skip_grams[i][0]])  # target
        random_labels.append(skip_grams[i][1])  # context word

    return random_inputs, random_labels

# Model
class Word2Vec(nn.Module):
    def __init__(self):
        super(Word2Vec, self).__init__()

        # W and WT are not transposes of each other
        self.W = nn.Linear(voc_size, embedding_size, bias=False)  # voc_size > embedding_size Weight
        self.WT = nn.Linear(embedding_size, voc_size, bias=False)  # embedding_size > voc_size Weight

    def forward(self, X):
        # X : [batch_size, voc_size]
        hidden_layer = self.W(X)  # hidden_layer : [batch_size, embedding_size]
        output_layer = self.WT(hidden_layer)  # output_layer : [batch_size, voc_size]
        return output_layer

if __name__ == '__main__':
    batch_size = 2  # mini-batch size
    embedding_size = 2  # embedding size

    sentences = ["apple banana fruit", "banana orange fruit", "orange banana fruit",
                 "dog cat animal", "cat monkey animal", "monkey dog animal"]

    word_sequence = " ".join(sentences).split()
    word_list = " ".join(sentences).split()
    word_list = list(set(word_list))
    word_dict = {w: i for i, w in enumerate(word_list)}
    voc_size = len(word_list)

    # Make skip gram of one size window
    skip_grams = []
    for i in range(1, len(word_sequence) - 1):
        target = word_dict[word_sequence[i]]
        context = [word_dict[word_sequence[i - 1]], word_dict[word_sequence[i + 1]]]
        for w in context:
            skip_grams.append([target, w])

    model = Word2Vec()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training
    for epoch in range(5000):
        input_batch, target_batch = random_batch()
        input_batch = torch.Tensor(input_batch)
        target_batch = torch.LongTensor(target_batch)

        optimizer.zero_grad()
        output = model(input_batch)

        # output : [batch_size, voc_size], target_batch : [batch_size] (LongTensor, not one-hot)
        loss = criterion(output, target_batch)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        loss.backward()
        optimizer.step()

    for i, label in enumerate(word_list):
        W, WT = model.parameters()
        x, y = W[0][i].item(), W[1][i].item()
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    plt.show()
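After training, the learned embedding of a word is the corresponding column of W (the first Linear layer's weight). A minimal sketch of how to read it out, assuming the trained model and word_dict from the script above are still in scope (word_vector is a hypothetical helper, not part of the original code):
def word_vector(word):
    idx = word_dict[word]                # look up the word's integer id
    return model.W.weight.data[:, idx]   # column idx of W, shape: [embedding_size]

print(word_vector('apple'))              # a 2-dimensional embedding vector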
