[Code Walkthrough] Position-aware Attention and Supervised Data Improve Slot Filling
Because this paper builds a simple relation extraction model from an LSTM plus attention, we focus only on the model code here; the remaining data processing and training/testing code is similar to that covered in https://www.cnblogs.com/Harukaze/p/14253446.html and is not discussed further.
The underlying LSTM model and the embedding setup:
  1 class RelationModel(object):
  2     """ A wrapper class for the training and evaluation of models.
  3     """
  4     def __init__(self, opt, emb_matrix=None):
  5         self.opt = opt
  6         self.model = PositionAwareRNN(opt, emb_matrix)
  7         self.criterion = nn.CrossEntropyLoss()
  8         self.parameters = [p for p in self.model.parameters() if p.requires_grad]
  9         if opt['cuda']:
 10             self.model.cuda()
 11             self.criterion.cuda()
 12         self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
 13 
 14     def update(self, batch):
 15         """ Run a step of forward and backward model update. """
 16         if self.opt['cuda']:
 17             inputs = [b.cuda() for b in batch[:7]]
 18             labels = batch[7].cuda()
 19         else:
 20             inputs = [b for b in batch[:7]]
 21             labels = batch[7]
 22 
 23         # step forward
 24         self.model.train()
 25         self.optimizer.zero_grad()
 26         logits, _ = self.model(inputs)         # logits.shape = [b, 42]
 27         loss = self.criterion(logits, labels)  # labels.shape = [b]
 28 
 29         # backward
 30         loss.backward()
 31         torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.opt['max_grad_norm'])
 32         self.optimizer.step()
 33         loss_val = loss.data.item()
 34         return loss_val
 35 
 36     def predict(self, batch, unsort=True):
 37         """ Run forward prediction. If unsort is True, recover the original order of the batch. """
 38         if self.opt['cuda']:
 39             inputs = [b.cuda() for b in batch[:7]]
 40             labels = batch[7].cuda()
 41         else:
 42             inputs = [b for b in batch[:7]]
 43             labels = batch[7]
 44 
 45         orig_idx = batch[8]
 46 
 47         # forward
 48         self.model.eval()
 49         logits, _ = self.model(inputs)
 50         loss = self.criterion(logits, labels)  # logits.shape = [b, 42], labels.shape = [b]
 51         probs = F.softmax(logits, dim=1).data.cpu().numpy().tolist()
 52         predictions = np.argmax(logits.data.cpu().numpy(), axis=1).tolist()  # len(predictions) = b
 53         if unsort:
 54             _, predictions, probs = [list(t) for t in zip(*sorted(zip(orig_idx,
 55                     predictions, probs)))]
 56         return predictions, probs, loss.data.item()
 57 
 58     def update_lr(self, new_lr):
 59         torch_utils.change_lr(self.optimizer, new_lr)
 60 
 61     def save(self, filename, epoch):
 62         params = {
 63             'model': self.model.state_dict(),
 64             'config': self.opt,
 65             'epoch': epoch
 66         }
 67         try:
 68             torch.save(params, filename)
 69             print("model saved to {}".format(filename))
 70         except BaseException:
 71             print("[Warning: Saving failed... continuing anyway.]")
 72 
 73     def load(self, filename):
 74         try:
 75             checkpoint = torch.load(filename)
 76         except BaseException:
 77             print("Cannot load model from {}".format(filename))
 78             exit()
 79         self.model.load_state_dict(checkpoint['model'])
 80         self.opt = checkpoint['config']
 81 
 82 class PositionAwareRNN(nn.Module):
 83     """ A sequence model for relation extraction. """
 84 
 85     def __init__(self, opt, emb_matrix=None):
 86         super(PositionAwareRNN, self).__init__()
 87         self.drop = nn.Dropout(opt['dropout'])
 88         self.emb = nn.Embedding(opt['vocab_size'], opt['emb_dim'], padding_idx=constant.PAD_ID)
 89         if opt['pos_dim'] > 0:
 90             self.pos_emb = nn.Embedding(len(constant.POS_TO_ID), opt['pos_dim'],
 91                     padding_idx=constant.PAD_ID)
 92         if opt['ner_dim'] > 0:
 93             self.ner_emb = nn.Embedding(len(constant.NER_TO_ID), opt['ner_dim'],
 94                     padding_idx=constant.PAD_ID)
 95 
 96         input_size = opt['emb_dim'] + opt['pos_dim'] + opt['ner_dim']
 97         self.rnn = nn.LSTM(input_size, opt['hidden_dim'], opt['num_layers'], batch_first=True,
 98                 dropout=opt['dropout'])
 99         self.linear = nn.Linear(opt['hidden_dim'], opt['num_class'])
100 
101         if opt['attn']:
102             self.attn_layer = layers.PositionAwareAttention(opt['hidden_dim'],
103                     opt['hidden_dim'], 2*opt['pe_dim'], opt['attn_dim'])
104             self.pe_emb = nn.Embedding(constant.MAX_LEN * 2 + 1, opt['pe_dim'])
105             # pe_dim = position embedding dimension = 30
106 
107         self.opt = opt
108         self.topn = self.opt.get('topn', 1e10)
109         self.use_cuda = opt['cuda']
110         self.emb_matrix = emb_matrix
111         self.init_weights()
112 
113     def init_weights(self):
114         if self.emb_matrix is None:
115             self.emb.weight.data[1:,:].uniform_(-1.0, 1.0)  # keep padding dimension to be 0
116         else:
117             self.emb_matrix = torch.from_numpy(self.emb_matrix)
118             self.emb.weight.data.copy_(self.emb_matrix)
119         if self.opt['pos_dim'] > 0:
120             self.pos_emb.weight.data[1:,:].uniform_(-1.0, 1.0)
121         if self.opt['ner_dim'] > 0:
122             self.ner_emb.weight.data[1:,:].uniform_(-1.0, 1.0)
123 
124         self.linear.bias.data.fill_(0)
125         init.xavier_uniform_(self.linear.weight, gain=1)  # initialize linear layer
126         if self.opt['attn']:
127             self.pe_emb.weight.data.uniform_(-1.0, 1.0)
128 
129         # decide finetuning
130         if self.topn <= 0:
131             print("Do not finetune word embedding layer.")
132             self.emb.weight.requires_grad = False
133         elif self.topn < self.opt['vocab_size']:
134             print("Finetune top {} word embeddings.".format(self.topn))
135             self.emb.weight.register_hook(lambda x:
136                     torch_utils.keep_partial_grad(x, self.topn))
137         else:
138             print("Finetune all embeddings.")
139 
140     def zero_state(self, batch_size):
141         state_shape = (self.opt['num_layers'], batch_size, self.opt['hidden_dim'])
142         h0 = c0 = torch.zeros(*state_shape, requires_grad=False)
143         if self.use_cuda:
144             return h0.cuda(), c0.cuda()
145         else:
146             return h0, c0
147 
148     def forward(self, inputs):
149         words, masks, pos, ner, deprel, subj_pos, obj_pos = inputs  # unpack
150         seq_lens = list(masks.data.eq(constant.PAD_ID).long().sum(1).squeeze())  # masks are 1 at PAD positions, so eq(0) counts the real tokens
151         batch_size = words.size()[0]
152 
153         # embedding lookup
154         word_inputs = self.emb(words)  # words: [b, maxlen] -> word_inputs: [b, maxlen, 300]
155         inputs = [word_inputs]
156         if self.opt['pos_dim'] > 0:
157             inputs += [self.pos_emb(pos)]
158         if self.opt['ner_dim'] > 0:
159             inputs += [self.ner_emb(ner)]
160         inputs = self.drop(torch.cat(inputs, dim=2))  # add dropout to input, [b, maxlen, 300+30+30]
161         input_size = inputs.size(2)
162 
163         # rnn
164         h0, c0 = self.zero_state(batch_size)
165         inputs = nn.utils.rnn.pack_padded_sequence(inputs, seq_lens, batch_first=True)
166         outputs, (ht, ct) = self.rnn(inputs, (h0, c0))  # outputs (once unpacked below): [b, maxlen, 200]
167         outputs, output_lens = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
168         hidden = self.drop(ht[-1,:,:])  # get the outmost layer h_n, i.e. the last layer's final state, then apply dropout
169         outputs = self.drop(outputs)
170 
171         # attention
172         if self.opt['attn']:
173             # convert all negative PE numbers to positive indices
174             # e.g., -2 -1 0 1 will be mapped to 98 99 100 101
175             subj_pe_inputs = self.pe_emb(subj_pos + constant.MAX_LEN)
176             obj_pe_inputs = self.pe_emb(obj_pos + constant.MAX_LEN)  # many implementations shift such position features to non-negative indices like this
177             pe_features = torch.cat((subj_pe_inputs, obj_pe_inputs), dim=2)  # pe_features.shape = [b, maxlen, 30+30]
178             final_hidden = self.attn_layer(outputs, masks, hidden, pe_features)
179         else:
180             final_hidden = hidden  # final_hidden.shape = [b, 200]
181 
182         logits = self.linear(final_hidden)  # [b, 200] -> [b, 42]
183         return logits, final_hidden
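Lines 133-136 fine-tune only the embeddings of the top-n most frequent words through a gradient hook; torch_utils.keep_partial_grad itself is not shown in this post. A minimal sketch of such a hook, assuming it simply zeroes the gradient rows beyond the first topn entries, could look like this:

import torch
import torch.nn as nn

def keep_partial_grad(grad, topk):
    # zero the gradient rows beyond the first `topk` embeddings so that
    # only the most frequent words are fine-tuned (assumed behaviour, not the repo's exact code)
    grad.data[topk:].zero_()
    return grad

emb = nn.Embedding(10, 4)                                    # toy vocabulary of 10 words
emb.weight.register_hook(lambda g: keep_partial_grad(g, 3))
emb(torch.arange(10)).sum().backward()
print(emb.weight.grad[3:].abs().sum())                       # tensor(0.): rows 3..9 receive no gradient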
Line 178, final_hidden = self.attn_layer(outputs, masks, hidden, pe_features), shows that the outputs of the two-layer unidirectional LSTM, the padding mask matrix, the LSTM's final hidden state $h_n[-1]$, and the processed position embeddings pe_features are all fed into the attention layer.
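To make the relative-position lookup at lines 173-177 concrete, here is a small sketch; MAX_LEN = 100 is assumed from the "-2 -1 0 1 -> 98 99 100 101" comment, and pe_dim = 30 comes from the shape notes above:

import torch
import torch.nn as nn

MAX_LEN = 100                                 # assumed value of constant.MAX_LEN
pe_emb = nn.Embedding(MAX_LEN * 2 + 1, 30)    # 201 possible relative positions, pe_dim = 30

subj_pos = torch.tensor([[-2, -1, 0, 1, 2]])  # token offsets relative to the subject entity
print(subj_pos + MAX_LEN)                     # tensor([[ 98,  99, 100, 101, 102]])
print(pe_emb(subj_pos + MAX_LEN).shape)       # torch.Size([1, 5, 30])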
A quick example of Python's built-in sum() applied to a list of tensors, to help understand the sum() operation in the attention layer below:

import torch

a = [torch.Tensor([1, 2, 3]), torch.Tensor([1, 2, 3]), torch.Tensor([1, 2, 3])]
print(sum(a))  # tensor([3., 6., 9.])
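And one more small helper example: masking scores with -inf before softmax, which is how padded positions end up with zero attention weight in the layer below:

import torch
import torch.nn.functional as F

scores = torch.tensor([[1.0, 2.0, 3.0, 0.0]])
mask = torch.tensor([[False, False, False, True]])  # the last position is padding
scores = scores.masked_fill(mask, -float('inf'))
print(F.softmax(scores, dim=1))                     # tensor([[0.0900, 0.2447, 0.6652, 0.0000]])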
import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionAwareAttention(nn.Module):
    """
    A position-augmented attention layer where the attention weight is
    a = T' . tanh(Ux + Vq + Wf)
    where x is the input, q is the query, and f is additional position features.
    """

    def __init__(self, input_size, query_size, feature_size, attn_size):
        super(PositionAwareAttention, self).__init__()
        self.input_size = input_size      # input_size = hidden_dim = 200
        self.query_size = query_size      # query_size = hidden_dim = 200
        self.feature_size = feature_size  # feature_size = 2*30 = 60
        self.attn_size = attn_size        # attn_size = attn_dim = 200
        self.ulinear = nn.Linear(input_size, attn_size)              # matrix U: [200, 200]
        self.vlinear = nn.Linear(query_size, attn_size, bias=False)  # matrix V: [200, 200]
        if feature_size > 0:
            self.wlinear = nn.Linear(feature_size, attn_size, bias=False)  # matrix W: [60, 200]
        else:
            self.wlinear = None
        self.tlinear = nn.Linear(attn_size, 1)
        self.init_weights()

    def init_weights(self):
        self.ulinear.weight.data.normal_(std=0.001)
        self.vlinear.weight.data.normal_(std=0.001)
        if self.wlinear is not None:
            self.wlinear.weight.data.normal_(std=0.001)
        self.tlinear.weight.data.zero_()  # use zero to give uniform attention at the beginning,
        # i.e., every position starts out with the same attention weight

    def forward(self, x, x_mask, q, f):
        """
        x : batch_size * seq_len * input_size
        q : batch_size * query_size
        f : batch_size * seq_len * feature_size
        """
        batch_size, seq_len, _ = x.size()

        x_proj = self.ulinear(x.contiguous().view(-1, self.input_size)).view(
            batch_size, seq_len, self.attn_size)
        # x: [b,maxlen,200] -> [b*maxlen,200] -> [b*maxlen,200] -> [b,maxlen,200]; this computes Ux
        q_proj = self.vlinear(q.view(-1, self.query_size)).contiguous().view(
            batch_size, self.attn_size).unsqueeze(1).expand(
            batch_size, seq_len, self.attn_size)
        # q: [b,200] -> [b,200] -> [b,1,200] -> [b,maxlen,200]; this computes Vq
        # [b,1,200] -> [b,maxlen,200] broadcasts the sentence summary vector q so every token position can use it
        if self.wlinear is not None:
            f_proj = self.wlinear(f.view(-1, self.feature_size)).contiguous().view(
                batch_size, seq_len, self.attn_size)
            # f: [b,maxlen,60] -> [b*maxlen,60] -> [b*maxlen,200] -> [b,maxlen,200]; this computes Wf
            # [b*maxlen,60] -> [b*maxlen,200] flattens all tokens in the batch, multiplies by W, then reshapes back

            projs = [x_proj, q_proj, f_proj]
        else:
            projs = [x_proj, q_proj]
        scores = self.tlinear(torch.tanh(sum(projs)).view(-1, self.attn_size)).view(
            batch_size, seq_len)
        '''
        sum(projs) adds the three matrices element-wise, shape [b,maxlen,200], reshaped to [b*maxlen,200];
        after tanh it is multiplied by the matrix T (tlinear), giving [b*maxlen,1];
        the final scores have shape [b,maxlen]: one attention score per token position in each example.
        '''

        # mask padding
        scores.data.masked_fill_(x_mask.data, -float('inf'))  # set padded positions to -inf so they get no attention weight
        weights = F.softmax(scores, dim=1)  # weights.shape: [b,maxlen]
        # weighted average input vectors
        outputs = weights.unsqueeze(1).bmm(x).squeeze(1)
        # weights: [b,maxlen] -> [b,1,maxlen]
        # multiply the attention weights with the inputs x: [b,1,maxlen] x [b,maxlen,200] = [b,1,200] -> [b,200]
        # z = \sum_{i=1}^n a_i h_i : the weighted sum of the hidden states, recovering the formula above
        return outputs
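A quick shape check of the layer with dummy tensors, using the sizes from the comments above (hidden_dim = attn_dim = 200, feature_size = 2*pe_dim = 60); this is only a sketch and assumes the class definition and imports above:

b, maxlen = 4, 12
attn = PositionAwareAttention(input_size=200, query_size=200, feature_size=60, attn_size=200)

x = torch.randn(b, maxlen, 200)                  # LSTM outputs
q = torch.randn(b, 200)                          # summary vector h_n[-1]
f = torch.randn(b, maxlen, 60)                   # concatenated subject/object position embeddings
mask = torch.zeros(b, maxlen, dtype=torch.bool)
mask[:, 8:] = True                               # pretend the last 4 positions are padding

out = attn(x, mask, q, f)
print(out.shape)                                 # torch.Size([4, 200])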