[Code Walkthrough] Position-aware Attention and Supervised Data Improve Slot Filling
Because this paper builds a simple relation extraction model from an LSTM plus attention, we focus only on the model code here; the remaining data processing and training/testing code is similar to that covered in https://www.cnblogs.com/Harukaze/p/14253446.html and is not discussed further.
The underlying LSTM model and the embedding setup:
  1 class RelationModel(object):
  2     """ A wrapper class for the training and evaluation of models.
  3     """
  4     def __init__(self, opt, emb_matrix=None):
  5         self.opt = opt
  6         self.model = PositionAwareRNN(opt, emb_matrix)
  7         self.criterion = nn.CrossEntropyLoss()
  8         self.parameters = [p for p in self.model.parameters() if p.requires_grad]
  9         if opt['cuda']:
 10             self.model.cuda()
 11             self.criterion.cuda()
 12         self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
 13 
 14     def update(self, batch):
 15         """ Run a step of forward and backward model update. """
 16         if self.opt['cuda']:
 17             inputs = [b.cuda() for b in batch[:7]]
 18             labels = batch[7].cuda()
 19         else:
 20             inputs = [b for b in batch[:7]]
 21             labels = batch[7]
 22 
 23         # step forward
 24         self.model.train()
 25         self.optimizer.zero_grad()
 26         logits, _ = self.model(inputs)         # logits.shape = [b, 42]
 27         loss = self.criterion(logits, labels)  # labels.shape = [b]
 28 
 29         # backward
 30         loss.backward()
 31         torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.opt['max_grad_norm'])
 32         self.optimizer.step()
 33         loss_val = loss.data.item()
 34         return loss_val
 35 
 36     def predict(self, batch, unsort=True):
 37         """ Run forward prediction. If unsort is True, recover the original order of the batch. """
 38         if self.opt['cuda']:
 39             inputs = [b.cuda() for b in batch[:7]]
 40             labels = batch[7].cuda()
 41         else:
 42             inputs = [b for b in batch[:7]]
 43             labels = batch[7]
 44 
 45         orig_idx = batch[8]
 46 
 47         # forward
 48         self.model.eval()
 49         logits, _ = self.model(inputs)
 50         loss = self.criterion(logits, labels)  # logits.shape = [b, 42], labels.shape = [b]
 51         probs = F.softmax(logits, dim=1).data.cpu().numpy().tolist()
 52         predictions = np.argmax(logits.data.cpu().numpy(), axis=1).tolist()  # len(predictions) = b
 53         if unsort:
 54             _, predictions, probs = [list(t) for t in zip(*sorted(zip(orig_idx,
 55                     predictions, probs)))]
 56         return predictions, probs, loss.data.item()
 57 
 58     def update_lr(self, new_lr):
 59         torch_utils.change_lr(self.optimizer, new_lr)
 60 
 61     def save(self, filename, epoch):
 62         params = {
 63             'model': self.model.state_dict(),
 64             'config': self.opt,
 65             'epoch': epoch
 66         }
 67         try:
 68             torch.save(params, filename)
 69             print("model saved to {}".format(filename))
 70         except BaseException:
 71             print("[Warning: Saving failed... continuing anyway.]")
 72 
 73     def load(self, filename):
 74         try:
 75             checkpoint = torch.load(filename)
 76         except BaseException:
 77             print("Cannot load model from {}".format(filename))
 78             exit()
 79         self.model.load_state_dict(checkpoint['model'])
 80         self.opt = checkpoint['config']
 81 
 82 class PositionAwareRNN(nn.Module):
 83     """ A sequence model for relation extraction. """
 84 
 85     def __init__(self, opt, emb_matrix=None):
 86         super(PositionAwareRNN, self).__init__()
 87         self.drop = nn.Dropout(opt['dropout'])
 88         self.emb = nn.Embedding(opt['vocab_size'], opt['emb_dim'], padding_idx=constant.PAD_ID)
 89         if opt['pos_dim'] > 0:
 90             self.pos_emb = nn.Embedding(len(constant.POS_TO_ID), opt['pos_dim'],
 91                     padding_idx=constant.PAD_ID)
 92         if opt['ner_dim'] > 0:
 93             self.ner_emb = nn.Embedding(len(constant.NER_TO_ID), opt['ner_dim'],
 94                     padding_idx=constant.PAD_ID)
 95 
 96         input_size = opt['emb_dim'] + opt['pos_dim'] + opt['ner_dim']
 97         self.rnn = nn.LSTM(input_size, opt['hidden_dim'], opt['num_layers'], batch_first=True,
 98                 dropout=opt['dropout'])
 99         self.linear = nn.Linear(opt['hidden_dim'], opt['num_class'])
100 
101         if opt['attn']:
102             self.attn_layer = layers.PositionAwareAttention(opt['hidden_dim'],
103                     opt['hidden_dim'], 2*opt['pe_dim'], opt['attn_dim'])
104             self.pe_emb = nn.Embedding(constant.MAX_LEN * 2 + 1, opt['pe_dim'])
105             # pe_dim = position embedding dimension = 30
106 
107         self.opt = opt
108         self.topn = self.opt.get('topn', 1e10)
109         self.use_cuda = opt['cuda']
110         self.emb_matrix = emb_matrix
111         self.init_weights()
112 
113     def init_weights(self):
114         if self.emb_matrix is None:
115             self.emb.weight.data[1:,:].uniform_(-1.0, 1.0)  # keep padding dimension to be 0
116         else:
117             self.emb_matrix = torch.from_numpy(self.emb_matrix)
118             self.emb.weight.data.copy_(self.emb_matrix)
119         if self.opt['pos_dim'] > 0:
120             self.pos_emb.weight.data[1:,:].uniform_(-1.0, 1.0)
121         if self.opt['ner_dim'] > 0:
122             self.ner_emb.weight.data[1:,:].uniform_(-1.0, 1.0)
123 
124         self.linear.bias.data.fill_(0)
125         init.xavier_uniform_(self.linear.weight, gain=1)  # initialize linear layer
126         if self.opt['attn']:
127             self.pe_emb.weight.data.uniform_(-1.0, 1.0)
128 
129         # decide finetuning
130         if self.topn <= 0:
131             print("Do not finetune word embedding layer.")
132             self.emb.weight.requires_grad = False
133         elif self.topn < self.opt['vocab_size']:
134             print("Finetune top {} word embeddings.".format(self.topn))
135             self.emb.weight.register_hook(lambda x:
136                     torch_utils.keep_partial_grad(x, self.topn))
137         else:
138             print("Finetune all embeddings.")
139 
140     def zero_state(self, batch_size):
141         state_shape = (self.opt['num_layers'], batch_size, self.opt['hidden_dim'])
142         h0 = c0 = torch.zeros(*state_shape, requires_grad=False)
143         if self.use_cuda:
144             return h0.cuda(), c0.cuda()
145         else:
146             return h0, c0
147 
148     def forward(self, inputs):
149         words, masks, pos, ner, deprel, subj_pos, obj_pos = inputs  # unpack
150         seq_lens = list(masks.data.eq(constant.PAD_ID).long().sum(1).squeeze())  # masks are 1 at PAD positions, so eq(0) counts the real tokens
151         batch_size = words.size()[0]
152 
153         # embedding lookup
154         word_inputs = self.emb(words)  # words: [b, maxlen] -> word_inputs: [b, maxlen, 300]
155         inputs = [word_inputs]
156         if self.opt['pos_dim'] > 0:
157             inputs += [self.pos_emb(pos)]
158         if self.opt['ner_dim'] > 0:
159             inputs += [self.ner_emb(ner)]
160         inputs = self.drop(torch.cat(inputs, dim=2))  # add dropout to input, [b, maxlen, 300+30+30]
161         input_size = inputs.size(2)
162 
163         # rnn
164         h0, c0 = self.zero_state(batch_size)
165         inputs = nn.utils.rnn.pack_padded_sequence(inputs, seq_lens, batch_first=True)
166         outputs, (ht, ct) = self.rnn(inputs, (h0, c0))  # outputs (once unpacked below): [b, maxlen, 200]
167         outputs, output_lens = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
168         hidden = self.drop(ht[-1,:,:])  # get the outmost layer h_n, i.e. the last layer's final state, then apply dropout
169         outputs = self.drop(outputs)
170 
171         # attention
172         if self.opt['attn']:
173             # convert all negative PE numbers to positive indices
174             # e.g., -2 -1 0 1 will be mapped to 98 99 100 101
175             subj_pe_inputs = self.pe_emb(subj_pos + constant.MAX_LEN)
176             obj_pe_inputs = self.pe_emb(obj_pos + constant.MAX_LEN)  # many implementations shift such position features to non-negative indices like this
177             pe_features = torch.cat((subj_pe_inputs, obj_pe_inputs), dim=2)  # pe_features.shape = [b, maxlen, 30+30]
178             final_hidden = self.attn_layer(outputs, masks, hidden, pe_features)
179         else:
180             final_hidden = hidden  # final_hidden.shape = [b, 200]
181 
182         logits = self.linear(final_hidden)  # [b, 200] -> [b, 42]
183         return logits, final_hidden
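Lines 133-136 fine-tune only the embeddings of the top-n most frequent words through a gradient hook; torch_utils.keep_partial_grad itself is not shown in this post. A minimal sketch of such a hook, assuming it simply zeroes the gradient rows beyond the first topn entries, could look like this:

import torch
import torch.nn as nn

def keep_partial_grad(grad, topk):
    # zero the gradient rows beyond the first `topk` embeddings so that
    # only the most frequent words are fine-tuned (assumed behaviour, not the repo's exact code)
    grad.data[topk:].zero_()
    return grad

emb = nn.Embedding(10, 4)                                    # toy vocabulary of 10 words
emb.weight.register_hook(lambda g: keep_partial_grad(g, 3))
emb(torch.arange(10)).sum().backward()
print(emb.weight.grad[3:].abs().sum())                       # tensor(0.): rows 3..9 receive no gradient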
Line 178, final_hidden = self.attn_layer(outputs, masks, hidden, pe_features), shows that the outputs of the two-layer unidirectional LSTM, the padding mask matrix, the LSTM's final hidden state $h_n[-1]$, and the processed position embeddings pe_features are all fed into the attention layer.
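To make the relative-position lookup at lines 173-177 concrete, here is a small sketch; MAX_LEN = 100 is assumed from the "-2 -1 0 1 -> 98 99 100 101" comment, and pe_dim = 30 comes from the shape notes above:

import torch
import torch.nn as nn

MAX_LEN = 100                                 # assumed value of constant.MAX_LEN
pe_emb = nn.Embedding(MAX_LEN * 2 + 1, 30)    # 201 possible relative positions, pe_dim = 30

subj_pos = torch.tensor([[-2, -1, 0, 1, 2]])  # token offsets relative to the subject entity
print(subj_pos + MAX_LEN)                     # tensor([[ 98,  99, 100, 101, 102]])
print(pe_emb(subj_pos + MAX_LEN).shape)       # torch.Size([1, 5, 30])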
A quick example of Python's built-in sum() applied to a list of tensors, to help understand the sum() operation in the attention layer below:

import torch

a = [torch.Tensor([1, 2, 3]), torch.Tensor([1, 2, 3]), torch.Tensor([1, 2, 3])]
print(sum(a))  # tensor([3., 6., 9.])
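And one more small helper example: masking scores with -inf before softmax, which is how padded positions end up with zero attention weight in the layer below:

import torch
import torch.nn.functional as F

scores = torch.tensor([[1.0, 2.0, 3.0, 0.0]])
mask = torch.tensor([[False, False, False, True]])  # the last position is padding
scores = scores.masked_fill(mask, -float('inf'))
print(F.softmax(scores, dim=1))                     # tensor([[0.0900, 0.2447, 0.6652, 0.0000]])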
import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionAwareAttention(nn.Module):
    """
    A position-augmented attention layer where the attention weight is
    a = T' . tanh(Ux + Vq + Wf)
    where x is the input, q is the query, and f is additional position features.
    """

    def __init__(self, input_size, query_size, feature_size, attn_size):
        super(PositionAwareAttention, self).__init__()
        self.input_size = input_size      # input_size = hidden_dim = 200
        self.query_size = query_size      # query_size = hidden_dim = 200
        self.feature_size = feature_size  # feature_size = 2*30 = 60
        self.attn_size = attn_size        # attn_size = attn_dim = 200
        self.ulinear = nn.Linear(input_size, attn_size)              # matrix U: [200, 200]
        self.vlinear = nn.Linear(query_size, attn_size, bias=False)  # matrix V: [200, 200]
        if feature_size > 0:
            self.wlinear = nn.Linear(feature_size, attn_size, bias=False)  # matrix W: [60, 200]
        else:
            self.wlinear = None
        self.tlinear = nn.Linear(attn_size, 1)
        self.init_weights()

    def init_weights(self):
        self.ulinear.weight.data.normal_(std=0.001)
        self.vlinear.weight.data.normal_(std=0.001)
        if self.wlinear is not None:
            self.wlinear.weight.data.normal_(std=0.001)
        self.tlinear.weight.data.zero_()  # use zero to give uniform attention at the beginning,
        # i.e., every position starts out with the same attention weight

    def forward(self, x, x_mask, q, f):
        """
        x : batch_size * seq_len * input_size
        q : batch_size * query_size
        f : batch_size * seq_len * feature_size
        """
        batch_size, seq_len, _ = x.size()

        x_proj = self.ulinear(x.contiguous().view(-1, self.input_size)).view(
            batch_size, seq_len, self.attn_size)
        # x: [b,maxlen,200] -> [b*maxlen,200] -> [b*maxlen,200] -> [b,maxlen,200]; this computes Ux
        q_proj = self.vlinear(q.view(-1, self.query_size)).contiguous().view(
            batch_size, self.attn_size).unsqueeze(1).expand(
            batch_size, seq_len, self.attn_size)
        # q: [b,200] -> [b,200] -> [b,1,200] -> [b,maxlen,200]; this computes Vq
        # [b,1,200] -> [b,maxlen,200] broadcasts the sentence summary vector q so every token position can use it
        if self.wlinear is not None:
            f_proj = self.wlinear(f.view(-1, self.feature_size)).contiguous().view(
                batch_size, seq_len, self.attn_size)
            # f: [b,maxlen,60] -> [b*maxlen,60] -> [b*maxlen,200] -> [b,maxlen,200]; this computes Wf
            # [b*maxlen,60] -> [b*maxlen,200] flattens all tokens in the batch, multiplies by W, then reshapes back

            projs = [x_proj, q_proj, f_proj]
        else:
            projs = [x_proj, q_proj]
        scores = self.tlinear(torch.tanh(sum(projs)).view(-1, self.attn_size)).view(
            batch_size, seq_len)
        '''
        sum(projs) adds the three matrices element-wise, shape [b,maxlen,200], reshaped to [b*maxlen,200];
        after tanh it is multiplied by the matrix T (tlinear), giving [b*maxlen,1];
        the final scores have shape [b,maxlen]: one attention score per token position in each example.
        '''

        # mask padding
        scores.data.masked_fill_(x_mask.data, -float('inf'))  # set padded positions to -inf so they get no attention weight
        weights = F.softmax(scores, dim=1)  # weights.shape: [b,maxlen]
        # weighted average input vectors
        outputs = weights.unsqueeze(1).bmm(x).squeeze(1)
        # weights: [b,maxlen] -> [b,1,maxlen]
        # multiply the attention weights with the inputs x: [b,1,maxlen] x [b,maxlen,200] = [b,1,200] -> [b,200]
        # z = \sum_{i=1}^n a_i h_i : the weighted sum of the hidden states, recovering the formula above
        return outputs
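A quick shape check of the layer with dummy tensors, using the sizes from the comments above (hidden_dim = attn_dim = 200, feature_size = 2*pe_dim = 60); this is only a sketch and assumes the class definition and imports above:

b, maxlen = 4, 12
attn = PositionAwareAttention(input_size=200, query_size=200, feature_size=60, attn_size=200)

x = torch.randn(b, maxlen, 200)                  # LSTM outputs
q = torch.randn(b, 200)                          # summary vector h_n[-1]
f = torch.randn(b, maxlen, 60)                   # concatenated subject/object position embeddings
mask = torch.zeros(b, maxlen, dtype=torch.bool)
mask[:, 8:] = True                               # pretend the last 4 positions are padding

out = attn(x, mask, q, f)
print(out.shape)                                 # torch.Size([4, 200])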