[Code Walkthrough] Relation classification via multi-level attention CNNs
SemEval-2010 Task-8
Official documentation: https://docs.google.com/document/d/1QO_CnmvNRnYwNWu1-QCAeR5ToQYkXUqFeAJbdEhsq7w/preview
Dataset: https://github.com/CrazilyCode/SemEval2010-Task8
There are 8,000 sentences in the training set and 2,717 sentences in the test set.
train
- FULL_TRAIN.txt
- train.txt
- train_result.txt
- train_result_full.txt
e.g.
FULL_TRAIN.txt
73 "The fire inside WTC was caused by exploding fuel."
Cause-Effect(e2,e1)
Comment:
train.txt
73 The fire inside WTC was caused by exploding fuel
train_result.txt
73 Cause-Effect
train_result_full.txt
73 Cause-Effect(e2,e1)
test
- FULL_TEST.txt
- test.txt
- test_result.txt
- test_result_full.txt
scorer
Two tools for SemEval-2010 Task #8:
- official output file format checker: semeval2010_task8_format_checker.pl
- official scorer for SemEval-2010 Task #8: semeval2010_task8_scorer-v1.2.pl
Hyperparameter settings used in the paper's experiments:
 1 class DefaultConfig(object):
 2     DP = 25           # word position embedding size
 3     DC = 500          # conv. size (number of filters)
 4     N = 123           # sentence max length
 5     NP = 123          # number of relative distances
 6     NR = 19           # relation classes: 18 + 1
 7     KP = 0.6          # dropout
 8     K = 3             # word window size
 9     LR = 0.03         # learning rate
10     BATCH_SIZE = 32
11     epochs = 100
12     use_gpu = True
13
14 opt = DefaultConfig()
Data preparation:
 1 all_y = convert_to_one_hot([i for i in range(opt.NR)], opt.NR)
 2 #all_y = to_categorical([i for i in range(opt.NR)], opt.NR)  # convert the 19 relation classes into one-hot vectors
 3 all_y = torch.from_numpy(all_y.astype(np.float)).float().to(device)
 4 train_data = load_data('attention/train.txt', opt.NR)
 5 '''
 6 load_data returns sentences, relations, e1_pos, e2_pos:
 7 sentences: [[tokens of sentence 1], [tokens of sentence 2], ...]; relations is a numpy array of shape [num_sentences, 19]
 8 e1_pos: [(start_pos, end_pos), (start_pos, end_pos), ...]
 9 '''
10 eval_data = load_data('attention/test.txt', opt.NR)
11 word_dict = build_dict(train_data[0])  # word-to-id dict ordered by word frequency
12
13 train_dataloader = gen_dataloader(train_data, word_dict, opt)
14 eval_dataloader = gen_dataloader(eval_data, word_dict, opt)
15
16 embed_file = 'attention/embeddings.txt'
17 vac_file = 'attention/words.lst'
18 embedding = load_embedding(embed_file, vac_file, word_dict)  # embedding.shape = [num_words, dim]
Line 1 first converts the 19 relation classes into one-hot vectors:
 1 def convert_to_one_hot(labels, num_classes):
 2     # number of rows in the output matrix
 3     num_labels = len(labels)
 4     # all-zero matrix that will hold the one-hot encodings
 5     labels_one_hot = np.zeros((num_labels, num_classes))
 6     # position of each label in the flattened ("squashed") matrix
 7     index_offset = np.arange(num_labels) * num_classes
 8     # put a 1 at each label's position
 9     labels_one_hot.flat[index_offset + labels] = 1
10     return labels_one_hot
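A quick illustration of the input/output (not part of the original script):

>>> convert_to_one_hot([0, 2, 1], 3)
array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])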
Line 4 loads the data from the dataset file.
Detailed data format:
1 1 1 9 9 The child was carefully wrapped and bound into the cradle by means of a cord .
 1 def load_data(file, NR):
 2     sentences = []
 3     relations = []
 4     e1_pos = []
 5     e2_pos = []
 6
 7     with open(file, 'r', encoding='utf-8', errors='ignore') as f:
 8         for line in f.readlines():
 9             line = line.strip().lower().split()  # strip() removes leading/trailing whitespace, split() splits on whitespace and returns a list of tokens
10             relations.append(int(line[0]))
11             e1_pos.append((int(line[1]), int(line[2])))  # (start_pos, end_pos)
12             e2_pos.append((int(line[3]), int(line[4])))  # (start_pos, end_pos)
13             sentences.append(line[5:])
14     #relations = to_categorical(relations, NR)
15     relations = convert_to_one_hot(relations, NR)  # relations.shape = [num_lines, 19]
16     return sentences, relations, e1_pos, e2_pos
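Applied to the sample line above, this parsing logic yields (illustrative, traced by hand rather than actual program output):

sentences[0]  # ['the', 'child', 'was', 'carefully', 'wrapped', 'and', 'bound', 'into', 'the', 'cradle', 'by', 'means', 'of', 'a', 'cord', '.']
relations[0]  # one-hot row of length 19 with a 1 at index 1
e1_pos[0]     # (1, 1) -> token 'child'
e2_pos[0]     # (9, 9) -> token 'cradle'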
Line 11 builds a dictionary of the words that appear in the training set, ordered by word frequency, with an id assigned to every word:
 1 def build_dict(sentences):
 2     word_count = Counter()
 3     for sent in sentences:
 4         for w in sent:
 5             word_count[w] += 1
 6
 7     ls = word_count.most_common()  # sorted by descending frequency, e.g. [('a', 5), ('b', 4), ('c', 3)]
 8     word_dict = {w[0]: index + 1 for (index, w) in enumerate(ls)}
 9     # leave 0 to PAD
10     return word_dict
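For example (ties keep insertion order on Python 3.7+; this snippet is only an illustration):

>>> build_dict([['the', 'fire', 'inside'], ['the', 'cradle']])
{'the': 1, 'fire': 2, 'inside': 3, 'cradle': 4}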
Lines 13 and 14 take the data from list -> numpy -> torch, pack it into a dataset.TensorDataset, and return it through DataLoader() so the model can be trained on batches:
 1 def gen_dataloader(data, word_dict, arg):
 2     tp = vectorize(data, word_dict, arg.N)
 3     x, y, e1, e2, e1d2, e2d1, zd, d1, d2 = tp
 4     '''
 5     sents_vec, relations, e1_vec, e2_vec, e1d2, e2d1, zd, d1, d2
 6     x = sents_vec = [[1,2,3,...],[3,4,6,...],...]
 7     y = relations, a numpy array
 8     e1 = e1_vec = [3,5,7,8,...]  word id of the last token of e1 in each sentence
 9     e1d2 = [0,67,122,...]  relative distance between e1 and e2 in each sentence
10     zd = [0,0,0,...]  a list of len(sentences) zeros
11     d1 = [[...],[...],...]  per-sentence relative distance of every word to e1, mapped into [0, 123) by pos()
12     '''
13     y_t = torch.LongTensor(np.array(y).astype(np.int64))  # y_t = [len(sentences), 19] = [8000, 19]
14     zd = np.array(zd).reshape(-1, 1)
15     e1, e1d2, d1 = np.array(e1).reshape(-1, 1), np.array(e1d2).reshape(-1, 1), np.array(d1)
16     e2, e2d1, d2 = np.array(e2).reshape(-1, 1), np.array(e2d1).reshape(-1, 1), np.array(d2)
17     np_cat = np.concatenate((x, e1, e1d2, e2, e2d1, zd, d1, d2), axis=1)
18     # concatenate along the column axis, shape: [len(sentences), maxlen+1+1+1+1+1+123+123] = [8000, 374]
19     d_t = torch.from_numpy(np_cat.astype(np.int64))
20     ds = dataset.TensorDataset(d_t, y_t)  # d_t = [8000, 374], y_t = [8000, 19]
21     return DataLoader(ds, arg.BATCH_SIZE, shuffle=True)
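For reference, the 374 columns of each row of d_t are laid out like this (an illustrative check only; the split points match data_unpack() further below):

N = 123
# columns [0, N)           -> x     padded word ids of the sentence
# column   N               -> e1    word id of the last token of e1
# column   N + 1           -> e1d2  pos-mapped distance from e1 to e2
# column   N + 2           -> e2    word id of the last token of e2
# column   N + 3           -> e2d1  pos-mapped distance from e2 to e1
# column   N + 4           -> zd    always 0
# columns [N + 5, 2N + 5)  -> d1    pos-mapped distances of every word to e1
# columns [2N + 5, 3N + 5) -> d2    pos-mapped distances of every word to e2
assert 3 * N + 5 == 374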
Inside this function, line 2 vectorizes the data:
 1 def pos(x):
 2     '''
 3     Map a relative distance into [0, 123).
 4     If e1 and e2 are too far apart, return 0 or 122:
 5     by default, a distance beyond 60 is clipped to an extreme value, meaning the word is barely related to e1.
 6     '''
 7     if x < -60:
 8         return 0
 9     if 60 >= x >= -60:
10         return x + 61
11     if x > 60:
12         return 122
13
14
15 def vectorize(data, word_dict, max_len):
16     sentences, relations, e1_pos, e2_pos = data
17     # replace each word with its word id
18     d1, d2, e1d2, e2d1 = [], [], [], []
19     e1_vec, e2_vec = [], []
20     num_data = len(sentences)
21     zd = [0 for _ in range(num_data)]
22     sents_vec = np.zeros((num_data, max_len), dtype=int)
23     logging.debug('data shape: (%d, %d)' % (num_data, max_len))
24
25     for idx, (sent, pos1, pos2) in enumerate(zip(sentences, e1_pos, e2_pos)):
26         vec = [word_dict[w] if w in word_dict else 0 for w in sent]
27         sents_vec[idx, :len(vec)] = vec  # sents_vec.shape: [num_sentences, max_len]
28         # last word of e1 and e2
29         e1_vec.append(vec[pos1[1]])
30         e2_vec.append(vec[pos2[1]])  # only the last word of each entity is recorded; does this lose part of the entity information?
31
32     # compute relative distances
33     for sent, p1, p2 in zip(sents_vec, e1_pos, e2_pos):
34         # current word position - position of the last word of e1 or e2
35         e1d2.append(pos(p1[1] - p2[1]))  # relative distance between e1 and e2
36         e2d1.append(pos(p2[1] - p1[1]))
37         d1.append([pos(p1[1] - idx) for idx, _ in enumerate(sent)])  # relative distance of every word to e1
38         d2.append([pos(p2[1] - idx) for idx, _ in enumerate(sent)])
39
40     return sents_vec, relations, e1_vec, e2_vec, e1d2, e2d1, zd, d1, d2
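A few sample values of pos(), just to make the mapping concrete (not part of the original code):

pos(-100)  # -> 0    clipped: far to the left of the entity
pos(-60)   # -> 1
pos(0)     # -> 61   the entity position itself
pos(60)    # -> 121
pos(100)   # -> 122  clipped: far to the right of the entity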
Next, line 18 embeds the words that appear in the training set using pre-trained word2vec vectors:
 1 def load_embedding(emb_file, emb_vocab, word_dict):
 2     vocab = {}
 3     with open(emb_vocab, 'r') as f:
 4         for id, w in enumerate(f.readlines()):
 5             w = w.strip().lower()
 6             vocab[w] = id
 7
 8     f = open(emb_file, 'r')
 9     embed = f.readlines()
10
11     dim = len(embed[0].split())
12     num_words = len(word_dict) + 1
13     embeddings = np.random.uniform(-0.01, 0.01, size=(num_words, dim))
14
15     pre_trained = 0
16     for w in vocab.keys():
17         if w in word_dict:
18             embeddings[word_dict[w]] = [float(x) for x in embed[vocab[w]].split()]
19             pre_trained += 1
20     embeddings[0] = np.zeros(dim)  # index 0 is the padding token, set to all zeros
21
22     logging.info(
23         'embeddings: %.2f%%(pre_trained) unknown: %d' % (pre_trained / num_words * 100, num_words - pre_trained))
24
25     f.close()
26     return embeddings.astype(np.float32)
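A quick sanity check of the returned matrix (illustrative only, assuming the embedding files above are present):

embedding = load_embedding(embed_file, vac_file, word_dict)
num_words, dim = embedding.shape
assert num_words == len(word_dict) + 1   # one row per word id, plus row 0 reserved for PAD
assert (embedding[0] == 0).all()         # the PAD row is all zeros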
Finally, the model is trained:
 1 model = pa.ACNN(opt, embedding).to(device)
 2 optimizer = torch.optim.Adam(model.parameters(), lr=opt.LR, weight_decay=0.0001)  # optimize all model parameters
 3 loss_func = pa.DistanceLoss(opt.NR)
 4
 5 for i in range(opt.epochs):
 6     acc, loss = model_run(opt, train_dataloader, loss_func, model, all_y, optimizer)
 7     val_acc, val_loss = model_run(opt, eval_dataloader, loss_func, model, all_y)
 8     print('epoch: %d, t_l: %.2f, t_a: %.2f%%, v_l: %.2f, v_a: %.2f%%' % (i, loss, acc, val_loss, val_acc))
Line 1 initializes the ACNN model:
 1 class ACNN(nn.Module):
 2     def __init__(self, opt, embedding):
 3         super(ACNN, self).__init__()
 4         self.opt = opt
 5         self.dw = embedding.shape[1]
 6         self.vac_len = embedding.shape[0]
 7         self.d = self.dw + 2 * self.opt.DP
 8         self.p = (self.opt.K - 1) // 2  # padding
 9         self.x_embedding = nn.Embedding(self.vac_len, self.dw)
10         self.x_embedding.weight = nn.Parameter(torch.from_numpy(embedding))
11         # self.e1_embedding = nn.Embedding(self.vac_len, self.dw)
12         # self.e1_embedding.weight = nn.Parameter(torch.from_numpy(embedding))
13         # self.e2_embedding = nn.Embedding(self.vac_len, self.dw)
14         # self.e2_embedding.weight = nn.Parameter(torch.from_numpy(embedding))
15         self.dist_embedding = nn.Embedding(self.opt.NP, self.opt.DP)
16         self.rel_weight = nn.Parameter(torch.randn(self.opt.NR, self.opt.DC))
17         self.dropout = nn.Dropout(self.opt.KP)
18         self.conv = nn.Conv2d(1, self.opt.DC, (self.opt.K, self.d), (1, self.d), (self.p, 0), bias=True)
19         self.U = nn.Parameter(torch.randn(self.opt.DC, self.opt.NR))
20         self.max_pool = nn.MaxPool1d(self.opt.N, stride=1)
21
22     def input_attention(self, input_tuple, is_training=True):
23         x, e1, e1d2, e2, e2d1, zd, d1, d2 = input_tuple
24         x_emb = self.x_embedding(x)  # (bs, len(sentence) = n, dw)
25         e1_emb = self.x_embedding(e1)  # (bs, 1, dw)
26         e2_emb = self.x_embedding(e2)
27         # zd_emb = self.dist_embedding(zd)
28         # e1d2_emb = self.dist_embedding(e1d2)
29         # e2d1_emb = self.dist_embedding(e2d1)
30         dist1_emb = self.dist_embedding(d1)  # (bs, len(sentence) = n, dp)
31         dist2_emb = self.dist_embedding(d2)
32         x_cat = torch.cat((x_emb, dist1_emb, dist2_emb), 2)  # (bs, n, dw+2dp)
33         # e1_cat = torch.cat((e1_emb, zd_emb, e1d2_emb), 2)
34         # e2_cat = torch.cat((e2_emb, e2d1_emb, zd_emb), 2)
35         # if is_training:
36         #     x_cat = self.dropout(x_cat)
37         ine1_aw = F.softmax(torch.bmm(x_emb, e1_emb.transpose(2, 1)), 1)  # (bs, n, 1)
38         '''
39         attention: each of the n word positions is scored against e1_emb, then softmax is applied over the sentence
40         '''
41         ine2_aw = F.softmax(torch.bmm(x_emb, e2_emb.transpose(2, 1)), 1)
42         # ine1_aw = F.softmax(torch.bmm(x_cat, e1_cat.transpose(2, 1)), 1)  # (bs, n, 1)
43         # ine2_aw = F.softmax(torch.bmm(x_cat, e2_cat.transpose(2, 1)), 1)
44         in_aw = (ine1_aw + ine2_aw) / 2
45         R = torch.mul(x_cat, in_aw)
46         '''
47         torch.mul(a, b) multiplies a and b element-wise, with broadcasting:
48         (bs,n,dw+2dp) * (bs,n,1) -> (bs,n,1) is broadcast to (bs,n,dw+2dp) -> result (bs,n,dw+2dp)
49         i.e. each word's embedding row (dw+2dp) is scaled by its attention weight
50         '''
51         return R
52
53     # def attentive_pooling(self, R_star, all_y):
54     #     rel_emb = torch.mm(all_y, self.rel_weight)  # (NR, NR) * (NR, DC)
55     #     RU = torch.matmul(R_star.transpose(2, 1), self.U)  # (bs, n, nr)
56     #     G = torch.matmul(RU, rel_emb)  # (bs, n, dc)
57     #     AP = F.softmax(G, dim=1)
58     #     RA = torch.mul(R_star, AP.transpose(2, 1))
59     #     wo = self.max_pool(RA).squeeze(-1)
60     #     return wo, self.rel_weight
61
62     def attentive_pooling(self, R_star):
63         RU = torch.matmul(R_star.transpose(2, 1), self.U)  # (bs, n, nr)
64         '''
65         R_star.transpose(2, 1): (bs, n, 500)
66         (bs, n, 500) * (500, 19) = (bs, n, 19)
67         '''
68         G = torch.matmul(RU, self.rel_weight)  # (bs, n, dc)
69         '''
70         (bs, n, 19) * (19, 500) -> (bs, n, 500)
71         '''
72         AP = F.softmax(G, dim=1)  # softmax over the n word positions in each feature dimension, matching the attentive-pooling formula in the paper
73         RA = torch.mul(R_star, AP.transpose(2, 1))
74         # (bs, 500, n) * (bs, 500, n) -> element-wise product (bs, 500, n)
75         wo = self.max_pool(RA).squeeze(-1)  # (bs, 500, n) -> (bs, 500, 1) -> (bs, 500): keep the most representative position for each of the 500 feature dimensions
76         return wo
77
78     def forward(self, input_tuple, is_training=True):
79         R = self.input_attention(input_tuple, is_training)
80         R_star = self.conv(R.unsqueeze(1)).squeeze(-1)  # (bs, dc, n)
81         '''
82         this corresponds to the sliding-window convolution that produces the z_i vectors in the paper
83         R.unsqueeze(1): (bs, 1, n, dw+2dp)
84         self.conv(R.unsqueeze(1)) gives (bs, 500, n, 1) after the convolution
85         squeeze(-1): (bs, 500, n)
86         each word's context is now represented by a 500-filter vector for every example in the batch
87         '''
88         R_star = torch.tanh(R_star)
89         wo = self.attentive_pooling(R_star)
90         return wo, self.rel_weight
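To sanity-check the shapes noted in the comments, a minimal forward pass on random inputs can be run as follows (a sketch that assumes the ACNN class above, its torch imports, and the opt config are in scope; in the repo ACNN lives in the module imported as pa, and the embedding matrix here is just random):

import numpy as np
import torch

bs, vocab_size, dw = 4, 100, 50
embedding = np.random.uniform(-0.01, 0.01, size=(vocab_size, dw)).astype(np.float32)
model = ACNN(opt, embedding)

# random inputs with the shapes produced by gen_dataloader / data_unpack
x    = torch.randint(0, vocab_size, (bs, opt.N))   # padded word ids
e1   = torch.randint(0, vocab_size, (bs, 1))       # word id of the last token of e1
e2   = torch.randint(0, vocab_size, (bs, 1))
e1d2 = torch.randint(0, opt.NP, (bs, 1))           # pos-mapped distances
e2d1 = torch.randint(0, opt.NP, (bs, 1))
zd   = torch.zeros(bs, 1, dtype=torch.long)
d1   = torch.randint(0, opt.NP, (bs, opt.N))
d2   = torch.randint(0, opt.NP, (bs, opt.N))

wo, rel_weight = model((x, e1, e1d2, e2, e2d1, zd, d1, d2))
print(wo.shape, rel_weight.shape)  # torch.Size([4, 500]) torch.Size([19, 500])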
Line 3 sets up a custom loss:
 1 class DistanceLoss(nn.Module):
 2     def __init__(self, nr, margin=1):
 3         super(DistanceLoss, self).__init__()
 4         self.nr = nr
 5         self.margin = margin
 6
 7     def forward(self, wo, rel_weight, in_y, all_y):
 8         '''
 9         :param wo: [bs, 500]
10         :param rel_weight: [19, 500]
11         :param in_y: [bs, 19]
12         :param all_y: [19, 19]
13         :return:
14         '''
15         wo_norm = F.normalize(wo)  # (bs, dc); in_y is (bs, nr)
16         wo_norm_tile = wo_norm.unsqueeze(1).repeat(1, in_y.size()[-1], 1)  # (bs, nr, dc)
17         rel_emb = torch.mm(in_y, rel_weight)  # (bs, dc)
18         ay_emb = torch.mm(all_y, rel_weight)  # (nr, dc)
19         gt_dist = torch.norm(wo_norm - rel_emb, 2, 1)  # (bs,)  distance between wo and the embedding of the true label
20         all_dist = torch.norm(wo_norm_tile - ay_emb, 2, 2)  # (bs, nr)  distance between wo and every label embedding
21         masking_y = torch.mul(in_y, 10000)  # (bs, 19)
22         _t_dist = torch.min(torch.add(all_dist, masking_y), 1)[0]  # add a large value at the true label's position to mask it out; shape: [bs]
23         # Note: this picks the best-scoring wrong class. Among all incorrect labels, the one with the smallest distance is the closest to the prediction, i.e. the strongest competitor.
24         loss = torch.mean(self.margin + gt_dist - _t_dist)
25         return loss
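Written out as a formula, this forward pass computes the margin-based distance loss (only the network output is normalized; $W^{L}$ is rel_weight and $W^{L}_{y}$ is its row for the gold class $y$):

\[
L = \frac{1}{|B|} \sum_{(S,\,y)\in B} \Big[\, m + \Big\lVert \tfrac{w^{O}}{\lVert w^{O}\rVert} - W^{L}_{y} \Big\rVert_{2} - \min_{\bar{y}\neq y} \Big\lVert \tfrac{w^{O}}{\lVert w^{O}\rVert} - W^{L}_{\bar{y}} \Big\rVert_{2} \Big], \qquad m = 1,
\]

where the minimum over the wrong classes $\bar{y} \neq y$ is what the +10000 mask on the gold position implements.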
Lines 6 and 7 run training and evaluation, respectively:
 1 def model_run(opt, dataloader, loss_func, model, all_y, optimizer=None):
 2     '''all_y holds the one-hot vectors of the 19 relation classes'''
 3     acc, loss = 0, 0
 4     for i, (bx_cat, by) in enumerate(dataloader):  # d_t = [bs, 374], y_t = [bs, 19]
 5         by = by.float().to(device)
 6         bin_tup = data_unpack(bx_cat, opt.N)  # returns x, e1, e1d2, e2, e2d1, zd, d1, d2
 7         # wo, rel_weight = model(bin_tup, all_y)
 8         wo, rel_weight = model(bin_tup)  # wo = [bs, 500], self.rel_weight = [19, 500]
 9         a = prediction(wo, rel_weight, by, all_y)  # returns the batch accuracy
10         l = loss_func(wo, rel_weight, by, all_y)
11         # a = prediction(wo, rel_weight, by)
12         # l = loss_func(wo, rel_weight, by)
13         # print('%.2f%%, %.2f' % (a.cpu().data.numpy() * 100, l.detach().cpu().numpy()))
14         acc += a.cpu().data.numpy() * 100
15         loss += l.detach().cpu().numpy()
16         if optimizer is not None:
17             l.backward(), optimizer.step(), optimizer.zero_grad()
18     return acc / i, loss / i
19
20 def data_unpack(cat_data, N):
21     list_x = np.split(cat_data.numpy(), [N, N + 1, N + 2, N + 3, N + 4, N + 5, 2 * N + 5], 1)
22     x = torch.from_numpy(list_x[0]).to(device)
23     e1 = torch.from_numpy(list_x[1]).to(device)
24     e1d2 = torch.from_numpy(list_x[2]).to(device)
25     e2 = torch.from_numpy(list_x[3]).to(device)
26     e2d1 = torch.from_numpy(list_x[4]).to(device)
27     zd = torch.from_numpy(list_x[5]).to(device)
28     d1 = torch.from_numpy(list_x[6]).to(device)
29     d2 = torch.from_numpy(list_x[7]).to(device)
30     return x, e1, e1d2, e2, e2d1, zd, d1, d2
31
32 def prediction(wo, rel_weight, y, all_y):
33     '''
34     :param wo: [bs, 500]
35     :param rel_weight: [19, 500]
36     :param y: [bs, 19]
37     :param all_y: [19, 19]
38     :return:
39     '''
40     wo_norm = F.normalize(wo)  # default: dim=1, (bs, dc)
41     wo_norm_tile = wo_norm.unsqueeze(1).repeat(1, all_y.size()[0], 1)  # (bs, nr, dc)
42     # (bs, 500) -> (bs, 1, 500) -> (bs, 19, 500)
43     ay_emb = torch.mm(all_y, rel_weight)  # (nr, dc); this step is a bit subtle: it corresponds to W_y^L in the paper's formula
44     # (19, 19) * (19, 500) -> (19, 500)
45     dist = torch.norm(wo_norm_tile - ay_emb, 2, 2)  # (bs, nr); torch.norm computes the L2 norm
46     predict = torch.min(dist, 1)[1].long()  # index of the smallest distance in each row, i.e. argmin over the nr classes of delta_theta(S, y); shape: [bs]
47     y = torch.max(y, 1)[1]  # indices of the true classes; shape: [bs]
48     correct = torch.eq(predict, y)  # shape: [bs]
49     return correct.sum().float() / float(correct.data.size()[0])
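The prediction rule implemented by prediction() is simply the nearest relation embedding in the normalized output space:

\[
\hat{y} = \arg\min_{y} \Big\lVert \tfrac{w^{O}}{\lVert w^{O}\rVert} - W^{L}_{y} \Big\rVert_{2}
\]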
I ran the experiment and the results were far worse than expected; my guess is that the problem is my convert_to_one_hot label handling, which is not updated through gradient descent. As for l.detach().cpu().numpy() on line 15 of model_run(): detaching there only blocks backpropagation in order to read out the loss value, and does not affect the gradient-descent parameter updates.
References:
torch.nn.functional.normalize: https://blog.csdn.net/ECNU_LZJ/article/details/103653133?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromBaidu-1.not_use_machine_learn_pai&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromBaidu-1.not_use_machine_learn_pai
Understanding dim = 0: https://mathpretty.com/12065.html
torch.norm() usage: https://blog.csdn.net/qq_36556893/article/details/90698186