[Code Walkthrough] Relation classification via multi-level attention CNNs
SemEval-2010 Task-8
Official documentation: https://docs.google.com/document/d/1QO_CnmvNRnYwNWu1-QCAeR5ToQYkXUqFeAJbdEhsq7w/preview
Dataset: https://github.com/CrazilyCode/SemEval2010-Task8
There are 8,000 sentences in the training set and 2,717 sentences in the test set.
train
- FULL_TRAIN.txt
- train.txt
- train_result.txt
- train_result_full.txt
e.g.
FULL_TRAIN.txt
73 "The fire inside WTC was caused by exploding fuel."
Cause-Effect(e2,e1)
Comment:
train.txt
73 The fire inside WTC was caused by exploding fuel
train_result.txt
73 Cause-Effect
train_result_full.txt
73 Cause-Effect(e2,e1)
test
- FULL_TEST.txt
- test.txt
- test_result.txt
- test_result_full.txt
scorer
Two tools for SemEval-2010 Task #8:
- official output file format checker: semeval2010_task8_format_checker.pl
- official scorer for SemEval-2010 Task #8: semeval2010_task8_scorer-v1.2.pl
Hyperparameter settings used in the paper's experiments:
 1 class DefaultConfig(object):
 2     DP = 25           # word position embedding size
 3     DC = 500          # conv. size (number of filters)
 4     N = 123           # sentence max length
 5     NP = 123          # number of relative distances
 6     NR = 19           # relation classes: 18 + 1
 7     KP = 0.6          # dropout
 8     K = 3             # word window size
 9     LR = 0.03         # learning rate
10     BATCH_SIZE = 32
11     epochs = 100
12     use_gpu = True
13
14 opt = DefaultConfig()
Data preparation:
 1 all_y = convert_to_one_hot([i for i in range(opt.NR)], opt.NR)
 2 #all_y = to_categorical([i for i in range(opt.NR)], opt.NR)  # convert the 19 relation classes into one-hot vectors
 3 all_y = torch.from_numpy(all_y.astype(np.float)).float().to(device)
 4 train_data = load_data('attention/train.txt', opt.NR)
 5 '''
 6 load_data returns sentences, relations, e1_pos, e2_pos:
 7 sentences: [[tokens of sentence 1], [tokens of sentence 2], ...]; relations is a numpy array of shape [num_sentences, 19]
 8 e1_pos: [(start_pos, end_pos), (start_pos, end_pos), ...]
 9 '''
10 eval_data = load_data('attention/test.txt', opt.NR)
11 word_dict = build_dict(train_data[0])  # word-to-id dict ordered by word frequency
12
13 train_dataloader = gen_dataloader(train_data, word_dict, opt)
14 eval_dataloader = gen_dataloader(eval_data, word_dict, opt)
15
16 embed_file = 'attention/embeddings.txt'
17 vac_file = 'attention/words.lst'
18 embedding = load_embedding(embed_file, vac_file, word_dict)  # embedding.shape = [num_words, dim]
Line 1 first converts the 19 relation classes into one-hot vectors:
 1 def convert_to_one_hot(labels, num_classes):
 2     # number of rows in the output matrix
 3     num_labels = len(labels)
 4     # all-zero matrix that will hold the one-hot encodings
 5     labels_one_hot = np.zeros((num_labels, num_classes))
 6     # position of each label in the flattened ("squashed") matrix
 7     index_offset = np.arange(num_labels) * num_classes
 8     # put a 1 at each label's position
 9     labels_one_hot.flat[index_offset + labels] = 1
10     return labels_one_hot
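A quick illustration of the input/output (not part of the original script):

>>> convert_to_one_hot([0, 2, 1], 3)
array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])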
Line 4 loads the data from the dataset file.
Detailed data format:
1 1 1 9 9 The child was carefully wrapped and bound into the cradle by means of a cord .
 1 def load_data(file, NR):
 2     sentences = []
 3     relations = []
 4     e1_pos = []
 5     e2_pos = []
 6
 7     with open(file, 'r', encoding='utf-8', errors='ignore') as f:
 8         for line in f.readlines():
 9             line = line.strip().lower().split()  # strip() removes leading/trailing whitespace, split() splits on whitespace and returns a list of tokens
10             relations.append(int(line[0]))
11             e1_pos.append((int(line[1]), int(line[2])))  # (start_pos, end_pos)
12             e2_pos.append((int(line[3]), int(line[4])))  # (start_pos, end_pos)
13             sentences.append(line[5:])
14     #relations = to_categorical(relations, NR)
15     relations = convert_to_one_hot(relations, NR)  # relations.shape = [num_lines, 19]
16     return sentences, relations, e1_pos, e2_pos
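Applied to the sample line above, this parsing logic yields (illustrative, traced by hand rather than actual program output):

sentences[0]  # ['the', 'child', 'was', 'carefully', 'wrapped', 'and', 'bound', 'into', 'the', 'cradle', 'by', 'means', 'of', 'a', 'cord', '.']
relations[0]  # one-hot row of length 19 with a 1 at index 1
e1_pos[0]     # (1, 1) -> token 'child'
e2_pos[0]     # (9, 9) -> token 'cradle'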
Line 11 builds a dictionary of the words that appear in the training set, ordered by word frequency, with an id assigned to every word:
 1 def build_dict(sentences):
 2     word_count = Counter()
 3     for sent in sentences:
 4         for w in sent:
 5             word_count[w] += 1
 6
 7     ls = word_count.most_common()  # sorted by descending frequency, e.g. [('a', 5), ('b', 4), ('c', 3)]
 8     word_dict = {w[0]: index + 1 for (index, w) in enumerate(ls)}
 9     # leave 0 to PAD
10     return word_dict
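For example (ties keep insertion order on Python 3.7+; this snippet is only an illustration):

>>> build_dict([['the', 'fire', 'inside'], ['the', 'cradle']])
{'the': 1, 'fire': 2, 'inside': 3, 'cradle': 4}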
Lines 13 and 14 take the data from list -> numpy -> torch, pack it into a dataset.TensorDataset, and return it through DataLoader() so the model can be trained on batches:
 1 def gen_dataloader(data, word_dict, arg):
 2     tp = vectorize(data, word_dict, arg.N)
 3     x, y, e1, e2, e1d2, e2d1, zd, d1, d2 = tp
 4     '''
 5     sents_vec, relations, e1_vec, e2_vec, e1d2, e2d1, zd, d1, d2
 6     x = sents_vec = [[1,2,3,...],[3,4,6,...],...]
 7     y = relations, a numpy array
 8     e1 = e1_vec = [3,5,7,8,...]  word id of the last token of e1 in each sentence
 9     e1d2 = [0,67,122,...]  relative distance between e1 and e2 in each sentence
10     zd = [0,0,0,...]  a list of len(sentences) zeros
11     d1 = [[...],[...],...]  per-sentence relative distance of every word to e1, mapped into [0, 123) by pos()
12     '''
13     y_t = torch.LongTensor(np.array(y).astype(np.int64))  # y_t = [len(sentences), 19] = [8000, 19]
14     zd = np.array(zd).reshape(-1, 1)
15     e1, e1d2, d1 = np.array(e1).reshape(-1, 1), np.array(e1d2).reshape(-1, 1), np.array(d1)
16     e2, e2d1, d2 = np.array(e2).reshape(-1, 1), np.array(e2d1).reshape(-1, 1), np.array(d2)
17     np_cat = np.concatenate((x, e1, e1d2, e2, e2d1, zd, d1, d2), axis=1)
18     # concatenate along the column axis, shape: [len(sentences), maxlen+1+1+1+1+1+123+123] = [8000, 374]
19     d_t = torch.from_numpy(np_cat.astype(np.int64))
20     ds = dataset.TensorDataset(d_t, y_t)  # d_t = [8000, 374], y_t = [8000, 19]
21     return DataLoader(ds, arg.BATCH_SIZE, shuffle=True)
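For reference, the 374 columns of each row of d_t are laid out like this (an illustrative check only; the split points match data_unpack() further below):

N = 123
# columns [0, N)           -> x     padded word ids of the sentence
# column   N               -> e1    word id of the last token of e1
# column   N + 1           -> e1d2  pos-mapped distance from e1 to e2
# column   N + 2           -> e2    word id of the last token of e2
# column   N + 3           -> e2d1  pos-mapped distance from e2 to e1
# column   N + 4           -> zd    always 0
# columns [N + 5, 2N + 5)  -> d1    pos-mapped distances of every word to e1
# columns [2N + 5, 3N + 5) -> d2    pos-mapped distances of every word to e2
assert 3 * N + 5 == 374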
Inside this function, line 2 vectorizes the data:
 1 def pos(x):
 2     '''
 3     Map a relative distance into [0, 123).
 4     If e1 and e2 are too far apart, return 0 or 122:
 5     by default, a distance beyond 60 is clipped to an extreme value, meaning the word is barely related to e1.
 6     '''
 7     if x < -60:
 8         return 0
 9     if 60 >= x >= -60:
10         return x + 61
11     if x > 60:
12         return 122
13
14
15 def vectorize(data, word_dict, max_len):
16     sentences, relations, e1_pos, e2_pos = data
17     # replace each word with its word id
18     d1, d2, e1d2, e2d1 = [], [], [], []
19     e1_vec, e2_vec = [], []
20     num_data = len(sentences)
21     zd = [0 for _ in range(num_data)]
22     sents_vec = np.zeros((num_data, max_len), dtype=int)
23     logging.debug('data shape: (%d, %d)' % (num_data, max_len))
24
25     for idx, (sent, pos1, pos2) in enumerate(zip(sentences, e1_pos, e2_pos)):
26         vec = [word_dict[w] if w in word_dict else 0 for w in sent]
27         sents_vec[idx, :len(vec)] = vec  # sents_vec.shape: [num_sentences, max_len]
28         # last word of e1 and e2
29         e1_vec.append(vec[pos1[1]])
30         e2_vec.append(vec[pos2[1]])  # only the last word of each entity is recorded; does this lose part of the entity information?
31
32     # compute relative distances
33     for sent, p1, p2 in zip(sents_vec, e1_pos, e2_pos):
34         # current word position - position of the last word of e1 or e2
35         e1d2.append(pos(p1[1] - p2[1]))  # relative distance between e1 and e2
36         e2d1.append(pos(p2[1] - p1[1]))
37         d1.append([pos(p1[1] - idx) for idx, _ in enumerate(sent)])  # relative distance of every word to e1
38         d2.append([pos(p2[1] - idx) for idx, _ in enumerate(sent)])
39
40     return sents_vec, relations, e1_vec, e2_vec, e1d2, e2d1, zd, d1, d2
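A few sample values of pos(), just to make the mapping concrete (not part of the original code):

pos(-100)  # -> 0    clipped: far to the left of the entity
pos(-60)   # -> 1
pos(0)     # -> 61   the entity position itself
pos(60)    # -> 121
pos(100)   # -> 122  clipped: far to the right of the entity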
Next, line 18 embeds the words that appear in the training set using pre-trained word2vec vectors:
 1 def load_embedding(emb_file, emb_vocab, word_dict):
 2     vocab = {}
 3     with open(emb_vocab, 'r') as f:
 4         for id, w in enumerate(f.readlines()):
 5             w = w.strip().lower()
 6             vocab[w] = id
 7
 8     f = open(emb_file, 'r')
 9     embed = f.readlines()
10
11     dim = len(embed[0].split())
12     num_words = len(word_dict) + 1
13     embeddings = np.random.uniform(-0.01, 0.01, size=(num_words, dim))
14
15     pre_trained = 0
16     for w in vocab.keys():
17         if w in word_dict:
18             embeddings[word_dict[w]] = [float(x) for x in embed[vocab[w]].split()]
19             pre_trained += 1
20     embeddings[0] = np.zeros(dim)  # index 0 is the padding token, set to all zeros
21
22     logging.info(
23         'embeddings: %.2f%%(pre_trained) unknown: %d' % (pre_trained / num_words * 100, num_words - pre_trained))
24
25     f.close()
26     return embeddings.astype(np.float32)
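A quick sanity check of the returned matrix (illustrative only, assuming the embedding files above are present):

embedding = load_embedding(embed_file, vac_file, word_dict)
num_words, dim = embedding.shape
assert num_words == len(word_dict) + 1   # one row per word id, plus row 0 reserved for PAD
assert (embedding[0] == 0).all()         # the PAD row is all zeros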
Finally, the model is trained:
 1 model = pa.ACNN(opt, embedding).to(device)
 2 optimizer = torch.optim.Adam(model.parameters(), lr=opt.LR, weight_decay=0.0001)  # optimize all model parameters
 3 loss_func = pa.DistanceLoss(opt.NR)
 4
 5 for i in range(opt.epochs):
 6     acc, loss = model_run(opt, train_dataloader, loss_func, model, all_y, optimizer)
 7     val_acc, val_loss = model_run(opt, eval_dataloader, loss_func, model, all_y)
 8     print('epoch: %d, t_l: %.2f, t_a: %.2f%%, v_l: %.2f, v_a: %.2f%%' % (i, loss, acc, val_loss, val_acc))
Line 1 initializes the ACNN model:
 1 class ACNN(nn.Module):
 2     def __init__(self, opt, embedding):
 3         super(ACNN, self).__init__()
 4         self.opt = opt
 5         self.dw = embedding.shape[1]
 6         self.vac_len = embedding.shape[0]
 7         self.d = self.dw + 2 * self.opt.DP
 8         self.p = (self.opt.K - 1) // 2  # padding
 9         self.x_embedding = nn.Embedding(self.vac_len, self.dw)
10         self.x_embedding.weight = nn.Parameter(torch.from_numpy(embedding))
11         # self.e1_embedding = nn.Embedding(self.vac_len, self.dw)
12         # self.e1_embedding.weight = nn.Parameter(torch.from_numpy(embedding))
13         # self.e2_embedding = nn.Embedding(self.vac_len, self.dw)
14         # self.e2_embedding.weight = nn.Parameter(torch.from_numpy(embedding))
15         self.dist_embedding = nn.Embedding(self.opt.NP, self.opt.DP)
16         self.rel_weight = nn.Parameter(torch.randn(self.opt.NR, self.opt.DC))
17         self.dropout = nn.Dropout(self.opt.KP)
18         self.conv = nn.Conv2d(1, self.opt.DC, (self.opt.K, self.d), (1, self.d), (self.p, 0), bias=True)
19         self.U = nn.Parameter(torch.randn(self.opt.DC, self.opt.NR))
20         self.max_pool = nn.MaxPool1d(self.opt.N, stride=1)
21
22     def input_attention(self, input_tuple, is_training=True):
23         x, e1, e1d2, e2, e2d1, zd, d1, d2 = input_tuple
24         x_emb = self.x_embedding(x)  # (bs, len(sentence) = n, dw)
25         e1_emb = self.x_embedding(e1)  # (bs, 1, dw)
26         e2_emb = self.x_embedding(e2)
27         # zd_emb = self.dist_embedding(zd)
28         # e1d2_emb = self.dist_embedding(e1d2)
29         # e2d1_emb = self.dist_embedding(e2d1)
30         dist1_emb = self.dist_embedding(d1)  # (bs, len(sentence) = n, dp)
31         dist2_emb = self.dist_embedding(d2)
32         x_cat = torch.cat((x_emb, dist1_emb, dist2_emb), 2)  # (bs, n, dw+2dp)
33         # e1_cat = torch.cat((e1_emb, zd_emb, e1d2_emb), 2)
34         # e2_cat = torch.cat((e2_emb, e2d1_emb, zd_emb), 2)
35         # if is_training:
36         #     x_cat = self.dropout(x_cat)
37         ine1_aw = F.softmax(torch.bmm(x_emb, e1_emb.transpose(2, 1)), 1)  # (bs, n, 1)
38         '''
39         attention: each of the n word positions is scored against e1_emb, then softmax is applied over the sentence
40         '''
41         ine2_aw = F.softmax(torch.bmm(x_emb, e2_emb.transpose(2, 1)), 1)
42         # ine1_aw = F.softmax(torch.bmm(x_cat, e1_cat.transpose(2, 1)), 1)  # (bs, n, 1)
43         # ine2_aw = F.softmax(torch.bmm(x_cat, e2_cat.transpose(2, 1)), 1)
44         in_aw = (ine1_aw + ine2_aw) / 2
45         R = torch.mul(x_cat, in_aw)
46         '''
47         torch.mul(a, b) multiplies a and b element-wise, with broadcasting:
48         (bs,n,dw+2dp) * (bs,n,1) -> (bs,n,1) is broadcast to (bs,n,dw+2dp) -> result (bs,n,dw+2dp)
49         i.e. each word's embedding row (dw+2dp) is scaled by its attention weight
50         '''
51         return R
52
53     # def attentive_pooling(self, R_star, all_y):
54     #     rel_emb = torch.mm(all_y, self.rel_weight)  # (NR, NR) * (NR, DC)
55     #     RU = torch.matmul(R_star.transpose(2, 1), self.U)  # (bs, n, nr)
56     #     G = torch.matmul(RU, rel_emb)  # (bs, n, dc)
57     #     AP = F.softmax(G, dim=1)
58     #     RA = torch.mul(R_star, AP.transpose(2, 1))
59     #     wo = self.max_pool(RA).squeeze(-1)
60     #     return wo, self.rel_weight
61
62     def attentive_pooling(self, R_star):
63         RU = torch.matmul(R_star.transpose(2, 1), self.U)  # (bs, n, nr)
64         '''
65         R_star.transpose(2, 1): (bs, n, 500)
66         (bs, n, 500) * (500, 19) = (bs, n, 19)
67         '''
68         G = torch.matmul(RU, self.rel_weight)  # (bs, n, dc)
69         '''
70         (bs, n, 19) * (19, 500) -> (bs, n, 500)
71         '''
72         AP = F.softmax(G, dim=1)  # softmax over the n word positions in each feature dimension, matching the attentive-pooling formula in the paper
73         RA = torch.mul(R_star, AP.transpose(2, 1))
74         # (bs, 500, n) * (bs, 500, n) -> element-wise product (bs, 500, n)
75         wo = self.max_pool(RA).squeeze(-1)  # (bs, 500, n) -> (bs, 500, 1) -> (bs, 500): keep the most representative position for each of the 500 feature dimensions
76         return wo
77
78     def forward(self, input_tuple, is_training=True):
79         R = self.input_attention(input_tuple, is_training)
80         R_star = self.conv(R.unsqueeze(1)).squeeze(-1)  # (bs, dc, n)
81         '''
82         this corresponds to the sliding-window convolution that produces the z_i vectors in the paper
83         R.unsqueeze(1): (bs, 1, n, dw+2dp)
84         self.conv(R.unsqueeze(1)) gives (bs, 500, n, 1) after the convolution
85         squeeze(-1): (bs, 500, n)
86         each word's context is now represented by a 500-filter vector for every example in the batch
87         '''
88         R_star = torch.tanh(R_star)
89         wo = self.attentive_pooling(R_star)
90         return wo, self.rel_weight
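To sanity-check the shapes noted in the comments, a minimal forward pass on random inputs can be run as follows (a sketch that assumes the ACNN class above, its torch imports, and the opt config are in scope; in the repo ACNN lives in the module imported as pa, and the embedding matrix here is just random):

import numpy as np
import torch

bs, vocab_size, dw = 4, 100, 50
embedding = np.random.uniform(-0.01, 0.01, size=(vocab_size, dw)).astype(np.float32)
model = ACNN(opt, embedding)

# random inputs with the shapes produced by gen_dataloader / data_unpack
x    = torch.randint(0, vocab_size, (bs, opt.N))   # padded word ids
e1   = torch.randint(0, vocab_size, (bs, 1))       # word id of the last token of e1
e2   = torch.randint(0, vocab_size, (bs, 1))
e1d2 = torch.randint(0, opt.NP, (bs, 1))           # pos-mapped distances
e2d1 = torch.randint(0, opt.NP, (bs, 1))
zd   = torch.zeros(bs, 1, dtype=torch.long)
d1   = torch.randint(0, opt.NP, (bs, opt.N))
d2   = torch.randint(0, opt.NP, (bs, opt.N))

wo, rel_weight = model((x, e1, e1d2, e2, e2d1, zd, d1, d2))
print(wo.shape, rel_weight.shape)  # torch.Size([4, 500]) torch.Size([19, 500])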
Line 3 sets up a custom loss:
 1 class DistanceLoss(nn.Module):
 2     def __init__(self, nr, margin=1):
 3         super(DistanceLoss, self).__init__()
 4         self.nr = nr
 5         self.margin = margin
 6
 7     def forward(self, wo, rel_weight, in_y, all_y):
 8         '''
 9         :param wo: [bs, 500]
10         :param rel_weight: [19, 500]
11         :param in_y: [bs, 19]
12         :param all_y: [19, 19]
13         :return:
14         '''
15         wo_norm = F.normalize(wo)  # (bs, dc); in_y is (bs, nr)
16         wo_norm_tile = wo_norm.unsqueeze(1).repeat(1, in_y.size()[-1], 1)  # (bs, nr, dc)
17         rel_emb = torch.mm(in_y, rel_weight)  # (bs, dc)
18         ay_emb = torch.mm(all_y, rel_weight)  # (nr, dc)
19         gt_dist = torch.norm(wo_norm - rel_emb, 2, 1)  # (bs,)  distance between wo and the embedding of the true label
20         all_dist = torch.norm(wo_norm_tile - ay_emb, 2, 2)  # (bs, nr)  distance between wo and every label embedding
21         masking_y = torch.mul(in_y, 10000)  # (bs, 19)
22         _t_dist = torch.min(torch.add(all_dist, masking_y), 1)[0]  # add a large value at the true label's position to mask it out; shape: [bs]
23         # Note: this picks the best-scoring wrong class. Among all incorrect labels, the one with the smallest distance is the closest to the prediction, i.e. the strongest competitor.
24         loss = torch.mean(self.margin + gt_dist - _t_dist)
25         return loss
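Written out as a formula, this forward pass computes the margin-based distance loss (only the network output is normalized; $W^{L}$ is rel_weight and $W^{L}_{y}$ is its row for the gold class $y$):

\[
L = \frac{1}{|B|} \sum_{(S,\,y)\in B} \Big[\, m + \Big\lVert \tfrac{w^{O}}{\lVert w^{O}\rVert} - W^{L}_{y} \Big\rVert_{2} - \min_{\bar{y}\neq y} \Big\lVert \tfrac{w^{O}}{\lVert w^{O}\rVert} - W^{L}_{\bar{y}} \Big\rVert_{2} \Big], \qquad m = 1,
\]

where the minimum over the wrong classes $\bar{y} \neq y$ is what the +10000 mask on the gold position implements.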
Lines 6 and 7 run training and evaluation, respectively:
 1 def model_run(opt, dataloader, loss_func, model, all_y, optimizer=None):
 2     '''all_y holds the one-hot vectors of the 19 relation classes'''
 3     acc, loss = 0, 0
 4     for i, (bx_cat, by) in enumerate(dataloader):  # d_t = [bs, 374], y_t = [bs, 19]
 5         by = by.float().to(device)
 6         bin_tup = data_unpack(bx_cat, opt.N)  # returns x, e1, e1d2, e2, e2d1, zd, d1, d2
 7         # wo, rel_weight = model(bin_tup, all_y)
 8         wo, rel_weight = model(bin_tup)  # wo = [bs, 500], self.rel_weight = [19, 500]
 9         a = prediction(wo, rel_weight, by, all_y)  # returns the batch accuracy
10         l = loss_func(wo, rel_weight, by, all_y)
11         # a = prediction(wo, rel_weight, by)
12         # l = loss_func(wo, rel_weight, by)
13         # print('%.2f%%, %.2f' % (a.cpu().data.numpy() * 100, l.detach().cpu().numpy()))
14         acc += a.cpu().data.numpy() * 100
15         loss += l.detach().cpu().numpy()
16         if optimizer is not None:
17             l.backward(), optimizer.step(), optimizer.zero_grad()
18     return acc / i, loss / i
19
20 def data_unpack(cat_data, N):
21     list_x = np.split(cat_data.numpy(), [N, N + 1, N + 2, N + 3, N + 4, N + 5, 2 * N + 5], 1)
22     x = torch.from_numpy(list_x[0]).to(device)
23     e1 = torch.from_numpy(list_x[1]).to(device)
24     e1d2 = torch.from_numpy(list_x[2]).to(device)
25     e2 = torch.from_numpy(list_x[3]).to(device)
26     e2d1 = torch.from_numpy(list_x[4]).to(device)
27     zd = torch.from_numpy(list_x[5]).to(device)
28     d1 = torch.from_numpy(list_x[6]).to(device)
29     d2 = torch.from_numpy(list_x[7]).to(device)
30     return x, e1, e1d2, e2, e2d1, zd, d1, d2
31
32 def prediction(wo, rel_weight, y, all_y):
33     '''
34     :param wo: [bs, 500]
35     :param rel_weight: [19, 500]
36     :param y: [bs, 19]
37     :param all_y: [19, 19]
38     :return:
39     '''
40     wo_norm = F.normalize(wo)  # default: dim=1, (bs, dc)
41     wo_norm_tile = wo_norm.unsqueeze(1).repeat(1, all_y.size()[0], 1)  # (bs, nr, dc)
42     # (bs, 500) -> (bs, 1, 500) -> (bs, 19, 500)
43     ay_emb = torch.mm(all_y, rel_weight)  # (nr, dc); this step is a bit subtle: it corresponds to W_y^L in the paper's formula
44     # (19, 19) * (19, 500) -> (19, 500)
45     dist = torch.norm(wo_norm_tile - ay_emb, 2, 2)  # (bs, nr); torch.norm computes the L2 norm
46     predict = torch.min(dist, 1)[1].long()  # index of the smallest distance in each row, i.e. argmin over the nr classes of delta_theta(S, y); shape: [bs]
47     y = torch.max(y, 1)[1]  # indices of the true classes; shape: [bs]
48     correct = torch.eq(predict, y)  # shape: [bs]
49     return correct.sum().float() / float(correct.data.size()[0])
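The prediction rule implemented by prediction() is simply the nearest relation embedding in the normalized output space:

\[
\hat{y} = \arg\min_{y} \Big\lVert \tfrac{w^{O}}{\lVert w^{O}\rVert} - W^{L}_{y} \Big\rVert_{2}
\]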
I ran the experiment and the results were far worse than expected; my guess is that the problem is my convert_to_one_hot label handling, which is not updated through gradient descent. As for l.detach().cpu().numpy() on line 15 of model_run(): detaching there only blocks backpropagation in order to read out the loss value, and does not affect the gradient-descent parameter updates.
References:
torch.nn.functional.normalize: https://blog.csdn.net/ECNU_LZJ/article/details/103653133?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromBaidu-1.not_use_machine_learn_pai&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromBaidu-1.not_use_machine_learn_pai
Understanding dim = 0: https://mathpretty.com/12065.html
torch.norm() usage: https://blog.csdn.net/qq_36556893/article/details/90698186