PyTorch - Notes on Some Techniques from Summer Experiments

1. A Dataset subclass that pads each batch to its own longest sentence (instead of the longest sentence in the whole training set), which saves a little GPU memory

Taking the data format needed for BiLSTM+CRF word segmentation as an example, override the collate_fn method (a DataLoader usage sketch follows the class):

import numpy as np
import torch
from torch.utils.data import Dataset

class SegDataset(Dataset):
    def __init__(self, data_path, word2idx, word_pad_idx=0, label_pad_idx=-1):
        super(SegDataset, self).__init__()
        self.texts = load_data(data_path)
        self.word2idx = word2idx

        self.word_pad_idx = word_pad_idx
        self.label_pad_idx = label_pad_idx

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]             # one row of data, e.g. ['西充县', '人民', '法院'], without begin/end symbols

        token_ids, label = [], []
        for item in text:
            tokens = list(item)
            token_ids.extend([self.word2idx.get(token, 1) for token in tokens])       # index 1 is <unk>
            if len(tokens) == 1:
                label.append(1)
            else:
                label.extend([0] * (len(tokens) - 1) + [1])           # label: [0, 0, 1, 0, 1, 0, 1]

        return torch.tensor(token_ids), torch.tensor(label)


    # pad to the longest sequence within a batch
    def collate_fn(self, batch):
        token_ids = [x[0] for x in batch]
        labels = [x[1] for x in batch]

        batch_len = len(token_ids)                          # batch size
        max_len = max([len(label) for label in labels])     # longest length in this batch

        # initialize the padded arrays
        batch_data = self.word_pad_idx * np.ones((batch_len, max_len))
        batch_labels = self.label_pad_idx * np.ones((batch_len, max_len))

        # padding and aligning
        for j in range(batch_len):
            cur_len = len(token_ids[j])
            batch_data[j][:cur_len] = token_ids[j]
            batch_labels[j][:cur_len] = labels[j]

        # convert data to torch LongTensors
        batch_data = torch.tensor(batch_data, dtype=torch.long)
        batch_labels = torch.tensor(batch_labels, dtype=torch.long)
        return [batch_data, batch_labels]

2. Building an embedding matrix from a word-vector file

In the code below the word-vector dimension is 300 (EMBEDDING_DIM = 300):

import numpy as np
import torch
from tqdm import tqdm

EMBEDDING_DIM = 300

def load_embedding(fpath):
    word2idx = {}
    wordemb = []
    word2idx['<pad>'] = 0
    wordemb.append(np.random.uniform(-0.01, 0.01, EMBEDDING_DIM).tolist())
    word2idx['<unk>'] = 1
    wordemb.append(np.random.uniform(-0.01, 0.01, EMBEDDING_DIM).tolist())
    with open(fpath, 'r') as f:
        for line in tqdm(f):
            splt = line.split()
            if len(splt) != 301:                    # expect a word followed by 300 floats; skip malformed lines
                continue
            vector = list(map(float, splt[-EMBEDDING_DIM:]))
            word = splt[0]
            if word not in word2idx:
                word2idx[word] = len(word2idx)
                wordemb.append(vector)
    return word2idx, np.asarray(wordemb, np.float32)


DICT_PATH = './data/embedding/token_vec_300.txt'
word2idx, wordemb = load_embedding(DICT_PATH)
pretrained_embedding = torch.from_numpy(wordemb)
print('pretrained_embedding=', pretrained_embedding.shape)

# model.embedding.weight.data.copy_(pretrained_embedding)
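Alternatively, the matrix can be wrapped into an embedding layer directly instead of copying it as in the commented line above; a minimal sketch (freeze=False keeps the vectors trainable, which is an assumption about the desired setup):

import torch.nn as nn

# build the embedding layer straight from the pretrained matrix
embedding = nn.Embedding.from_pretrained(pretrained_embedding, freeze=False)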

3. Using torchcrf

3.1 Computing the loss

Take the model definition for Chinese word segmentation as an example:

import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from transformers import BertModel
from transformers.models.bert.modeling_bert import *
from torchcrf import CRF

class BertSeg(BertPreTrainedModel):
    def __init__(self, config):
        super(BertSeg, self).__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.layernorm = nn.LayerNorm(config.hidden_size)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        self.crf = CRF(config.num_labels, batch_first=True)

        self.init_weights()


    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
                position_ids=None, inputs_embeds=None, head_mask=None):

        outputs = self.bert(input_ids,                              # [batch, max_batch_len]
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds)
        sequence_output = outputs[0]                                 # last_hidden_state, shape=[batch, max_batch_len, hidden_size]


        # keep only the non-padded positions of each sequence
        origin_sequence_output = [layer[starts.nonzero().squeeze(1)]
                                  for layer, starts in zip(sequence_output, attention_mask)]
        # pad sequence_output back to the longest length in the batch
        padded_sequence_output = pad_sequence(origin_sequence_output, batch_first=True)
        padded_sequence_output = self.dropout(padded_sequence_output)
        logits = self.classifier(padded_sequence_output)

        # sequence_output = self.dropout(sequence_output)
        # logits = self.classifier(sequence_output)                  # [batch, max_batch_len, 2]
        outputs = (logits,)

        '''
        if labels is not None:                          # labels provided, so we are training
            loss_fct = nn.CrossEntropyLoss()

            loss_mask = labels.gt(-1)                   # keep only the valid labels for the loss
            active_loss = loss_mask.view(-1) == 1

            active_labels = labels.view(-1)[active_loss]                         # [batch*max_batch_len]
            active_logits = logits.view(-1, self.num_labels)[active_loss]        # [batch*max_batch_len, 2]

            loss = loss_fct(active_logits, active_labels)
            outputs = (loss,) + outputs
        '''

        # CRF
        if labels is not None:
            loss_mask = labels.gt(-1)
            loss = self.crf(logits, labels, loss_mask) * (-1)        # the CRF returns a log-likelihood, so negate it for the loss
            outputs = (loss,) + outputs


        return outputs

Once defined, BertSeg is used in the same way as BertForSequenceClassification.
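A minimal sketch of that usage (input_ids, attention_mask, token_type_ids and labels are placeholder tensors; with labels the returned tuple is (loss, logits), without labels it is (logits,)):

# training forward pass: labels provided
loss, logits = model(input_ids, attention_mask=attention_mask,
                     token_type_ids=token_type_ids, labels=labels)

# inference forward pass: no labels
pre_logits = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[0]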

3.2 Decoding

bert_config = BertConfig.from_pretrained(DIR + 'config.json', num_labels=NUM_LABELS)
model = BertSeg.from_pretrained(DIR, config=bert_config)
model_file = DIR + 'pytorch_model.bin'
model.load_state_dict(torch.load(model_file), strict=False)

# after training, decode with the CRF
logits = model.crf.decode(pre_logits, mask=None)

# plain argmax decoding
logits = torch.argmax(pre_logits, dim=-1)
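Note that crf.decode returns the best tag sequence of each sentence as a Python list of lists rather than a tensor. A minimal sketch of passing a mask so padded positions are skipped (label_mask is a hypothetical name for the same labels.gt(-1) mask used for the loss):

label_mask = labels.gt(-1)                                   # [batch, max_batch_len], True at valid positions
best_paths = model.crf.decode(pre_logits, mask=label_mask)   # e.g. best_paths[0] is the tag list of the first sentence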

4. A fully connected layer for sparse inputs

4.1 Using torch.sparse

import torch

i = torch.LongTensor([[0, 2], [1, 0], [1, 2]])        # these are indices, not the stored values
v = torch.FloatTensor([3, 4, 5])
res = torch.sparse.FloatTensor(i.t(), v, torch.Size([2, 3]))         # .to_dense()
print(res.to_dense())       # a dense Tensor
# tensor([[0., 0., 3.],
#         [4., 0., 5.]])

print(res._indices())
# tensor([[0, 1, 1],
#         [2, 0, 2]])
print(res._values())        # tensor([3., 4., 5.])
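In newer PyTorch versions, torch.sparse_coo_tensor is the recommended constructor for the same thing; an equivalent sketch using the same i and v:

res = torch.sparse_coo_tensor(i.t(), v, size=(2, 3))
print(res.to_dense())
# tensor([[0., 0., 3.],
#         [4., 0., 5.]])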

4.2 Rewriting nn.Linear so it accepts a sparse (index, value) representation as input

Reference blog (zeroing out values in the weight matrix): https://blog.csdn.net/Kuo_Jun_Lin/article/details/115552545?utm_medium=distribute.pc_relevant.none-task-blog-baidujs_baidulandingword-0&spm=1001.2101.3001.4242

Override LinearFunction; official docs: https://pytorch.org/docs/stable/notes/extending.html

 

import torch
import torch.nn as nn


class LinearFunction(torch.autograd.Function):

    # Note that both forward and backward are @staticmethods
    @staticmethod
    def forward(ctx, idxes, values, weight, bias=None):         # [b, max_len], [b, max_len], [2, vocab_size]
        ctx.save_for_backward(idxes, values, weight, bias)
        output = torch.zeros((idxes.size(0), weight.size(0))).cuda()   # [b, 2]

        for i in range(idxes.size(0)):                  # iterate over the batch
            rows = idxes[i]                             # indices of all non-zero values
            x = weight.t()[rows]                        # pick the rows of the weight matrix at the non-zero positions

            mask = values[i].gt(-1)
            mat = x[mask] * values[i].view(-1, 1)[mask]
            # mat = x * values[i]
            res = mat.sum(dim=0)
            output[i] = res

        # output = input.mm(weight.t())
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        # This is a pattern that is very convenient - at the top of backward
        # unpack saved_tensors and initialize all gradients w.r.t. inputs to
        # None. Thanks to the fact that additional trailing Nones are ignored,
        # the return statement is simple even when the function has optional inputs.
        idxes, values, weight, bias = ctx.saved_tensors
        grad_weight = grad_bias = None

        # These needs_input_grad checks are optional and there only to
        # improve efficiency. If you want to make your code simpler, you can skip them.
        # Returning gradients for inputs that don't require it is not an error.

        # convert idxes and values into a sparse-matrix representation
        b = idxes.size(0)                                # [b, max_len]
        new_idxes = []
        for i in range(b):
            mask = values[i].gt(-1)
            new_idxes.extend([[i, val] for val in idxes[i][mask]])
            # new_idxes.extend([[i, val] for val in idxes[i]])
        all_mask = values.gt(-1)
        new_values = torch.FloatTensor(values[all_mask].cpu().numpy())      # the CUDA tensor cannot be converted to FloatTensor directly
        new_idxes = torch.LongTensor(new_idxes)

        input = torch.sparse.FloatTensor(new_idxes.t(), new_values, torch.Size([b, VOCAB_SIZE])).to_dense().cuda()
        if ctx.needs_input_grad[2]:
            grad_weight = grad_output.t().mm(input)
        if bias is not None and ctx.needs_input_grad[3]:
            grad_bias = grad_output.sum(0)

        return None, None, grad_weight, grad_bias                        # idxes and values get no gradient



class CustomizedLinear(nn.Module):
    def __init__(self, input_features, output_features, bias=True):
        super(CustomizedLinear, self).__init__()
        self.input_features = input_features
        self.output_features = output_features

        # nn.Parameter is a special kind of Tensor, that will get
        # automatically registered as Module's parameter once it's assigned
        # as an attribute. Parameters and buffers need to be registered, or
        # they won't appear in .parameters() (doesn't apply to buffers), and
        # won't be converted when e.g. .cuda() is called. You can use
        # .register_buffer() to register buffers. nn.Parameters require gradients by default.
        self.weight = nn.Parameter(torch.empty(output_features, input_features))
        if bias:
            self.bias = nn.Parameter(torch.empty(output_features))
        else:
            # You should always register all possible parameters, but the optional ones can be None if you want.
            self.register_parameter('bias', None)

        # Not a very smart way to initialize weights
        self.weight.data.uniform_(-0.1, 0.1)
        if self.bias is not None:
            self.bias.data.uniform_(-0.1, 0.1)

    def forward(self, idxes, values):
        # See the autograd section for explanation of what happens here.
        return LinearFunction.apply(idxes, values, self.weight, self.bias)

    def extra_repr(self):
        # (Optional)Set the extra information about this module. You can test it by printing an object of this class.
        return 'input_features={}, output_features={}, bias={}'.format(
            self.input_features, self.output_features, self.bias is not None
        )


class FcModel(nn.Module):
    def __init__(self, vocab_size, hidden_dim, output_size):        # vocab_size = size of the vocabulary
        super(FcModel, self).__init__()
        self.fc = CustomizedLinear(vocab_size, output_size)

    def forward(self, idxes, values):           # [batch, seq_len]
        x = self.fc(idxes, values)
        return x
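A minimal usage sketch of FcModel (VOCAB_SIZE, the example batch, and -1 as the padding value in values are assumptions for illustration; a GPU is required because LinearFunction moves its output to CUDA):

VOCAB_SIZE = 10000                                           # must match the global used inside backward

model = FcModel(VOCAB_SIZE, hidden_dim=128, output_size=2).cuda()

# each row lists the non-zero feature indices and their values; entries with value -1 are masked out
idxes = torch.LongTensor([[3, 17, 256], [42, 9, 0]]).cuda()
values = torch.FloatTensor([[1.0, 2.0, 0.5], [1.0, 3.0, -1.0]]).cuda()

out = model(idxes, values)                                   # [batch, output_size]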

5. Converting a tensor to numpy and then to a list

# if logits is a torch tensor:
logits.detach().cpu().numpy().tolist()

6. The tensor flipping function .flip()

x = torch.tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
print(x.flip(dims=[0]))         # tensor([[ 6,  7,  8,  9, 10], [ 1,  2,  3,  4,  5]])
print(x.flip(dims=[1]))         # tensor([[ 5,  4,  3,  2,  1], [10,  9,  8,  7,  6]])

7. Computing the loss on a pad-filled batch (the padded positions must be masked out)

loss_fct = nn.CrossEntropyLoss()
loss_mask = labels.gt(-1)                   # keep only the valid labels for the loss

active_loss = loss_mask.view(-1) == 1

active_labels = torch.where(active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels))
active_logits = logits.view(-1, self.num_labels)

loss = loss_fct(active_logits, active_labels)

# or, equivalently:
# active_labels = labels.view(-1)[active_loss]                              # [batch*max_batch_len]
# active_logits = logits.view(-1, self.num_labels)[active_loss]             # [batch*max_batch_len, 2]
# loss = loss_fct(active_logits, active_labels)

8. Top-k selection with the heapq library

import heapq

pre_values = [0.9983519315719604, 0.9670383334159851, 0.5502673387527466, 0.9989173412322998, 0.9934349060058594, 0.9387616515159607, 0.8802109956741333, 0.8454128503799438, 0.9943495392799377, 0.9983519315719604, 0.9847546815872192, 0.9931608438491821, 0.9957965612411499, 0.9994950294494629, 0.724168598651886, 0.6788691878318787, 0.9875763654708862, 0.9983518123626709, 0.9984715580940247, 0.9971969127655029, 0.9934577345848083, 0.9863535165786743, 0.995525062084198, 0.6242473125457764, 0.8157297968864441, 0.9625718593597412, 0.9997809529304504, 0.9988841414451599, 0.9907870888710022, 0.9848848581314087, 0.9409478902816772, 0.9791840314865112, 0.9999555349349976, 0.9967131614685059, 0.9983519315719604, 0.9056358933448792, 0.9983962178230286, 0.657272458076477, 0.9997139573097229, 0.9874300360679626, 0.9905149936676025, 0.8445992469787598, 0.9971678853034973, 0.9983079433441162, 0.9997274279594421, 0.9978759288787842, 0.9941542744636536, 0.8210688829421997, 0.8894577026367188, 0.9996976852416992]

# top-K values
pre = heapq.nlargest(10, pre_values)
# [0.9999555349349976, 0.9997809529304504, 0.9997274279594421, 0.9997139573097229, 0.9996976852416992, 0.9994950294494629, 0.9989173412322998, 0.9988841414451599, 0.9984715580940247, 0.9983962178230286]

# indices of the top-K values
pre_idxes = heapq.nlargest(10, range(len(pre_values)), pre_values.__getitem__)              # [32, 26, 44, 38, 49, 13, 3, 27, 18, 36]

9. Shuffling different arrays in the same order

Use case: Chinese and English corpora (aligned one-to-one, with identical labels) are stored in separate files; if they need to be shuffled, the two lists must be shuffled in the same order.

Just setting a random seed with random.seed() is not enough: two successive calls to random.shuffle() do not produce the same ordering.

import numpy as np

cont_ch, cont_en = np.array(cont_ch), np.array(cont_en)
label_ch = np.array(label)

state = np.random.get_state()
np.random.shuffle(cont_ch)
np.random.set_state(state)
np.random.shuffle(cont_en)
np.random.set_state(state)
np.random.shuffle(label_ch)

Get the random state with get_state() first, then restore it with set_state() before each subsequent shuffle. Only np.random provides the get_state()/set_state() pair (the built-in random module has no functions under these names), so the data needs to be converted to numpy arrays first.
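An alternative that avoids saving and restoring the RNG state is to draw one random permutation of indices and apply it to every array; a sketch, assuming all arrays have the same length:

perm = np.random.permutation(len(cont_ch))
cont_ch, cont_en, label_ch = cont_ch[perm], cont_en[perm], label_ch[perm]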

10. Reading and writing CSV files

import csv

def read(input_file):
    with open(input_file, "r", encoding="utf-8") as f:
        reader = csv.reader(f, quotechar='"')
        lines = []
        for line in reader:
            lines.append(line)              # label --> int(line[0]), text --> line[1:]
        return lines


def write(out_file, content):
    with open(out_file, 'w', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(content)
        print('Write finished!')

If the source file is not originally in CSV format but consists of content and labels (both Python lists):

def convert(x, y):                   # x is the text, y is the class label
    rows = []
    for i in range(len(y)):
        dic = {}
        dic['class'] = y[i]
        dic['text'] = x[i][:-1]      # drop the trailing newline \n; not always needed
        rows.append(dic)
    return rows

def get_data(content, all_label, new_path):

    all_dic = convert(content, all_label)
    headers = ['class', 'text']
    with open(new_path, 'w') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writerows(all_dic)

get_data(contents, labels, './sst-2/dev.csv')

11. Converting a CSV file to a TSV file

import pandas as pd
pd_all = pd.read_csv("./yelp/unlabel_8+train.csv", sep=',', encoding='utf-8')
# save as a TSV file
pd_all.to_csv("./yelp_tsv/unlabel_8+train.tsv", index=False, sep='\t', encoding='utf-8')

12. Calling the Baidu translation API to translate a document sentence by sentence

# Baidu general translation API
# coding=utf-8

import http.client
import hashlib
import urllib.parse
import random
import json

appid = 'your appid'
secretKey = 'your secret key'

httpClient = None

def baiduAPI(line, fromLang='auto', toLang='zh', myurl='/api/trans/vip/translate'):
    salt = random.randint(32768, 65536)
    q = line
    sign = appid + q + str(salt) + secretKey
    sign = hashlib.md5(sign.encode()).hexdigest()
    myurl = (myurl + '?appid=' + appid + '&q=' + urllib.parse.quote(q) + '&from=' + fromLang
             + '&to=' + toLang + '&salt=' + str(salt) + '&sign=' + sign)


    httpClient = http.client.HTTPConnection('api.fanyi.baidu.com')
    httpClient.request('GET', myurl)

    # response is an HTTPResponse object
    response = httpClient.getresponse()
    result_all = response.read().decode("utf-8")
    result = json.loads(result_all)
    # print(result)
    return result['trans_result'][0]['dst']

The appid and secret key are obtained by registering on the official Baidu Translate website.
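To translate a whole document sentence by sentence, loop over its lines and call baiduAPI on each one. A minimal sketch (the file names and the one-second pause for the free tier's rate limit are assumptions):

import time

with open('input_en.txt', 'r', encoding='utf-8') as fin, \
     open('output_zh.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        line = line.strip()
        if not line:
            continue
        fout.write(baiduAPI(line) + '\n')
        time.sleep(1)          # stay under the API's requests-per-second limit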

13. Plotting performance trend curves with matplotlib

Reference blog: https://www.cnblogs.com/douzujun/p/14974164.html

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.matplotlib_fname()

print(matplotlib.get_configdir())


with plt.style.context(['no-latex']):
    # x = np.linspace(0.0, 10.0)
    # y = np.sin(x)
    x = np.array([20, 40, 60, 80])
    pred = np.array([[87.55, 87.55, 87.55, 87.55],
                     [84.78, 86.6, 86.89, 86.85],
                     [85.26, 86.2, 86.08, 86.75],
                     [85.55, 86.75, 87.59, 87.8]
             ])
    styles = ['-', ':', '--', '-.']
    labels = ['Approach1', 'Approach2', 'Approach3', 'Approach4']
    colors = ['b', 'y', 'c', 'r']
    for p, style, label, color in zip(pred, styles, labels, colors):
        plt.plot(x, p, label=label, linestyle=style, color=color, linewidth=2.5)

    # plt.grid(True)         # add a grid
    # plt.ylim((83, 90))     # set the y-axis limits
    # plt.tight_layout()
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    # plt.title("Cross-Language", fontsize=18)
    plt.xlabel("The number of unlabeled data", fontsize=18)
    plt.ylabel("F1-score", fontsize=16)

    import matplotlib.ticker as mticker
    plt.gca().xaxis.set_major_formatter(mticker.FormatStrFormatter('%dk'))
    plt.gca().yaxis.set_major_formatter(mticker.FormatStrFormatter('%.2f %%'))
    plt.legend(edgecolor='k', loc='lower right', fontsize=18)         # loc='lower right' pins the legend to the lower-right corner
    plt.show()

Running this produces a trend plot with four curves and the legend pinned to the lower-right corner.

14. Constraining the per-model weights of an ensemble to [0, 1] with all weights summing to 1

import torch
import torch.nn as nn
import torch.nn.functional as F

class ensembleCNN(nn.Module):
    def __init__(self):
        super(ensembleCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocabulary), EMBEDDING_SIZE)
        self.model1 = TextCNN(len(vocabulary), EMBEDDING_SIZE, NUM_LABELS, SENTENCE_LIMIT_SIZE)
        self.model2 = TextCNN(len(vocabulary), EMBEDDING_SIZE, NUM_LABELS, SENTENCE_LIMIT_SIZE)
        self.model3 = TextCNN(len(vocabulary), EMBEDDING_SIZE, NUM_LABELS, SENTENCE_LIMIT_SIZE)
        self.weight = nn.Parameter(torch.rand(NUM_MODELS))          # NUM_MODELS (= 3 here) random values in [0, 1]

    def forward(self, x1, x2, x3):
        x1 = self.embedding(x1)
        x2 = self.embedding(x2)
        x3 = self.embedding(x3)

        out1 = self.model1(x1)
        out2 = self.model2(x2)
        out3 = self.model3(x3)

        w = F.softmax(self.weight, dim=0)                            # softmax so the weights sum to 1
        pred_final = w[0] * out1 + w[1] * out2 + w[2] * out3
        return out1, out2, out3, pred_final
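A minimal training sketch, assuming the ensemble is optimized only through the combined prediction (the original training loop is not shown in the post; x1/x2/x3 and train_labels are placeholders):

model = ensembleCNN()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

out1, out2, out3, pred_final = model(x1, x2, x3)     # x1/x2/x3: [batch, seq_len] token ids
loss = criterion(pred_final, train_labels)           # train_labels: [batch] class indices
optimizer.zero_grad()
loss.backward()
optimizer.step()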

15. Converting a class index into a one-hot encoding (e.g. with 4 classes, class 2 corresponds to [0, 0, 1, 0])

y = np.zeros(self.num_classes).astype(np.float32)
y[label] = 1.0
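The same encoding can also be produced with PyTorch in one call; a sketch matching the 4-class example in the heading:

import torch
import torch.nn.functional as F

y = F.one_hot(torch.tensor(2), num_classes=4).float()   # tensor([0., 0., 1., 0.])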

 
