pytorch-day08(RNN)
PyTorch处理的都是数值类型(tensor),不直接支持string类型;对于string类型,需要先转换为数值向量再处理,这种表示称为representation或word embedding。
Sequence representation:
例如:
one-hot编码:稀疏,占了大量的空间,高维

semantic similarity(语义相似性):常用GloVe和word2vec两种方法得到能体现语义相似性的词向量

Batch:

word2vec vs GloVe:
word2vec:
import torch
from torch import nn

# Map every vocabulary word to an integer index; nn.Embedding looks up rows by index.
word_to_ix = {"hello": 0, "world": 1}
lookup = torch.tensor([word_to_ix["hello"]], dtype=torch.long)

# 2 words in the vocabulary, each represented by a 5-dimensional vector.
embeds = nn.Embedding(2, 5)
hello_embed = embeds(lookup)  # shape (1, 5): the embedding row of "hello"
print(hello_embed)
# The weights are randomly initialised, so the values differ per run, e.g.
# tensor([[-1.5328,  0.5370, -0.1833,  1.1079, -0.0814]], grad_fn=...)
Glove:

Sentiment Analysis(情感分析):

缺点:

解决方案1:

解决方案2:


import torch
from torch import nn

# input_size=100: number of features per word; hidden_size=20: size of the memory h.
rnn = nn.RNN(input_size=100, hidden_size=20, num_layers=1)
# The layer-0 parameters can be listed with rnn._parameters.keys():
# odict_keys(['weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0'])

print(rnn.weight_ih_l0.shape)  # torch.Size([20, 100])  (hidden_size, input_size)
print(rnn.weight_hh_l0.shape)  # torch.Size([20, 20])   (hidden_size, hidden_size)
print(rnn.bias_ih_l0.shape)    # torch.Size([20])
print(rnn.bias_hh_l0.shape)    # torch.Size([20])


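以上的shape计算方式同样适用于多层的RNN:第一层的weight_ih形状为[hidden_size, input_size];其余各层的输入是上一层的输出,因此weight_ih形状为[hidden_size, hidden_size]。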
两层的RNN:






import torch
from torch import nn


def main():
    """Print the output/state shapes of RNN, RNNCell, LSTM and LSTMCell.

    Every input uses the default layout (seq_len, batch, input_size) with
    seq_len=10, batch=3 and input_size=100.
    """
    # 100: features per word; 20: size of the memory/hidden state h.
    rnn = nn.RNN(input_size=100, hidden_size=20, num_layers=1)
    print(rnn)  # RNN(100, 20)
    x = torch.randn(10, 3, 100)
    # h0 has shape (num_layers, batch, hidden_size).
    out, h = rnn(x, torch.zeros(1, 3, 20))
    print(out.shape, h.shape)  # torch.Size([10, 3, 20]) torch.Size([1, 3, 20])

    rnn = nn.RNN(input_size=100, hidden_size=20, num_layers=4)
    print(rnn)  # RNN(100, 20, num_layers=4)
    x = torch.randn(10, 3, 100)
    out, h = rnn(x, torch.zeros(4, 3, 20))
    # `out` only holds the top layer's h for every step; `h` holds the last
    # step's state of each of the 4 layers.
    print(out.shape, h.shape)  # torch.Size([10, 3, 20]) torch.Size([4, 3, 20])

    print('rnn by cell')

    # A cell processes one time step, so the loop over the sequence is manual.
    cell1 = nn.RNNCell(100, 20)
    h1 = torch.zeros(3, 20)
    for xt in x:  # xt: (batch, input_size)
        h1 = cell1(xt, h1)
    print(h1.shape)  # torch.Size([3, 20])

    # Two stacked cells: the second consumes the hidden state of the first.
    cell1 = nn.RNNCell(100, 30)
    cell2 = nn.RNNCell(30, 20)
    h1 = torch.zeros(3, 30)
    h2 = torch.zeros(3, 20)
    for xt in x:
        h1 = cell1(xt, h1)
        h2 = cell2(h1, h2)
    print(h2.shape)  # torch.Size([3, 20])

    print('Lstm')
    lstm = nn.LSTM(input_size=100, hidden_size=20, num_layers=4)
    print(lstm)  # LSTM(100, 20, num_layers=4)
    x = torch.randn(10, 3, 100)
    out, (h, c) = lstm(x)  # initial (h0, c0) default to zeros when omitted
    # torch.Size([10, 3, 20]) torch.Size([4, 3, 20]) torch.Size([4, 3, 20])
    print(out.shape, h.shape, c.shape)

    print('one layer lstm')
    cell = nn.LSTMCell(input_size=100, hidden_size=20)
    h = torch.zeros(3, 20)
    c = torch.zeros(3, 20)
    for xt in x:
        # LSTMCell takes its state as an (h, c) tuple.
        h, c = cell(xt, (h, c))
    print(h.shape, c.shape)  # torch.Size([3, 20]) torch.Size([3, 20])

    print('two layer lstm')
    cell1 = nn.LSTMCell(input_size=100, hidden_size=30)
    cell2 = nn.LSTMCell(input_size=30, hidden_size=20)
    h1 = torch.zeros(3, 30)
    c1 = torch.zeros(3, 30)
    h2 = torch.zeros(3, 20)
    c2 = torch.zeros(3, 20)
    for xt in x:
        h1, c1 = cell1(xt, (h1, c1))
        h2, c2 = cell2(h1, (h2, c2))
    print(h2.shape, c2.shape)  # torch.Size([3, 20]) torch.Size([3, 20])


if __name__ == '__main__':
    main()
上面描绘的输入数据格式为(seq_len, batch, input_size)
在这个部分我们使用另一种输入格式(batch, seq_len, input_size),即在构造nn.RNN时设置batch_first=True,来实现一个小例子:
拟合一条如下的曲线:

import numpy as np
import torch
from torch import nn
from torch import optim

num_time_steps = 50  # number of points sampled from the sine curve per sequence
input_size = 1       # feature dimension of each input point
hidden_size = 16     # size of the hidden/memory state
output_size = 1      # dimension of each predicted value
lr = 0.01


class Net(nn.Module):
    """One-layer RNN followed by a linear head that predicts the next point."""

    def __init__(self):
        super(Net, self).__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
        )
        self.liner = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden_prev):
        """Run the RNN over ``x`` and map every hidden state to a prediction.

        Args:
            x: input of shape (batch, seq_len, input_size).
            hidden_prev: initial hidden state of shape
                (num_layers, batch, hidden_size); batch_first does not
                apply to the hidden state.

        Returns:
            (out, hidden): ``out`` has shape (batch, seq_len, output_size);
            ``hidden`` is the state after the last step,
            (num_layers, batch, hidden_size).
        """
        # out: h at every time step, (batch, seq_len, hidden_size)
        out, hidden_prev = self.rnn(x, hidden_prev)
        # nn.Linear acts on the last dimension, so the (batch, seq_len)
        # layout is kept for any batch size.
        out = self.liner(out)
        return out, hidden_prev


def make_batch():
    """Sample one sine segment and return (time_steps, x, y).

    x holds points 0..48 and y holds points 1..49, i.e. y is x shifted by
    one step; both have shape (1, num_time_steps - 1, 1).
    """
    start = np.random.randint(3, size=1)[0]  # random integer in [0, 2]
    time_steps = np.linspace(start, start + 10, num_time_steps)
    data = np.sin(time_steps).reshape(num_time_steps, 1)
    x = torch.tensor(data[:-1]).float().view(1, num_time_steps - 1, 1)
    y = torch.tensor(data[1:]).float().view(1, num_time_steps - 1, 1)
    return time_steps, x, y


def train(model, steps=600):
    """Train ``model`` to predict the next sine value; return the final hidden state."""
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr)
    hidden_prev = torch.zeros(1, 1, hidden_size)  # (num_layers, batch, hidden_size)

    for step in range(steps):
        _, x, y = make_batch()
        # The model is fed points 0..48 and the loss compares against 1..49.
        pred, hidden_prev = model(x, hidden_prev)
        # Detach so the next iteration does not backpropagate into this graph.
        hidden_prev = hidden_prev.detach()
        loss = criterion(pred, y)
        model.zero_grad()
        loss.backward()
        # If gradients explode, clip them here before the optimizer step, e.g.
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
        optimizer.step()

        if step % 100 == 0:
            print("Iteration {} loss {}".format(step, loss.item()))

    return hidden_prev


def predict(model, x, hidden_prev):
    """Roll the model forward from x[:, 0], feeding each prediction back in.

    Training uses seq_len = num_time_steps - 1; here every call uses
    seq_len = 1. Returns a list of x.shape[1] predicted values.
    """
    predictions = []
    step_input = x[:, 0, :]  # first point of the sequence
    with torch.no_grad():  # inference only; avoids building a growing graph
        for _ in range(x.shape[1]):
            step_input = step_input.view(1, 1, 1)
            pred, hidden_prev = model(step_input, hidden_prev)
            step_input = pred  # use x0 to predict x1, then x1 to predict x2, ...
            predictions.append(pred.numpy().ravel()[0])
    return predictions


def plot(time_steps, x, predictions):
    """Plot the true points (red dots plus line) and the predictions."""
    # Imported lazily so the model code stays importable without matplotlib.
    from matplotlib import pyplot as plt

    actual = x.numpy().ravel()
    plt.scatter(time_steps[:-1], actual, s=90, c="r")  # ground truth
    plt.plot(time_steps[:-1], actual)
    plt.scatter(time_steps[1:], predictions)  # predictions
    plt.show()


def main():
    model = Net()
    hidden_prev = train(model)
    time_steps, x, _ = make_batch()  # fresh data for plotting
    predictions = predict(model, x, hidden_prev)
    plot(time_steps, x, predictions)


if __name__ == '__main__':
    main()
结果:

梯度爆炸(无穷大):
解决方案(简单粗暴):
loss = criterion(pred, y)
model.zero_grad()
loss.backward()
for p in model.parameters():
    print(p.grad.norm())
    # Backpropagation through time can make gradients explode. The blunt fix
    # is a threshold: if the norm of this gradient exceeds 10 it is rescaled
    # in place so that its norm is 10. Gradients already below the threshold
    # are left unchanged, so clipping does not help with vanishing gradients.
    # clip_grad_norm_ is the in-place replacement for the deprecated
    # clip_grad_norm.
    torch.nn.utils.clip_grad_norm_(p, 10)
optimizer.step()
梯度弥散/梯度消失(梯度接近于0):
参数长时间得不到更新。解决方案:
RNN的问题:梯度弥散(梯度消失);记性不好(难以记住长距离的信息)
2、LSTM



LSTM的实现:







情感分类实战:





# Original note: trained on a K80 GPU for 12 hours.
import numpy as np
import torch
from torch import nn, optim


class RNN(nn.Module):
    """Two-layer bidirectional LSTM classifier producing one logit per sample."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNN, self).__init__()

        # token index in [0, vocab_size) => vector of size embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # embedding_dim => hidden_dim per direction
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,
                           bidirectional=True, dropout=0.5)
        # hidden_dim * 2 (forward + backward) => 1 logit
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Classify a batch of token-index sequences.

        Args:
            x: token indices of shape [seq_len, b].

        Returns:
            Logits of shape [b, 1].
        """
        # [seq, b] => [seq, b, embedding_dim]
        embedding = self.dropout(self.embedding(x))

        # output: [seq, b, hid_dim*2]
        # hidden/h: [num_layers*2, b, hid_dim]
        # cell/c: [num_layers*2, b, hid_dim]
        output, (hidden, cell) = self.rnn(embedding)

        # Take the last layer's forward and backward final states:
        # [num_layers*2, b, hid_dim] => 2 of [b, hid_dim] => [b, hid_dim*2]
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)

        # [b, hid_dim*2] => [b, 1]
        hidden = self.dropout(hidden)
        out = self.fc(hidden)

        return out


def binary_acc(preds, y):
    """Return the fraction of logits in ``preds`` whose rounded sigmoid equals ``y``."""
    preds = torch.round(torch.sigmoid(preds))
    correct = torch.eq(preds, y).float()
    acc = correct.sum() / len(correct)
    return acc


def train(rnn, iterator, optimizer, criteon):
    """Train ``rnn`` for one pass over ``iterator`` and print the mean batch accuracy.

    Each batch must expose ``.text`` ([seq, b] token indices) and ``.label`` ([b]).
    """
    avg_acc = []
    rnn.train()

    for i, batch in enumerate(iterator):
        # [seq, b] => [b, 1] => [b]
        pred = rnn(batch.text).squeeze(1)
        loss = criteon(pred, batch.label)
        acc = binary_acc(pred, batch.label).item()
        avg_acc.append(acc)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 10 == 0:
            print(i, acc)

    avg_acc = np.array(avg_acc).mean()
    print('avg acc:', avg_acc)


def evaluate(rnn, iterator, criteon):
    """Print the mean batch accuracy of ``rnn`` over ``iterator`` without updating it.

    ``criteon`` is accepted for signature compatibility but not used: only
    the accuracy is reported.
    """
    avg_acc = []

    rnn.eval()

    with torch.no_grad():
        for batch in iterator:
            # [b, 1] => [b]
            pred = rnn(batch.text).squeeze(1)
            acc = binary_acc(pred, batch.label).item()
            avg_acc.append(acc)

    avg_acc = np.array(avg_acc).mean()

    print('>>test:', avg_acc)


def main():
    """Load IMDB, initialise the embedding from GloVe and train for 10 epochs."""
    # Imported lazily so the model and helpers above stay importable without
    # the NLP stack installed.
    import spacy  # noqa: F401 -- presumably required by tokenize='spacy'
    try:
        # torchtext 0.9-0.11 ship the Field/BucketIterator API under `legacy`.
        from torchtext.legacy import data, datasets
    except ImportError:
        from torchtext import data, datasets

    torch.manual_seed(123)

    text_field = data.Field(tokenize='spacy')
    label_field = data.LabelField(dtype=torch.float)
    train_data, test_data = datasets.IMDB.splits(text_field, label_field)

    print('len of train data:', len(train_data))
    print('len of test data:', len(test_data))

    print(train_data.examples[15].text)
    print(train_data.examples[15].label)

    # Build the vocabulary and attach pretrained GloVe vectors (word2vec is
    # the other common choice).
    text_field.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
    label_field.build_vocab(train_data)

    batchsz = 30
    # To train on a GPU: create device = torch.device('cuda'), pass
    # device=device to BucketIterator.splits and call rnn.to(device).
    train_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, test_data),
        batch_size=batchsz,
    )

    rnn = RNN(len(text_field.vocab), 100, 256)

    # Start from the pretrained vectors instead of random embeddings.
    pretrained_embedding = text_field.vocab.vectors
    print('pretrained_embedding:', pretrained_embedding.shape)
    rnn.embedding.weight.data.copy_(pretrained_embedding)
    print('embedding layer inited.')

    optimizer = optim.Adam(rnn.parameters(), lr=1e-3)
    criteon = nn.BCEWithLogitsLoss()

    for epoch in range(10):
        # Evaluating first also reports the untrained baseline.
        evaluate(rnn, test_iterator, criteon)
        train(rnn, train_iterator, optimizer, criteon)
    # Report the result of the final training epoch as well.
    evaluate(rnn, test_iterator, criteon)


if __name__ == '__main__':
    main()

浙公网安备 33010602011771号