pytorch-day08(RNN)

 PyTorch 只能处理数值类型(张量),不直接支持 string 类型;因此字符串(单词)需要先转换为数值向量再处理,这种数值表示称为 representation 或 word embedding。

 Sequence representation:

 例如:

    

 one-hot编码:向量稀疏、维度高(等于词表大小),占用大量空间,且无法表达词与词之间的语义相似性

  

 semantic similarity(语义相似性):常用的方法有 GloVe 和 word2vec

  

 Batch:

  

 word2vec vs GloVe:

  word2vec: 

1 word_to_ix = {"hello": 0, "world": 1}  # vocabulary: word -> integer index
2 lookup = torch.tensor([word_to_ix["hello"]], dtype=torch.long)  # index tensor holding the single index 0
3 
4 embeds = nn.Embedding(2, 5)  # lookup table: 2 entries, one 5-dimensional vector each
5 hello_embed = embeds(lookup)  # the table row for "hello", shape [1, 5]
6 print(hello_embed)  # e.g. tensor([[-1.5328,  0.5370, -0.1833,  1.1079, -0.0814]], ...) -- values are randomly initialised

  Glove:

    

 

Sentiment Analysis(情感分析):

  

  缺点:

    

  解决方案1:

    

  解决方案2:

    

      

 1 import torch
 2 from torch import nn
 3 
 4 rnn = nn.RNN(input_size=100, hidden_size=20, num_layers=1)  # input_size=100: features per word; hidden_size=20: size of the hidden state (memory) h
 5 # print(rnn._parameters.keys())  # odict_keys(['weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0'])
 6 
 7 print(rnn.weight_ih_l0.shape)  # torch.Size([20, 100]) = [hidden_size, input_size]
 8 print(rnn.weight_hh_l0.shape)  # torch.Size([20, 20]) = [hidden_size, hidden_size]
 9 print(rnn.bias_ih_l0.shape)  # torch.Size([20]) = [hidden_size]
10 print(rnn.bias_hh_l0.shape)  # torch.Size([20]) = [hidden_size]

  

  

  以上的shape计算方式同样适用于多层的RNN(注意:第2层及之后各层的输入是上一层的输出h,因此这些层的 weight_ih 的shape为 [hidden_size, hidden_size])

     

  两层的RNN:

    

      

    


  

      

    

    

 1 import torch
 2 from torch import nn
 3 
 4 def main():  # prints the input/output/state shapes of nn.RNN, nn.RNNCell, nn.LSTM and nn.LSTMCell
 5     rnn = nn.RNN(input_size=100, hidden_size=20, num_layers=1)  # input_size=100: features per word; hidden_size=20: size of the hidden state (memory) h
 6     print(rnn)  # RNN(100, 20)
 7     x = torch.randn(10, 3, 100)  # (seq_len=10, batch=3, input_size=100)
 8     out, h = rnn(x, torch.zeros(1, 3, 20))  # initial hidden state: (num_layers, batch, hidden_size)
 9     print(out.shape, h.shape)  # torch.Size([10, 3, 20]) torch.Size([1, 3, 20])
10 
11     rnn = nn.RNN(input_size=100, hidden_size=20, num_layers=4)
12     print(rnn)  # RNN(100, 20, num_layers=4)
13     x = torch.randn(10, 3, 100)
14     out, h = rnn(x, torch.zeros(4, 3, 20))  # one initial hidden state per layer
15     print(out.shape, h.shape)  # torch.Size([10, 3, 20]) torch.Size([4, 3, 20])
16     # print(vars(rnn))
17 
18     print('rnn by cell')
19 
20     cell1 = nn.RNNCell(100, 20)  # a cell processes one time step; the loop over time is written by hand
21     h1 = torch.zeros(3, 20)  # (batch, hidden_size)
22     for xt in x:  # xt: (batch, input_size), one time step of x
23         h1 = cell1(xt, h1)
24     print(h1.shape)  # torch.Size([3, 20])
25 
26     cell1 = nn.RNNCell(100, 30)  # two stacked cells: 100 -> 30 -> 20
27     cell2 = nn.RNNCell(30, 20)
28     h1 = torch.zeros(3, 30)
29     h2 = torch.zeros(3, 20)
30     for xt in x:
31         h1 = cell1(xt, h1)
32         h2 = cell2(h1, h2)  # the second cell takes the first cell's hidden state as its input
33     print(h2.shape)  # torch.Size([3, 20])
34 
35     print('Lstm')
36     lstm = nn.LSTM(input_size=100, hidden_size=20, num_layers=4)
37     print(lstm)  # LSTM(100, 20, num_layers=4)
38     x = torch.randn(10, 3, 100)
39     out, (h, c) = lstm(x)  # no initial (h, c) is passed here
40     print(out.shape, h.shape, c.shape)  # torch.Size([10, 3, 20]) torch.Size([4, 3, 20]) torch.Size([4, 3, 20])
41 
42     print('one layer lstm')
43     cell = nn.LSTMCell(input_size=100, hidden_size=20)
44     h = torch.zeros(3, 20)  # hidden state, (batch, hidden_size)
45     c = torch.zeros(3, 20)  # cell state, (batch, hidden_size)
46     for xt in x:
47         h, c = cell(xt, [h, c])  # an LSTM cell carries both h and c from step to step
48     print(h.shape, c.shape)  # torch.Size([3, 20]) torch.Size([3, 20])
49 
50     print('two layer lstm')
51     cell1 = nn.LSTMCell(input_size=100, hidden_size=30)  # two stacked cells: 100 -> 30 -> 20
52     cell2 = nn.LSTMCell(input_size=30, hidden_size=20)
53     h1 = torch.zeros(3, 30)
54     c1 = torch.zeros(3, 30)
55     h2 = torch.zeros(3, 20)
56     c2 = torch.zeros(3, 20)
57     for xt in x:
58         h1, c1 = cell1(xt, [h1, c1])
59         h2, c2 = cell2(h1, [h2, c2])  # only h1 (not c1) is fed to the second cell
60     print(h2.shape, c2.shape)  # torch.Size([3, 20]) torch.Size([3, 20])
61 
62 
63 if __name__ == '__main__':
64     main()

 上面描绘的输入数据格式为(seq_len, batch, input_size)

  

 在这个部分我们使用另一种输入格式(batch, seq_len, input_size)来实现一个小例子:

  拟合一条如下的曲线:

    

 1 import numpy as np
 2 import torch
 3 from torch import nn
 4 from torch import optim
 5 from matplotlib import pyplot as plt
 6 
 7 num_time_steps = 50  # 输入数据的个数
 8 input_size = 1  # 输入数据的维度
 9 hidden_size = 16  # hidden/memory的size
10 output_size = 1  # 预测值的size
11 lr = 0.01
12 
13 
14 class Net(nn.Module):
15     def __init__(self, ):
16         super(Net, self).__init__()
17         self.rnn = nn.RNN(
18             input_size=input_size,
19             hidden_size=hidden_size,
20             num_layers=1,
21             batch_first=True,
22         )
23         # for p in self.rnn.parameters():
24         #     nn.init.normal_(p, mean=0.0, std=0.001)
25 
26         self.liner = nn.Linear(hidden_size, output_size)
27 
28     def forward(self, x, hidden_prev):  # x:(batch, seq_len ,input_size)
29         # out:每个t时刻,h的输出 (batch, seq_len, hidden_size)
30         # hidden_prev:最后一个时刻h的输出 (batch, num_layers, hidden_size)
31         out, hidden_prev = self.rnn(x, hidden_prev)
32 
33         out = out.view(-1, hidden_size)  # (batch*seq_len, hidden_size)
34         out = self.liner(out)  # (batch*seq_len, 1)
35         out = out.unsqueeze(dim=0)  # (1, batch*seq_len, 1)
36         return out, hidden_prev
37 
38 
39 model = Net()
40 criterion = nn.MSELoss()
41 optimizer = optim.Adam(model.parameters(), lr)
42 
43 hidden_prev = torch.zeros(1, 1, hidden_size)  # (num_layers, batch, hidden_size)
44 
45 for iter in range(600):
46     start = np.random.randint(3, size=1)[0]  # random integer in {0, 1, 2}
47     time_steps = np.linspace(start, start + 10, num_time_steps)  # num_time_steps sample points per iteration
48     data = np.sin(time_steps)  # the curve to fit
49     data = data.reshape(num_time_steps, 1)  # (50, 1); the task is one-step-ahead prediction
50     # given samples 0-48, predict samples 1-49
51     x = torch.tensor(data[:-1]).float().view(1, num_time_steps - 1,
52                                              1)  # (1, 49, 1) ===> (batch, seq_len, input_size), samples 0-48
53     y = torch.tensor(data[1:]).float().view(1, num_time_steps - 1,
54                                             1)  # (1, 49, 1) ===> (batch, seq_len, input_size), samples 1-49
55     # x (samples 0-48) is the model input; y (samples 1-49) is the target used in the loss
56     pred, hidden_prev = model(x, hidden_prev)
57     hidden_prev = hidden_prev.detach()  # detached copy (requires_grad=False), so backward() stops at this iteration
58     # the loss is computed against y, not x
59     loss = criterion(pred, y)
60     model.zero_grad()
61     loss.backward()
62     # for p in model.parameters():
63     #     print(p.grad.norm())
64     # # Optional gradient clipping against exploding gradients:
65     # # rescale the gradients so that their total norm does not exceed the threshold
66     # torch.nn.utils.clip_grad_norm_(model.parameters(), 10)
67     optimizer.step()
68 
69     if iter % 100 == 0:
70         print("Iteration {} loss {}".format(iter, loss.item()))
71 
72 # build the data used for plotting
73 start = np.random.randint(3, size=1)[0]
74 time_steps = np.linspace(start, start + 10, num_time_steps)
75 data = np.sin(time_steps)
76 data = data.reshape(num_time_steps, 1)
77 x = torch.tensor(data[:-1]).float().view(1, num_time_steps - 1, 1)
78 y = torch.tensor(data[1:]).float().view(1, num_time_steps - 1, 1)
79 
80 predictions = []
81 input = x[:, 0, :]  # first sample of x, used as the seed
82 
83 for _ in range(x.shape[1]):  # 49
84     input = input.view(1, 1, 1)
85     (pred, hidden_prev) = model(input, hidden_prev)  # predict x1 from x0, then x2 from the predicted x1, ...
86     input = pred  # training used seq_len = 49 (num_time_steps - 1); here seq_len = 1 and each prediction is fed back in
87     predictions.append(pred.detach().numpy().ravel()[0])  # ravel(): flatten to 1-D
88 
89 x = x.data.numpy().ravel()
90 y = y.data.numpy()
91 plt.scatter(time_steps[:-1], x.ravel(), s=90, c="r")  # ground truth
92 plt.plot(time_steps[:-1], x.ravel())
93 
94 plt.scatter(time_steps[1:], predictions)  # predictions
95 plt.show()

  结果:

    


 梯度爆炸(无穷大):

  解决方案(简单粗暴):

1     loss = criterion(pred, y)
2     model.zero_grad()
3     loss.backward()
4     for p in model.parameters():
5         print(p.grad.norm())
6     # 既然在BP过程中会产生梯度消失/爆炸(就是偏导无限接近0,导致长时记忆无法更新),
7     # 那么最简单粗暴的方法,设定阈值,当梯度小于/大于阈值时,更新的梯度为阈值,即把梯度控制在10以内
8         torch.nn.utils.clip_grad_norm(p, 10)
9     optimizer.step()

 梯度离散(接近于0):

  参数长时间得不到更新。解决方案:

  RNN的问题:梯度离散(梯度消失);只有短期记忆,难以记住距离较远的信息

2、LSTM

  

  

  

  

  LSTM的实现:

  

 

  

  

   

  

  

  

  情感分类实战:

    

      

      

      

      

  1 # K80 gpu for 12 hours
  2 import torch
  3 import numpy as np
  4 import spacy
  5 from torch import nn, optim
  6 from torchtext import data, datasets
  7 
  8 torch.manual_seed(123)  # fixed seed for PyTorch's random number generator
  9 
 10 TEXT = data.Field(tokenize='spacy')  # review text field, tokenised with spaCy
 11 LABEL = data.LabelField(dtype=torch.float)  # float labels, as used by the BCE-with-logits loss below
 12 train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
 13 
 14 print('len of train data:', len(train_data))
 15 print('len of test data:', len(test_data))
 16 
 17 print(train_data.examples[15].text)  # show one example: its tokens ...
 18 print(train_data.examples[15].label)  # ... and its label
 19 
 20 # build the vocabulary from the training set and attach pretrained 100-d GloVe vectors (word2vec is the alternative)
 21 TEXT.build_vocab(train_data, max_size=10000, vectors='glove.6B.100d')
 22 LABEL.build_vocab(train_data)
 23 
 24 batchsz = 30
 25 # device = torch.device('cuda')
 26 train_iterator, test_iterator = data.BucketIterator.splits(
 27     (train_data, test_data),
 28     batch_size=batchsz,
 29     # device=device
 30 )
 31 
 32 
 33 class RNN(nn.Module):  # embedding -> 2-layer bidirectional LSTM -> linear layer producing one logit per example
 34     def __init__(self, vocab_size, embedding_dim, hidden_dim):
 35         super(RNN, self).__init__()
 36 
 37         # token index [0-10001] => [100]-d embedding vector
 38         self.embedding = nn.Embedding(vocab_size, embedding_dim)
 39         # [100] => [256] per direction
 40         self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=2,
 41                            bidirectional=True, dropout=0.5)
 42         # [256*2] => [1]: both directions concatenated => one logit
 43         self.fc = nn.Linear(hidden_dim * 2, 1)
 44         self.dropout = nn.Dropout(0.5)
 45 
 46     def forward(self, x):
 47         """
 48         x: [seq_len, b] token indices (sequence-first, unlike image batches such as [b, 3, 28, 28])
 49         """
 50         # [seq, b] => [seq, b, 100]
 51         embedding = self.dropout(self.embedding(x))
 52 
 53         # output: [seq, b, hid_dim*2]
 54         # hidden/h: [num_layers*2, b, hid_dim]
 55         # cell/c: [num_layers*2, b, hid_dim]
 56         output, (hidden, cell) = self.rnn(embedding)
 57 
 58         # [num_layers*2, b, hid_dim] => 2 of [b, hid_dim] (the last two entries of hidden) => [b, hid_dim*2]
 59         hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
 60 
 61         # [b, hid_dim*2] => [b, 1]
 62         hidden = self.dropout(hidden)
 63         out = self.fc(hidden)
 64 
 65         return out
 66 
 67 
 68 rnn = RNN(len(TEXT.vocab), 100, 256)  # vocab_size, embedding_dim=100, hidden_dim=256
 69 
 70 pretrained_embedding = TEXT.vocab.vectors  # vectors attached to the vocabulary by build_vocab(vectors=...)
 71 print('pretrained_embedding:', pretrained_embedding.shape)
 72 rnn.embedding.weight.data.copy_(pretrained_embedding)  # overwrite the embedding weights in place with the pretrained vectors
 73 print('embedding layer inited.')
 74 
 75 optimizer = optim.Adam(rnn.parameters(), lr=1e-3)
 76 criteon = nn.BCEWithLogitsLoss()  # takes raw logits; the model's forward applies no sigmoid
 77 # rnn.to(device)
 78 
 79 
 80 def binary_acc(preds, y):
 81     """
 82     Return the fraction of logits in preds whose rounded sigmoid equals the label in y.
 83     """
 84     preds = torch.round(torch.sigmoid(preds))  # logits -> probabilities -> rounded to 0. / 1.
 85     correct = torch.eq(preds, y).float()  # 1. where prediction == label, else 0.
 86     acc = correct.sum() / len(correct)
 87     return acc
 88 
 89 
 90 def train(rnn, iterator, optimizer, criteon):  # one pass over iterator with parameter updates; prints accuracy
 91     avg_acc = []
 92     rnn.train()  # training mode
 93 
 94     for i, batch in enumerate(iterator):
 95 
 96         # [seq, b] => [b, 1] => [b]
 97         pred = rnn(batch.text).squeeze(1)
 98         # loss on the raw logits
 99         loss = criteon(pred, batch.label)
100         acc = binary_acc(pred, batch.label).item()
101         avg_acc.append(acc)
102 
103         optimizer.zero_grad()
104         loss.backward()
105         optimizer.step()
106 
107         if i % 10 == 0:
108             print(i, acc)  # batch index and that batch's accuracy
109 
110     avg_acc = np.array(avg_acc).mean()  # mean of the per-batch accuracies
111     print('avg acc:', avg_acc)
112 
113 
114 def eval(rnn, iterator, criteon):  # one pass over iterator without parameter updates; prints mean accuracy
115     avg_acc = []
116 
117     rnn.eval()  # evaluation mode
118 
119     with torch.no_grad():  # no gradients are recorded inside this block
120         for batch in iterator:
121             # [b, 1] => [b]
122             pred = rnn(batch.text).squeeze(1)
123             loss = criteon(pred, batch.label)  # computed but not used or reported below
124 
125             acc = binary_acc(pred, batch.label).item()
126             avg_acc.append(acc)
127 
128     avg_acc = np.array(avg_acc).mean()  # mean of the per-batch accuracies
129 
130     print('>>test:', avg_acc)
131 
132 
133 for epoch in range(10):  # 10 epochs
134     eval(rnn, test_iterator, criteon)  # evaluation runs before training, so the first result is from the untrained LSTM
135     train(rnn, train_iterator, optimizer, criteon)  # the weights after the last train() call are never evaluated
View Code

 

posted @ 2020-07-31 19:55  小吴的日常  阅读(191)  评论(0)    收藏  举报