RNN 例子:解释前向传播和反向传播
import sys
from datetime import datetime

import numpy as np
# Public spelling of the former ``numpy.core.fromnumeric.shape`` import;
# ``numpy.core`` is a private/deprecated path in NumPy 2.x.
from numpy import shape  # noqa: F401  (kept so the name stays in scope)


class RNN:
    """Minimal vanilla RNN trained with back-propagation through time.

    Model (per time step ``t``, inputs are one-hot vectors of length
    ``word_dim``)::

        h_t     = tanh(U @ x_t + W @ h_{t-1})
        y_hat_t = softmax(V @ h_t)

    Attributes:
        U: input-to-hidden weights, shape ``(hidden_dim, word_dim)``.
        V: hidden-to-output weights, shape ``(word_dim, hidden_dim)``.
        W: hidden-to-hidden weights, shape ``(hidden_dim, hidden_dim)``.
    """

    def __init__(self, word_dim, hidden_dim=100, bptt_truncate=4):
        """Create a network with uniformly initialised weights.

        Args:
            word_dim: Size of the one-hot input/output vectors.
            hidden_dim: Number of hidden units.
            bptt_truncate: Stored for API compatibility only; ``bptt`` in
                this class back-propagates through the whole sequence and
                never reads this value.
        """
        self.word_dim = word_dim
        self.hidden_dim = hidden_dim
        self.bptt_truncate = bptt_truncate
        # Weights are drawn from U(-1/sqrt(n), 1/sqrt(n)) where n is the
        # number of incoming connections of the layer.
        input_bound = np.sqrt(1. / word_dim)
        hidden_bound = np.sqrt(1. / hidden_dim)
        self.U = np.random.uniform(-input_bound, input_bound, (hidden_dim, word_dim))
        self.V = np.random.uniform(-hidden_bound, hidden_bound, (word_dim, hidden_dim))
        self.W = np.random.uniform(-hidden_bound, hidden_bound, (hidden_dim, hidden_dim))

    def softmax(self, x):
        """Return the softmax of the 1-D score vector ``x``.

        The maximum is subtracted before exponentiating so that large
        scores do not overflow to ``inf`` (which would yield ``nan``).
        """
        scores = np.asarray(x, dtype=float)
        exp_x = np.exp(scores - np.max(scores))
        return exp_x / np.sum(exp_x)

    def forward_propagation(self, x):
        """Run the network over one sequence.

        Args:
            x: Sequence of ``T`` one-hot vectors, each of length ``word_dim``.

        Returns:
            ``(y_hat, h)`` where ``y_hat`` has shape ``(T, word_dim)`` and
            holds the per-step output distributions, and ``h`` has shape
            ``(T + 1, hidden_dim)``. The extra last row of ``h`` stays zero
            and serves as the initial hidden state (``h[-1]`` when ``t == 0``).
        """
        T = len(x)
        h = np.zeros((T + 1, self.hidden_dim))
        y_hat = np.zeros((T, self.word_dim))
        for t in range(T):
            x_t = np.asarray(x[t], dtype=float).reshape(-1)
            # tanh is required here: bptt() uses (1 - h**2), the derivative
            # of tanh, when propagating the error through the hidden layer.
            h[t] = np.tanh(self.U.dot(x_t) + self.W.dot(h[t - 1]))
            y_hat[t] = self.softmax(self.V.dot(h[t]))
        return y_hat, h

    def predict(self, x):
        """Return the index of the highest-probability output at each step."""
        y_hat, _ = self.forward_propagation(x)
        return np.argmax(y_hat, axis=1)

    def calculate_total_loss(self, x, labels):
        """Return the summed cross-entropy loss over all sequences.

        Args:
            x: List of input sequences (each a list of one-hot vectors).
            labels: List of target sequences, aligned with ``x``.
        """
        total_loss = 0
        for i in range(len(labels)):
            y_hat, _ = self.forward_propagation(x[i])
            # With one-hot targets the dot product picks the probability
            # assigned to the correct class.
            total_loss += -1 * sum(
                np.log(np.dot(y_pred, y_true))
                for y_pred, y_true in zip(y_hat, np.asarray(labels[i]))
            )
        return total_loss

    def calculate_loss(self, x, labels):
        """Return the cross-entropy loss averaged over all target steps."""
        n_steps = np.sum([len(label_i) for label_i in labels])
        return self.calculate_total_loss(x, labels) / n_steps

    def bptt(self, x, label):
        """Compute the loss gradients for one sequence via BPTT.

        Args:
            x: Sequence of one-hot input vectors.
            label: Sequence of one-hot target vectors, same length as ``x``.

        Returns:
            ``(dLdU, dLdV, dLdW)``, each shaped like the matching weight
            matrix.
        """
        T = len(label)
        y_hat, h = self.forward_propagation(x)
        dLdU = np.zeros(self.U.shape)
        dLdV = np.zeros(self.V.shape)
        dLdW = np.zeros(self.W.shape)
        # Error w.r.t. the hidden pre-activation of the *following* step;
        # zero past the end of the sequence, which makes the last step a
        # special case of the general recursion.
        delta_next = np.zeros(self.hidden_dim)
        for t in range(T - 1, -1, -1):
            # dL/do_t for softmax + cross-entropy.
            delta_y = y_hat[t] - np.asarray(label[t], dtype=float)
            dLdV += np.outer(delta_y, h[t])
            # Error reaching h_t from the output at t and from step t + 1,
            # pushed through tanh' = 1 - h_t**2.
            upstream = self.V.T.dot(delta_y) + self.W.T.dot(delta_next)
            delta_next = (1 - np.power(h[t], 2)) * upstream
            dLdW += np.outer(delta_next, h[t - 1])
            dLdU += np.outer(delta_next, np.asarray(x[t], dtype=float))
        return dLdU, dLdV, dLdW

    def numpy_sdg_step(self, x, label, learning_rate):
        """Perform one SGD update on a single training sequence (in place)."""
        dLdU, dLdV, dLdW = self.bptt(x, label)
        self.U -= learning_rate * dLdU
        self.V -= learning_rate * dLdV
        self.W -= learning_rate * dLdW

    # Correctly spelled alias; the original (misspelled) name is kept so
    # existing callers continue to work.
    numpy_sgd_step = numpy_sdg_step


def train_with_sgd(model, X_train, y_train, learning_rate=0.005, nepoch=100, evaluate_loss_after=5):
    """Train ``model`` with plain SGD, one sequence per update.

    Args:
        model: The RNN model instance.
        X_train: The training data set (list of input sequences).
        y_train: The training data labels (list of target sequences).
        learning_rate: Initial learning rate for SGD.
        nepoch: Number of times to iterate through the complete dataset.
        evaluate_loss_after: Evaluate the loss after this many epochs.

    Returns:
        List of ``(num_examples_seen, loss)`` tuples, one per evaluation,
        suitable for plotting the learning curve.
    """
    losses = []
    num_examples_seen = 0
    for epoch in range(nepoch):
        if epoch % evaluate_loss_after == 0:
            loss = model.calculate_loss(X_train, y_train)
            losses.append((num_examples_seen, loss))
            time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f'{time} Loss after num_examples_seen {num_examples_seen} epoch {epoch}, current loss is {loss}')
            # Halve the learning rate when the loss went up since the
            # previous evaluation.
            if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                learning_rate = learning_rate * 0.5
                print("Setting learning rate to %f" % learning_rate)

        for i in range(len(y_train)):
            model.numpy_sdg_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1
    return losses


if __name__ == '__main__':
    s1 = '你 好 李 焕 英'
    s2 = '夏 洛 特 烦 恼'
    vocab_size = len(s1.split(' ')) + len(s2.split(' '))
    # One-hot vector per character: row i encodes character i.
    vocab = [[1 if col == row else 0 for col in range(vocab_size)] for row in range(vocab_size)]
    x_sample = [vocab[:5]] + [vocab[5:]]
    # Targets are the inputs shifted by one character (next-character
    # prediction); the last character of s2 wraps around to the first of s1.
    labels = [vocab[1:6]] + [vocab[6:] + [vocab[0]]]

    rnn = RNN(10)
    train_with_sgd(rnn, x_sample, labels)
每一个不曾起舞的日子,都是对生命的辜负。