A Python 3 version of Andrej Karpathy's char-rnn
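This is Karpathy's minimal character-level vanilla RNN (his well-known min-char-rnn gist), ported to Python 3. The script reads a plain-text corpus from input.txt, trains the network with truncated backpropagation through time over windows of seq_length characters, updates the parameters with Adagrad, and prints a 200-character sample every 100 iterations. To try it, save the listing as, say, min_char_rnn.py (the file name is up to you), place any UTF-8 plain-text file next to it as input.txt, and run python3 min_char_rnn.py; the training loop runs forever, so interrupt it once the samples and the smoothed loss look reasonable.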

"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""
import numpy as np

# data I/O
data = open('input.txt', 'r', encoding='utf-8').read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

def lossFun(inputs, targets, hprev):
    """
    inputs,targets are both list of integers.
    hprev is Hx1 array of initial hidden state
    returns the loss, gradients on model parameters, and last hidden state
    """
    xs, hs, ys, ps = {}, {}, {}, {}
    hs[-1] = np.copy(hprev)
    loss = 0
    # forward pass
    for t in range(len(inputs)):
        xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
        xs[t][inputs[t]] = 1
        hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
        ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
        ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
        loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
    # backward pass: compute gradients going backwards
    dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
    dbh, dby = np.zeros_like(bh), np.zeros_like(by)
    dhnext = np.zeros_like(hs[0])
    for t in reversed(range(len(inputs))):
        dy = np.copy(ps[t])
        dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
        dWhy += np.dot(dy, hs[t].T)
        dby += dy
        dh = np.dot(Why.T, dy) + dhnext # backprop into h
        dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
        dbh += dhraw
        dWxh += np.dot(dhraw, xs[t].T)
        dWhh += np.dot(dhraw, hs[t-1].T)
        dhnext = np.dot(Whh.T, dhraw)
    for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
        np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
    return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

def sample(h, seed_ix, n):
    """
    sample a sequence of integers from the model
    h is memory state, seed_ix is seed letter for first time step
    """
    x = np.zeros((vocab_size, 1))
    x[seed_ix] = 1
    ixes = []
    for t in range(n):
        h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        y = np.dot(Why, h) + by
        p = np.exp(y) / np.sum(np.exp(y))
        ix = np.random.choice(list(range(vocab_size)), p=p.ravel())
        x = np.zeros((vocab_size, 1))
        x[ix] = 1
        ixes.append(ix)
    return ixes

n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
while True:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p+seq_length+1 >= len(data) or n == 0:
        hprev = np.zeros((hidden_size,1)) # reset RNN memory
        p = 0 # go from start of data
    inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
    targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

    # sample from the model now and then
    if n % 100 == 0:
        sample_ix = sample(hprev, inputs[0], 200)
        txt = ''.join(ix_to_char[ix] for ix in sample_ix)
        print('----\n %s \n----' % (txt, ))

    # forward seq_length characters through the net and fetch gradient
    loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % 100 == 0: print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

    # perform parameter update with Adagrad
    for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                  [dWxh, dWhh, dWhy, dbh, dby],
                                  [mWxh, mWhh, mWhy, mbh, mby]):
        mem += dparam * dparam
        param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

    p += seq_length # move data pointer
    n += 1 # iteration counter
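If you modify the backward pass in lossFun, a numerical gradient check is a quick way to catch mistakes. The sketch below is not part of the original script: it reuses the listing's lossFun and parameter globals and compares the analytic gradients against centered finite differences on one batch; run it in place of the training loop. The function name gradCheck and the choices of num_checks and delta are my own.

from random import uniform

def gradCheck(inputs, targets, hprev):
    # compare analytic gradients from lossFun with centered finite differences;
    # lossFun clips gradients at +/-5, but near initialization they are well inside that range
    num_checks, delta = 10, 1e-5
    _, dWxh, dWhh, dWhy, dbh, dby, _ = lossFun(inputs, targets, hprev)
    for param, dparam, name in zip([Wxh, Whh, Why, bh, by],
                                   [dWxh, dWhh, dWhy, dbh, dby],
                                   ['Wxh', 'Whh', 'Why', 'bh', 'by']):
        assert dparam.shape == param.shape, 'dims of %s and its gradient differ' % name
        for _ in range(num_checks):
            ri = int(uniform(0, param.size))  # random index into the parameter
            old_val = param.flat[ri]
            param.flat[ri] = old_val + delta
            loss_plus = lossFun(inputs, targets, hprev)[0]
            param.flat[ri] = old_val - delta
            loss_minus = lossFun(inputs, targets, hprev)[0]
            param.flat[ri] = old_val  # restore the original value
            grad_analytic = dparam.flat[ri]
            grad_numerical = (loss_plus - loss_minus) / (2 * delta)
            rel_error = abs(grad_analytic - grad_numerical) / (abs(grad_numerical) + abs(grad_analytic) + 1e-12)
            print('%s: numerical %f, analytic %f, relative error %e'
                  % (name, grad_numerical, grad_analytic, rel_error))

# example call on the first training window:
# gradCheck([char_to_ix[ch] for ch in data[:seq_length]],
#           [char_to_ix[ch] for ch in data[1:seq_length+1]],
#           np.zeros((hidden_size, 1)))

The relative errors should be small (roughly 1e-7 or less for these float64 parameters); values near 1 usually mean the analytic gradient for that parameter is wrong.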

 

posted on 2020-12-25 01:01 by 清风2009