"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""
import numpy as np

# data I/O
data = open('input.txt', 'r', encoding='utf-8').read() # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }
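
# Illustrative sketch: how a single character becomes the 1-of-k (one-hot)
# column vector the RNN consumes. The helper name encode_char is hypothetical;
# lossFun below builds these vectors inline and never calls this.
def encode_char(ch):
  x = np.zeros((vocab_size, 1)) # one row per character in the vocabulary
  x[char_to_ix[ch]] = 1         # switch on the row for this character
  return x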

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

# model parameters
Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias
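
# For reference, the recurrence the forward pass in lossFun implements
# (a restatement of the code below, with shapes noted per step):
#   h[t] = tanh(Wxh @ x[t] + Whh @ h[t-1] + bh)   # (hidden_size, 1) hidden state
#   y[t] = Why @ h[t] + by                        # (vocab_size, 1) logits
#   p[t] = exp(y[t]) / sum(exp(y[t]))             # softmax over next characters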

def lossFun(inputs, targets, hprev):
  """
  inputs,targets are both list of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]
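
# A minimal numerical gradient-check sketch for lossFun: compares the analytic
# gradients against centered finite differences on a few randomly chosen
# entries of each parameter. The helper name grad_check and its arguments are
# illustrative; it is not called by the training loop below.
def grad_check(inputs, targets, hprev, num_checks=5, delta=1e-5):
  _, dWxh, dWhh, dWhy, dbh, dby, _ = lossFun(inputs, targets, hprev)
  for param, dparam, name in zip([Wxh, Whh, Why, bh, by],
                                 [dWxh, dWhh, dWhy, dbh, dby],
                                 ['Wxh', 'Whh', 'Why', 'bh', 'by']):
    for i in range(num_checks):
      ri = int(np.random.randint(param.size))
      old_val = param.flat[ri]
      param.flat[ri] = old_val + delta
      loss_plus = lossFun(inputs, targets, hprev)[0]
      param.flat[ri] = old_val - delta
      loss_minus = lossFun(inputs, targets, hprev)[0]
      param.flat[ri] = old_val # restore the original value
      grad_numerical = (loss_plus - loss_minus) / (2 * delta)
      grad_analytic = dparam.flat[ri]
      rel_error = abs(grad_analytic - grad_numerical) / max(abs(grad_numerical) + abs(grad_analytic), 1e-12)
      print('%s: numerical %f, analytic %f, relative error %e'
            % (name, grad_numerical, grad_analytic, rel_error))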

def sample(h, seed_ix, n):
  """
  sample a sequence of integers from the model
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(list(range(vocab_size)), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes
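
# A variant sketch of sample(): dividing the logits by a "temperature" before
# the softmax controls how conservative the samples are (temperature < 1
# sharpens the distribution, > 1 flattens it). The name sample_with_temperature
# and its temperature argument are illustrative additions and are not used by
# the training loop below.
def sample_with_temperature(h, seed_ix, n, temperature=1.0):
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = (np.dot(Why, h) + by) / temperature # scale logits before softmax
    y -= np.max(y) # subtract the max for numerical stability
    p = np.exp(y) / np.sum(np.exp(y))
    ix = np.random.choice(vocab_size, p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes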

n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p+seq_length+1 >= len(data) or n == 0:
    hprev = np.zeros((hidden_size,1)) # reset RNN memory
    p = 0 # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

  # sample from the model now and then
  if n % 100 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print('----\n %s \n----' % (txt, ))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 100 == 0: print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

  # perform parameter update with Adagrad
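  # Adagrad keeps a running sum of squared gradients per parameter (the m*
  # "memory" arrays) and divides each step by its square root, so parameters
  # with consistently large gradients take smaller steps:
  #   mem += g*g;  param -= learning_rate * g / sqrt(mem + 1e-8)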
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by],
                                [dWxh, dWhh, dWhy, dbh, dby],
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

  p += seq_length # move data pointer
  n += 1 # iteration counter