# coding=utf-8

import sys
import os
import gym
import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
from keras.callbacks import Callback
import matplotlib.pyplot as plt


class LossHistory(Callback):
    def on_train_begin(self, logs={}):
        self.losses = []

    def on_batch_end(self, batch, logs={}):
        self.losses.append(logs.get('loss'))


class DQNAgent:
    def __init__(self, state_size, action_size,
                 render=False, load_model=False,
                 gamma=0.99, learning_rate=0.001,
                 epsilon=1.0, epsilon_decay=0.999,
                 epsilon_min=0.01, batch_size=64,
                 train_start=100, memory_size=2000,
                 ):
        # Sizes of the environment's state and action spaces
        self.state_size = state_size
        self.action_size = action_size

        # render toggles gym's on-screen animation; enabling it slows training down considerably
        self.render = render
        # load_model=True means the model weights should be loaded from file
        self.load_model = load_model

        # DQN hyperparameters
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay  # decay factor applied to epsilon (epsilon-greedy) after each step
        self.epsilon_min = epsilon_min      # epsilon stops decaying once it reaches this floor

        self.train_start = train_start
        self.batch_size = batch_size

        # Replay memory
        self.memory = deque(maxlen=memory_size)

        # Build the Q-network
        self.model = self.build_model()
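        # The load_model flag is stored above but never acted on in the
        # original section; a minimal sketch of how it could be used,
        # assuming a weights file named 'dqn_weights.h5' (hypothetical,
        # not taken from the original code):
        if self.load_model:
            self.model.load_weights('dqn_weights.h5')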

        # Track training losses
        self.history = LossHistory()
        self.losses_list = []

    def build_model(self, units=128):
        model = Sequential()
        model.add(Dense(units, input_dim=self.state_size,
                        activation='sigmoid', kernel_initializer='he_uniform'))
        model.add(Dense(units, activation='sigmoid',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.summary()

        model.compile(loss='mean_squared_error', optimizer=Adam(lr=self.learning_rate))
        return model

    def choose_action(self, state):
        # Epsilon-greedy: explore with probability epsilon, otherwise act greedily
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])

    def add_memory(self, state, action, reward, done, next_state):
        # Store the transition and decay epsilon toward its minimum
        self.memory.append((state, action, reward, done, next_state))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def train_model(self):
        # Wait until enough transitions have been collected
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        action, reward, done = [], [], []

        for i in range(batch_size):
            update_input[i] = mini_batch[i][0]
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            done.append(mini_batch[i][3])
            update_target[i] = mini_batch[i][4]

        target = self.model.predict(update_input, batch_size=batch_size)
        target_val = self.model.predict(update_target, batch_size=batch_size)

        # Bellman update: terminal transitions get the raw reward, otherwise
        # bootstrap from the maximum predicted Q-value of the next state
        for i in range(batch_size):
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                target[i][action[i]] = reward[i] + self.gamma * np.amax(target_val[i])

        self.model.fit(update_input, target, batch_size=batch_size, epochs=1, verbose=0, callbacks=[self.history])
        self.losses_list.append(self.history.losses[0])


def draw_score_plot(scores, filename='graph.png'):
    fig = plt.figure()
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.set_title('mean score')
    ax1.plot(range(len(scores)), scores, color='blue')
    plt.savefig(filename)


def draw_plot(scores, losses, filename='graph.png'):
    fig = plt.figure()
    ax1 = fig.add_subplot(1, 2, 1)
    ax1.set_title('mean score')
    ax1.plot(range(len(scores)), scores, color='blue')

    ax2 = fig.add_subplot(1, 2, 2)
    ax2.set_title('mean loss')
    ax2.plot(range(len(losses)), losses, color='blue')
    plt.savefig(filename)
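

# A minimal usage sketch (not part of the original section): how the agent
# above might be driven against gym's CartPole-v1. The environment name,
# episode count, and weights file name are assumptions for illustration,
# and the old gym API (reset() -> state, step() -> 4-tuple) is assumed.
if __name__ == '__main__':
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)

    scores = []
    for episode in range(300):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        score, done = 0, False

        while not done:
            if agent.render:
                env.render()

            action = agent.choose_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            agent.add_memory(state, action, reward, done, next_state)
            agent.train_model()

            state = next_state
            score += reward

        scores.append(score)
        print('episode: {}  score: {}  epsilon: {:.3f}'.format(
            episode, score, agent.epsilon))

    # Persist the learned weights (file name is hypothetical) and plot results
    agent.model.save_weights('dqn_weights.h5')
    draw_plot(scores, agent.losses_list)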