
"""
The Dueling DQN based on this paper: https://arxiv.org/abs/1511.06581
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: 1.0
gym: 0.8.0
"""
import numpy as np
import tensorflow as tf
# Fix the NumPy and TensorFlow RNG seeds so experiments are reproducible.
np.random.seed(3)
tf.set_random_seed(1)
class DuelingDQN:
    """Dueling DQN agent, after Wang et al. (https://arxiv.org/abs/1511.06581).

    Built on the TensorFlow 1.x graph API (tf 1.0, gym 0.8.0).  Two networks
    are maintained: an online "eval" net trained every step, and a "target"
    net whose weights are hard-copied from the eval net every
    ``replace_target_iter`` learning steps.  Transitions are stored in a
    circular replay buffer and sampled in minibatches for training.
    """

    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.001,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=200,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=True,
            dueling=True,
            n_point=20,
            sess=None,
    ):
        """Configure the agent and build the TF graph.

        Args:
            n_actions: size of the discrete action space.
            n_features: length of the observation vector (network input).
            learning_rate: Adam step size.
            reward_decay: discount factor gamma in the Bellman target.
            e_greedy: final (maximum) greedy probability epsilon.
            replace_target_iter: learning steps between target-net syncs.
            memory_size: number of transitions kept in the replay buffer.
            batch_size: minibatch size sampled from the buffer per update.
            e_greedy_increment: per-learn-step epsilon increase; if None,
                epsilon is fixed at ``e_greedy`` from the start.
            output_graph: if True, write TensorBoard summaries to "logs/".
            dueling: if True, use the dueling V/A decomposition head.
            n_point: hidden-layer width (number of units in layer l1).
            sess: an existing tf.Session to reuse; if None a new session is
                created and all global variables are initialized.
        """
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.n_points = n_point  # hidden-layer width (param/attr name mismatch kept for compatibility)
        self.epsilon_increment = e_greedy_increment
        # Start fully exploratory only when epsilon is annealed; otherwise
        # act greedily with probability epsilon_max from the first step.
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
        self.dueling = dueling
        self.learn_step_counter = 0
        # Each buffer row stores [s, a, r, s_]: n_features + 2 + n_features.
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
        self._build_net()
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        # Op that hard-copies every eval-net variable into the target net.
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
        if sess is None:
            self.sess = tf.Session()
            self.sess.run(tf.global_variables_initializer())
        else:
            self.sess = sess
        if output_graph:
            # Merge the summaries created in _build_net so a single run()
            # fetch produces one serialized summary per group.
            self.merged_loss = tf.summary.merge([self.aa])
            self.merged_W = tf.summary.merge(
                [self.log_Q1, self.log_Q2, self.log_Q3, self.log_Q4, self.log_Q5])
            self.log = 1
            self.writer = tf.summary.FileWriter("logs/", self.sess.graph)
            print('log created!')
        else:
            self.log = 0
        self.cost_his = []  # per-learn-step loss history for plot_cost()

    def _build_net(self):
        """Create the eval/target networks, loss, train op, and summaries."""

        def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
            # Shared hidden layer: relu(s @ w1 + b1).
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(s, w1) + b1)
            if self.dueling:
                # Dueling head: scalar state value V(s) ...
                with tf.variable_scope('Value'):
                    wV = tf.get_variable('wV', [n_l1, 1], initializer=w_initializer, collections=c_names)
                    bV = tf.get_variable('bV', [1, 1], initializer=b_initializer, collections=c_names)
                    self.V = tf.matmul(l1, wV) + bV
                # ... plus per-action advantages A(s, a).
                with tf.variable_scope('Advantage'):
                    wA = tf.get_variable('wA', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                    bA = tf.get_variable('bA', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                    self.A = tf.matmul(l1, wA) + bA
                with tf.variable_scope('Q'):
                    # Subtract the mean advantage so V and A are identifiable
                    # (A is constrained to be zero-mean across actions).
                    out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True))  # Q = V(s) + A(s,a)
            else:
                # Plain DQN head: one linear layer producing Q values directly.
                with tf.variable_scope('Q'):
                    w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                    b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                    out = tf.matmul(l1, w2) + b2
            return out

        # ------------------ build evaluate_net ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input state
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
        with tf.variable_scope('eval_net'):
            # Weights ~ N(0, 0.3), biases = 0.1; variables are also registered
            # in 'eval_net_params' so they can be copied to the target net.
            c_names, n_l1, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], self.n_points, \
                tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)
            self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer)
        # Scalar summaries tracking the spread of the predicted Q values.
        self.log_Q1 = tf.summary.scalar('Q_max', (tf.reduce_max(self.q_eval)))
        self.log_Q2 = tf.summary.scalar('Q_min', (tf.reduce_min(self.q_eval)))
        self.log_Q3 = tf.summary.scalar('Q_mean', (tf.reduce_mean(self.q_eval)))
        self.log_Q4 = tf.summary.scalar('Q_d', (tf.reduce_max(self.q_eval) - tf.reduce_min(self.q_eval)))
        # Q-value range normalized by the mean absolute Q value.
        self.log_Q5 = tf.summary.scalar(
            'Q_def',
            (tf.reduce_max(self.q_eval) - tf.reduce_min(self.q_eval)) / tf.reduce_mean(abs(self.q_eval)))
        with tf.variable_scope('loss'):
            # Mean squared TD error between the Bellman target and q_eval.
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
            self.aa = tf.summary.scalar('loss', self.loss)
        with tf.variable_scope('train'):
            self._train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

        # ------------------ build target_net ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # next state
        with tf.variable_scope('target_net'):
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer)

    def store_transition(self, s, a, r, s_):
        """Append one (s, a, r, s_) transition to the circular replay buffer.

        Older entries are overwritten once ``memory_size`` rows are filled.
        """
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0  # lazily created on first store
        # Flatten the transition into one row: [s..., a, r, s_...].
        transition = np.hstack((s, [a, r], s_))
        index = self.memory_counter % self.memory_size  # circular write position
        self.memory[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        """Pick an action for ``observation`` via epsilon-greedy exploration.

        With probability epsilon, act greedily on the eval net's Q values;
        otherwise pick a uniformly random action.
        """
        observation = observation[np.newaxis, :]  # add a batch dimension
        if np.random.uniform() < self.epsilon:
            if self.log:
                # Fetch Q values and the Q-statistics summaries in one run.
                actions_value, summary = self.sess.run(
                    [self.q_eval, self.merged_W], feed_dict={self.s: observation})
                self.writer.add_summary(summary, global_step=self.learn_step_counter)
            else:
                actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)  # index of the best Q value
        else:
            action = np.random.randint(0, self.n_actions)  # random exploration
        return action

    def learn(self):
        """Sample a minibatch, build the Bellman targets, and train one step."""
        # Periodically hard-sync the target network with the eval network.
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            print('\ntarget_params_replaced\n')
        # BUGFIX: only sample indices that actually hold stored transitions.
        # The original sampled uniformly over the whole buffer, so before it
        # filled up the agent trained on all-zero rows.
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]
        # Q(s', .) from the frozen target net; Q(s, .) from the eval net.
        q_next = self.sess.run(self.q_next, feed_dict={self.s_: batch_memory[:, -self.n_features:]})
        q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)  # stored action
        reward = batch_memory[:, self.n_features + 1]  # stored immediate reward
        # Bellman target for the taken action only; the other entries stay
        # equal to q_eval so their TD error (and gradient) is zero.
        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
        if self.log:
            _, summary, self.cost = self.sess.run(
                [self._train_op, self.merged_loss, self.loss],
                feed_dict={self.s: batch_memory[:, :self.n_features],
                           self.q_target: q_target})
            self.writer.add_summary(summary, global_step=self.learn_step_counter)
        else:
            _, self.cost = self.sess.run(
                [self._train_op, self.loss],
                feed_dict={self.s: batch_memory[:, :self.n_features],
                           self.q_target: q_target})
        self.cost_his.append(self.cost)
        # Anneal epsilon towards epsilon_max (no-op when increment is None,
        # since epsilon already equals epsilon_max in that case).
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

    def plot_cost(self):
        """Plot the training-loss history; close the summary writer first."""
        import matplotlib.pyplot as plt
        if self.log:
            self.writer.close()  # flush pending TensorBoard events
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()