DQN Algorithm

[Algorithm diagram]

"""
The Dueling DQN based on this paper: https://arxiv.org/abs/1511.06581

View more on my tutorial page: https://morvanzhou.github.io/tutorials/

Using:
Tensorflow: 1.0
gym: 0.8.0
"""

import numpy as np
import tensorflow as tf

np.random.seed(3)
tf.set_random_seed(1)


class DuelingDQN:
  # Initialize hyperparameters and bookkeeping state
  def __init__(
          self,
          n_actions,
          n_features,
          learning_rate=0.001,
          reward_decay=0.9,
          e_greedy=0.9,
          replace_target_iter=200,
          memory_size=500,
          batch_size=32,
          e_greedy_increment=None,
          output_graph=True,
          dueling=True,
          n_point=20,
          sess=None,
  ):
      self.n_actions = n_actions    # number of actions the agent can choose from
      self.n_features = n_features  # number of state features, i.e. the input size of the network
      self.lr = learning_rate       # learning rate
      self.gamma = reward_decay     # gamma in the Bellman equation, the reward discount factor
      self.epsilon_max = e_greedy   # upper bound on the probability of acting greedily
      self.replace_target_iter = replace_target_iter  # copy the eval net into the target net every this many learning steps (200 by default)
      self.memory_size = memory_size  # capacity of the replay memory
      self.batch_size = batch_size  # mini-batch size; the network is trained on one batch per learning step
      self.n_points = n_point       # width of the hidden layer (the original note reads "20 access points?")
      self.epsilon_increment = e_greedy_increment  # per-step increase of epsilon when epsilon is annealed
      self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
      # Exploration uses an epsilon-greedy policy: with probability epsilon the agent takes the action with the
      # largest Q value, otherwise it acts randomly. An alternative strategy would be Boltzmann exploration.

      self.dueling = dueling      # whether to use the dueling architecture or a plain DQN head

      self.learn_step_counter = 0   # counts completed learning steps
      self.memory = np.zeros((self.memory_size, n_features*2+2))  # replay memory: one row per transition, shaped [memory_size, n_features*2+2]
      self._build_net()             # build the evaluation and target networks
      t_params = tf.get_collection('target_net_params')  # variables registered in the 'target_net_params' collection
      e_params = tf.get_collection('eval_net_params')    # variables registered in the 'eval_net_params' collection
      self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]  # ops that copy each eval-net variable into its matching target-net variable (zip pairs them up)

      if sess is None:  # create a TensorFlow session if the caller did not supply one
          self.sess = tf.Session()
          # initialize all global variables
          self.sess.run(tf.global_variables_initializer())
      else:
          self.sess = sess

      if output_graph:
          self.merged_loss = tf.summary.merge([self.aa])
          self.merged_W = tf.summary.merge([self.log_Q1,self.log_Q2,self.log_Q3,self.log_Q4,self.log_Q5])
          self.log=1
          self.writer = tf.summary.FileWriter("logs/", self.sess.graph)
          print('log created!')
      else:
          self.log=0
      self.cost_his = []

  # Build the evaluation network and the target network
  def _build_net(self):
      def build_layers(s, c_names, n_l1, w_initializer, b_initializer):  # build the layers shared by both networks
          with tf.variable_scope('l1'):
              w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)  # weights, shape [n_features, n_l1]
              b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)  # biases, shape [1, n_l1]
              l1 = tf.nn.relu(tf.matmul(s, w1) + b1)  # first hidden layer: relu(s * w1 + b1)
          # with tf.variable_scope('l2'):
          #     w2 = tf.get_variable('w2', [n_l1, n_l1], initializer=w_initializer, collections=c_names)
          #     b2 = tf.get_variable('b2', [1, n_l1], initializer=b_initializer, collections=c_names)
          #     l2 = tf.nn.relu(tf.matmul(l1, w2) + b2)
          # with tf.variable_scope('l3'):
          #     w3 = tf.get_variable('w3', [n_l1, n_l1], initializer=w_initializer, collections=c_names)
          #     b3 = tf.get_variable('b3', [1, n_l1], initializer=b_initializer, collections=c_names)
          #     l3 = tf.nn.relu(tf.matmul(l2, w3) + b3)

          if self.dueling:
              # Dueling DQN
              with tf.variable_scope('Value'):
                  wV = tf.get_variable('wV', [n_l1, 1], initializer=w_initializer, collections=c_names)
                  bV = tf.get_variable('bV', [1, 1], initializer=b_initializer, collections=c_names)
                  self.V = tf.matmul(l1, wV) + bV

              with tf.variable_scope('Advantage'):
                  wA = tf.get_variable('wA', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                  bA = tf.get_variable('bA', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                  self.A = tf.matmul(l1, wA) + bA

              with tf.variable_scope('Q'):  # combine V and A; subtracting the mean of A keeps V and A identifiable (it constrains the advantages to average to zero)
                  out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True))     # Q = V(s) + A(s,a)
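                  # Aggregation from the Dueling DQN paper: Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a')).
                  # The paper also proposes a max-based variant that subtracts max_a' A(s, a') instead;
                  # the mean-subtracted form used here is the one the paper adopts in its experiments.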

          else:  # plain (non-dueling) DQN head
              with tf.variable_scope('Q'):
                  w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)  # weights, shape [n_l1, n_actions]
                  b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)  # biases, shape [1, n_actions]
                  out = tf.matmul(l1, w2) + b2  # Q values: l1 * w2 + b2

          return out

      # ------------------ build evaluate_net ------------------
      # Build the evaluation network (the network that is actually trained)
      self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
      self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
      with tf.variable_scope('eval_net'):
          c_names, n_l1, w_initializer, b_initializer = \
              ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], self.n_points, \
              tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers
          # weights are drawn from a normal distribution with mean 0 and stddev 0.3; biases start at 0.1

          self.q_eval = build_layers(self.s, c_names, n_l1, w_initializer, b_initializer)  # Q values predicted by the evaluation network
          #self.max_q=np.max(self.q_eval)
          #self.min_q=np.min(self.q_eval)
          self.log_Q1 = tf.summary.scalar('Q_max', (tf.reduce_max(self.q_eval)))
          self.log_Q2 = tf.summary.scalar('Q_min', (tf.reduce_min(self.q_eval)))
          self.log_Q3 = tf.summary.scalar('Q_mean', (tf.reduce_mean(self.q_eval)))
          self.log_Q4 = tf.summary.scalar('Q_d', (tf.reduce_max(self.q_eval)-tf.reduce_min(self.q_eval)))
          self.log_Q5 = tf.summary.scalar('Q_def', (tf.reduce_max(self.q_eval) - tf.reduce_min(self.q_eval))/tf.reduce_mean(abs(self.q_eval)))
          #self.log_Q2 = tf.summary.scalar('Q2', self.min_q)


      with tf.variable_scope('loss'):
          self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))  # mean squared difference between q_target and q_eval
          self.aa = tf.summary.scalar('loss', self.loss)

      with tf.variable_scope('train'):
          self._train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)  # Adam optimizer
         # self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

      # ------------------ build target_net ------------------
      # Build the target network (its parameters are only updated by copying from the eval net)
      self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')    # input
      with tf.variable_scope('target_net'):
          c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
          self.q_next = build_layers(self.s_, c_names, n_l1, w_initializer, b_initializer)

  def store_transition(self, s, a, r, s_):  # store one transition in the replay memory
      if not hasattr(self, 'memory_counter'):  # lazily create the counter on the first call
          self.memory_counter = 0
      transition = np.hstack((s, [a, r], s_))  # stack [state, action, reward, next state] horizontally into a single row
      index = self.memory_counter % self.memory_size  # overwrite the oldest row once the buffer is full
      self.memory[index, :] = transition
      self.memory_counter += 1  # one more transition stored

  # Epsilon-greedy action selection: draw a uniform random number in [0, 1); if it is smaller than epsilon,
  # take the action with the largest action-value in the current state, otherwise take a random action.
  def choose_action(self, observation):  # select an action for the given observation
      observation = observation[np.newaxis, :]  # add a batch dimension
      if np.random.uniform() < self.epsilon:  # act greedily with probability epsilon
          if (self.log):
              # feed_dict supplies values for the placeholders defined in _build_net
              actions_value, summary = self.sess.run([self.q_eval, self.merged_W], feed_dict={self.s: observation})  # forward pass through the eval net
              self.writer.add_summary(summary, global_step=self.learn_step_counter)
          else:
              actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})  # forward pass through the eval net

          action = np.argmax(actions_value)  # index of the largest Q value
      else:
          action = np.random.randint(0, self.n_actions)  # otherwise pick a random action
      return action  # return the chosen action



# My understanding: at each learning step a batch is sampled from the replay memory, and every sampled transition's
# Q value is updated toward its Bellman target q_target(s, a) = r + gamma * max_a' q_next(s', a'). Early in training
# epsilon is small, so actions are chosen mostly at random (more exploration); as training proceeds epsilon is
# annealed up to epsilon_max and then stays there, so the agent acts greedily more and more often.
  def learn(self):  # one learning step
      if self.learn_step_counter % self.replace_target_iter == 0:  # periodically copy the eval net's parameters into the target net
          self.sess.run(self.replace_target_op)
          print('\ntarget_params_replaced\n')  # report that the target parameters were replaced

      # sample batch_size transitions, indexing only rows that have actually been filled so far
      sample_index = np.random.choice(min(self.memory_counter, self.memory_size), size=self.batch_size)
      batch_memory = self.memory[sample_index, :]  # the sampled mini-batch
     # print(tmp)
      #print(tmp[0])
      #batch_memory,range,min=self.autoNorm(tmp)
      #print(batch_memory)
      q_next = self.sess.run(self.q_next, feed_dict={self.s_: batch_memory[:, -self.n_features:]}) # next observation
      q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})

      q_target = q_eval.copy()

      batch_index = np.arange(self.batch_size, dtype=np.int32)  # row indices 0 .. batch_size-1
      eval_act_index = batch_memory[:, self.n_features].astype(int)  # actions taken in the sampled transitions
      reward = batch_memory[:, self.n_features + 1]  # immediate rewards

      q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)  # Bellman update for the action-value function
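      # Because q_target starts as a copy of q_eval, only the entries for the actions that were actually taken
      # differ from q_eval; every other (state, action) pair contributes zero error and zero gradient.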

      if(self.log):
          _, summary,self.cost = self.sess.run([self._train_op,self.merged_loss ,self.loss],
                                           feed_dict={self.s: batch_memory[:, :self.n_features],
                                                      self.q_target: q_target})
          self.writer.add_summary(summary, global_step=self.learn_step_counter)
      else:
          _, self.cost = self.sess.run([self._train_op, self.loss],
                                   feed_dict={self.s: batch_memory[:, :self.n_features],
                                              self.q_target: q_target})

      self.cost_his.append(self.cost)

      self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max  # anneal epsilon toward epsilon_max
      self.learn_step_counter += 1  # one more learning step completed

  def plot_cost(self):  # plot the training cost curve
      import matplotlib.pyplot as plt
      if self.log:
          self.writer.close()
      #self.writer.add_summary(rs, self.learn_step_counter)
      plt.plot(np.arange(len(self.cost_his)), self.cost_his)
      plt.ylabel('Cost')
      plt.xlabel('training steps')
      plt.show()
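
Below is a minimal usage sketch of how this class could be driven, assuming a standard gym control task. The environment name CartPole-v0, the episode count, and the warm-up threshold are illustrative assumptions rather than part of the original post; any environment whose observation is a flat feature vector would work the same way.

import gym

env = gym.make('CartPole-v0')            # assumed example environment
agent = DuelingDQN(n_actions=env.action_space.n,
                   n_features=env.observation_space.shape[0],
                   e_greedy_increment=0.001,
                   output_graph=False)

total_steps = 0
for episode in range(100):               # illustrative episode count
    observation = env.reset()
    while True:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, reward, observation_)
        if total_steps > agent.memory_size:   # start learning once the replay memory has been filled
            agent.learn()
        if done:
            break
        observation = observation_
        total_steps += 1

agent.plot_cost()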



