Reinforcement learning algorithms: Soft Actor-Critic (SAC), the core code from the official release
The complete official code is available at:
https://openi.pcl.ac.cn/devilmaycry812839668/softlearning
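
Before the full listing, a rough orientation may help: the critic update in this file regresses each Q-network onto an entropy-regularized (soft) Bellman target, taking the minimum over the two target Q-networks. The standalone sketch below reproduces that target with plain TensorFlow and dummy tensors; the variable names and shapes here are illustrative assumptions, not the softlearning API:

import tensorflow as tf

# Illustrative sketch (not part of the official file): the soft Bellman target
#   y = reward_scale * r + discount * (1 - done) * (min_i Q_target_i(s', a') - alpha * log_pi(a'|s'))
batch_size = 4
rewards = tf.random.normal((batch_size, 1))
terminals = tf.zeros((batch_size, 1))              # 1.0 where the episode ended
next_log_pis = tf.random.normal((batch_size, 1))   # log pi(a'|s') from the policy
next_Q1 = tf.random.normal((batch_size, 1))        # twin target critics
next_Q2 = tf.random.normal((batch_size, 1))
alpha, discount, reward_scale = 0.2, 0.99, 1.0

next_Q = tf.minimum(next_Q1, next_Q2)              # clipped double-Q estimate
next_values = next_Q - alpha * next_log_pis        # soft state value of s'
Q_targets = reward_scale * rewards + discount * (1.0 - terminals) * next_values

In the listing, the same computation is performed by td_targets and compute_Q_targets, and _compute_Q_targets feeds them from a replay batch inside tf.function-compiled graphs.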

Core code:
from copy import deepcopy
from collections import OrderedDict
from numbers import Number

import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

from softlearning.utils.gym import is_continuous_space, is_discrete_space
from .rl_algorithm import RLAlgorithm


@tf.function(experimental_relax_shapes=True)
def td_targets(rewards, discounts, next_values):
    return rewards + discounts * next_values


@tf.function(experimental_relax_shapes=True)
def compute_Q_targets(next_Q_values,
                      next_log_pis,
                      rewards,
                      terminals,
                      discount,
                      entropy_scale,
                      reward_scale):
    next_values = next_Q_values - entropy_scale * next_log_pis
    terminals = tf.cast(terminals, next_values.dtype)

    Q_targets = td_targets(
        rewards=reward_scale * rewards,
        discounts=discount,
        next_values=(1.0 - terminals) * next_values)

    return Q_targets


def heuristic_target_entropy(action_space):
    if is_continuous_space(action_space):
        heuristic_target_entropy = -np.prod(action_space.shape)
    elif is_discrete_space(action_space):
        raise NotImplementedError(
            "TODO(hartikainen): implement for discrete spaces.")
    else:
        raise NotImplementedError((type(action_space), action_space))

    return heuristic_target_entropy
class SAC(RLAlgorithm):
    """Soft Actor-Critic (SAC)

    References
    ----------
    [1] Tuomas Haarnoja*, Aurick Zhou*, Kristian Hartikainen*, George Tucker,
        Sehoon Ha, Jie Tan, Vikash Kumar, Henry Zhu, Abhishek Gupta, Pieter
        Abbeel, and Sergey Levine. Soft Actor-Critic Algorithms and
        Applications. arXiv preprint arXiv:1812.05905. 2018.
    """

    def __init__(
            self,
            training_environment,
            evaluation_environment,
            policy,
            Qs,
            plotter=None,
            policy_lr=3e-4,
            Q_lr=3e-4,
            alpha_lr=3e-4,
            reward_scale=1.0,
            target_entropy='auto',
            discount=0.99,
            tau=5e-3,
            target_update_interval=1,
            save_full_state=False,
            Q_targets=None,
            **kwargs,
    ):
        """
        Args:
            env (`SoftlearningEnv`): Environment used for training.
            policy: A policy function approximator.
            Qs: Q-function approximators. The min of these
                approximators will be used. Usage of at least two Q-functions
                improves performance by reducing overestimation bias.
            plotter (`QFPolicyPlotter`): Plotter instance to be used for
                visualizing Q-function during training.
            lr (`float`): Learning rate used for the function approximators.
            discount (`float`): Discount factor for Q-function updates.
            tau (`float`): Soft value function target update weight.
            target_update_interval ('int'): Frequency at which target network
                updates occur in iterations.
        """
        super(SAC, self).__init__(**kwargs)

        self._training_environment = training_environment
        self._evaluation_environment = evaluation_environment
        self._policy = policy

        self._Qs = Qs
        if Q_targets is not None:
            self._Q_targets = Q_targets
        else:
            self._Q_targets = tuple(deepcopy(Q) for Q in Qs)
            self._update_target(tau=tf.constant(1.0))
        self._plotter = plotter

        self._policy_lr = policy_lr
        self._Q_lr = Q_lr
        self._alpha_lr = alpha_lr

        self._reward_scale = reward_scale
        self._target_entropy = (
            heuristic_target_entropy(self._training_environment.action_space)
            if target_entropy == 'auto'
            else target_entropy)

        self._discount = discount
        self._tau = tau
        self._target_update_interval = target_update_interval
        self._save_full_state = save_full_state

        self._Q_optimizers = tuple(
            tf.optimizers.Adam(
                learning_rate=self._Q_lr,
                name=f'Q_{i}_optimizer'
            ) for i, Q in enumerate(self._Qs))

        self._policy_optimizer = tf.optimizers.Adam(
            learning_rate=self._policy_lr,
            name="policy_optimizer")

        self._log_alpha = tf.Variable(0.0)
        self._alpha = tfp.util.DeferredTensor(self._log_alpha, tf.exp)
        self._alpha_optimizer = tf.optimizers.Adam(
            self._alpha_lr, name='alpha_optimizer')
    @tf.function(experimental_relax_shapes=True)
    def _compute_Q_targets(self, batch):
        next_observations = batch['next_observations']
        rewards = batch['rewards']
        terminals = batch['terminals']

        entropy_scale = tf.convert_to_tensor(self._alpha)
        reward_scale = tf.convert_to_tensor(self._reward_scale)
        discount = tf.convert_to_tensor(self._discount)

        next_actions, next_log_pis = self._policy.actions_and_log_probs(
            next_observations)
        next_Qs_values = tuple(
            Q.values(next_observations, next_actions) for Q in self._Q_targets)
        next_Q_values = tf.reduce_min(next_Qs_values, axis=0)

        Q_targets = compute_Q_targets(
            next_Q_values,
            next_log_pis,
            rewards,
            terminals,
            discount,
            entropy_scale,
            reward_scale)

        return tf.stop_gradient(Q_targets)
    @tf.function(experimental_relax_shapes=True)
    def _update_critic(self, batch):
        """Update the Q-function.

        Creates a `tf.optimizer.minimize` operation for updating
        critic Q-function with gradient descent, and appends it to
        `self._training_ops` attribute.

        See Equations (5, 6) in [1], for further information of the
        Q-function update rule.
        """
        Q_targets = self._compute_Q_targets(batch)

        observations = batch['observations']
        actions = batch['actions']
        rewards = batch['rewards']

        tf.debugging.assert_shapes((
            (Q_targets, ('B', 1)), (rewards, ('B', 1))))

        Qs_values = []
        Qs_losses = []
        for Q, optimizer in zip(self._Qs, self._Q_optimizers):
            with tf.GradientTape() as tape:
                Q_values = Q.values(observations, actions)
                Q_losses = 0.5 * (
                    tf.losses.MSE(y_true=Q_targets, y_pred=Q_values))
                Q_loss = tf.nn.compute_average_loss(Q_losses)

            gradients = tape.gradient(Q_loss, Q.trainable_variables)
            optimizer.apply_gradients(zip(gradients, Q.trainable_variables))
            Qs_losses.append(Q_losses)
            Qs_values.append(Q_values)

        return Qs_values, Qs_losses
    @tf.function(experimental_relax_shapes=True)
    def _update_actor(self, batch):
        """Update the policy.

        Creates a `tf.optimizer.minimize` operations for updating
        policy and entropy with gradient descent, and adds them to
        `self._training_ops` attribute.

        See Section 4.2 in [1], for further information of the policy update,
        and Section 5 in [1] for further information of the entropy update.
        """
        observations = batch['observations']

        entropy_scale = tf.convert_to_tensor(self._alpha)

        with tf.GradientTape() as tape:
            actions, log_pis = self._policy.actions_and_log_probs(observations)

            Qs_log_targets = tuple(
                Q.values(observations, actions) for Q in self._Qs)
            Q_log_targets = tf.reduce_mean(Qs_log_targets, axis=0)

            policy_losses = entropy_scale * log_pis - Q_log_targets
            policy_loss = tf.nn.compute_average_loss(policy_losses)

        tf.debugging.assert_shapes((
            (actions, ('B', 'nA')),
            (log_pis, ('B', 1)),
            (policy_losses, ('B', 1)),
        ))

        policy_gradients = tape.gradient(
            policy_loss, self._policy.trainable_variables)

        self._policy_optimizer.apply_gradients(zip(
            policy_gradients, self._policy.trainable_variables))

        return policy_losses
    @tf.function(experimental_relax_shapes=True)
    def _update_alpha(self, batch):
        if not isinstance(self._target_entropy, Number):
            return 0.0

        observations = batch['observations']
        actions, log_pis = self._policy.actions_and_log_probs(observations)

        with tf.GradientTape() as tape:
            alpha_losses = -1.0 * (
                self._alpha * tf.stop_gradient(log_pis + self._target_entropy))
            # NOTE(hartikainen): It's important that we take the average here,
            # otherwise we end up effectively having `batch_size` times too
            # large learning rate.
            alpha_loss = tf.nn.compute_average_loss(alpha_losses)

        alpha_gradients = tape.gradient(alpha_loss, [self._log_alpha])
        self._alpha_optimizer.apply_gradients(zip(
            alpha_gradients, [self._log_alpha]))

        return alpha_losses
    @tf.function(experimental_relax_shapes=True)
    def _update_target(self, tau):
        for Q, Q_target in zip(self._Qs, self._Q_targets):
            for source_weight, target_weight in zip(
                    Q.trainable_variables, Q_target.trainable_variables):
                target_weight.assign(
                    tau * source_weight + (1.0 - tau) * target_weight)

    @tf.function(experimental_relax_shapes=True)
    def _do_updates(self, batch):
        """Runs the update operations for policy, Q, and alpha."""
        Qs_values, Qs_losses = self._update_critic(batch)
        policy_losses = self._update_actor(batch)
        alpha_losses = self._update_alpha(batch)

        diagnostics = OrderedDict((
            ('Q_value-mean', tf.reduce_mean(Qs_values)),
            ('Q_loss-mean', tf.reduce_mean(Qs_losses)),
            ('policy_loss-mean', tf.reduce_mean(policy_losses)),
            ('alpha', tf.convert_to_tensor(self._alpha)),
            ('alpha_loss-mean', tf.reduce_mean(alpha_losses)),
        ))

        return diagnostics
    def _do_training(self, iteration, batch):
        training_diagnostics = self._do_updates(batch)

        if iteration % self._target_update_interval == 0:
            # Run target ops here.
            self._update_target(tau=tf.constant(self._tau))

        return training_diagnostics

    def get_diagnostics(self,
                        iteration,
                        batch,
                        training_paths,
                        evaluation_paths):
        """Return diagnostic information as an ordered dictionary.

        Also calls the `draw` method of the plotter, if plotter defined.
        """
        diagnostics = OrderedDict((
            ('alpha', self._alpha.numpy()),
            ('policy', self._policy.get_diagnostics_np(batch['observations'])),
        ))

        if self._plotter:
            self._plotter.draw()

        return diagnostics

    @property
    def tf_saveables(self):
        saveables = {
            '_policy_optimizer': self._policy_optimizer,
            **{
                f'Q_optimizer_{i}': optimizer
                for i, optimizer in enumerate(self._Q_optimizers)
            },
            '_alpha': self._alpha,
        }

        if hasattr(self, '_alpha_optimizer'):
            saveables['_alpha_optimizer'] = self._alpha_optimizer

        return saveables
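
Two other pieces of the class are easy to reproduce in isolation: the temperature update in _update_alpha, which adjusts alpha so that the policy's entropy tracks target_entropy, and the Polyak-averaged target-network update in _update_target. The sketch below is again a standalone illustration with dummy tensors and made-up variable names, not the official softlearning objects:

import tensorflow as tf

# Temperature (alpha) update, mirroring _update_alpha above:
#   J(alpha) = E[ -alpha * (log_pi(a|s) + target_entropy) ]
log_alpha = tf.Variable(0.0)
alpha_optimizer = tf.optimizers.Adam(3e-4)
target_entropy = -6.0                        # heuristic: -dim(A), e.g. a 6-D action space
log_pis = tf.random.normal((256, 1))         # dummy log pi(a|s) for a replay batch

with tf.GradientTape() as tape:
    alpha = tf.exp(log_alpha)                # same role as the DeferredTensor in the listing
    alpha_loss = tf.reduce_mean(
        -alpha * tf.stop_gradient(log_pis + target_entropy))
alpha_gradients = tape.gradient(alpha_loss, [log_alpha])
alpha_optimizer.apply_gradients(zip(alpha_gradients, [log_alpha]))

# Polyak averaging, mirroring _update_target above (tau = 5e-3 by default):
#   theta_target <- tau * theta + (1 - tau) * theta_target
tau = 5e-3
online_weights = [tf.Variable(tf.random.normal((8, 8)))]
target_weights = [tf.Variable(tf.identity(w)) for w in online_weights]
for source, target in zip(online_weights, target_weights):
    target.assign(tau * source + (1.0 - tau) * target)

As in the listing, the log-probabilities are wrapped in tf.stop_gradient, so this step only moves log_alpha; the policy itself is updated separately in _update_actor.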
This blog is a collection of the author's personal study notes and is not guaranteed to be entirely original. Some posts include the source address of reposted material, and some are compiled from multiple online sources; there are inevitably places where attribution is missing. If any of this infringes your rights, please contact the author.
Unless otherwise noted, posts are original and released under the CC 4.0 BY-SA license.
Posted on 2024-12-28 12:44 by Angry_Panda