# Copyright 2018 The trfl Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""TensorFlow ops for continuous-action Policy Gradient algorithms."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
# Dependency imports
from six.moves import zip
import tensorflow.compat.v1 as tf
import tree as nest

from trfl import base_ops
from trfl import value_ops

# Extra namedtuples returned (inside `base_ops.LossOutput`) alongside the
# losses, exposing intermediate tensors useful for logging/debugging.
PolicyEntropyExtra = collections.namedtuple("policy_entropy_extra", ["entropy"])
SequenceA2CExtra = collections.namedtuple(
    "sequence_a2c_extra",
    ["entropy", "entropy_loss", "baseline_loss", "policy_gradient_loss",
     "advantages", "discounted_returns"])


def policy_gradient(policies, actions, action_values, policy_vars=None,
                    name="policy_gradient"):
  """Computes policy gradient losses for a batch of trajectories.

  See `policy_gradient_loss` for more information on expected inputs and usage.

  Args:
    policies: A distribution over a batch supporting a `log_prob` method, e.g.
        an instance of `tfp.distributions.Distribution`. For example, for
        a diagonal gaussian policy:
        `policies = tfp.distributions.MultivariateNormalDiag(mus, sigmas)`
    actions: An action batch Tensor used as the argument for `log_prob`. Has
        shape equal to the batch shape of the policies concatenated with the
        event shape of the policies (which may be scalar, in which case
        concatenation leaves shape just equal to batch shape).
    action_values: A Tensor containing estimates of the values of the `actions`.
        Has shape equal to the batch shape of the policies.
    policy_vars: An optional iterable of Tensors used by `policies`. If provided
        is used in scope checks. For the multivariate normal example above this
        would be `[mus, sigmas]`.
    name: Customises the name_scope for this op.

  Returns:
    loss: Tensor with same shape as `actions` containing the total loss for each
        element in the batch. Differentiable w.r.t the variables in `policies`
        only.
  """
  policy_vars = list(policy_vars) if policy_vars else list()
  with tf.name_scope(values=policy_vars + [actions, action_values], name=name):
    # Gradients must flow only through the policy distribution parameters, not
    # through the sampled actions or the (externally estimated) action values.
    actions = tf.stop_gradient(actions)
    action_values = tf.stop_gradient(action_values)
    log_prob_actions = policies.log_prob(actions)
    # Prevent accidental broadcasting if possible at construction time.
    action_values.get_shape().assert_is_compatible_with(
        log_prob_actions.get_shape())
    # Negated so that minimizing this loss maximizes expected action value.
    return -tf.multiply(log_prob_actions, action_values)


def policy_gradient_loss(policies, actions, action_values, policy_vars=None,
                         name="policy_gradient_loss"):
  """Computes policy gradient losses for a batch of trajectories.

  This wraps `policy_gradient` to accept a possibly nested array of `policies`
  and `actions` in order to allow for multiple action distribution types or
  independent multivariate distributions if not directly available. It also sums
  up losses along the time dimension, and is more restrictive about shapes,
  assuming a [T, B] layout for the `batch_shape` of the policies and a
  concatenate(`[T, B]`, `event_shape` of the policies) shape for the actions.

  Args:
    policies: A (possibly nested structure of) distribution(s) supporting
        `batch_shape` and `event_shape` properties along with a `log_prob`
        method (e.g. an instance of `tfp.distributions.Distribution`), with
        `batch_shape` equal to `[T, B]`.
    actions: A (possibly nested structure of) N-D Tensor(s) with shape
        `[T, B, ...]` where the final dimensions are the `event_shape` of the
        corresponding distribution in the nested structure (the shape can be
        just `[T, B]` if the `event_shape` is scalar).
    action_values: Tensor of shape `[T, B]` containing an estimate of the value
        of the selected `actions`.
    policy_vars: An optional (possibly nested structure of) iterable(s) of
        Tensors used by `policies`. If provided is used in scope checks.
    name: Customises the name_scope for this op.

  Returns:
    loss: Tensor of shape `[B]` containing the total loss for each sequence
    in the batch. Differentiable w.r.t `policies` only.
  """
  actions = nest.flatten(actions)
  if policy_vars:
    policy_vars = nest.flatten_up_to(policies, policy_vars)
  else:
    # No scope-check tensors supplied: use an empty list per sub-policy.
    policy_vars = [list()] * len(actions)
  policies = nest.flatten(policies)

  # Check happens after flatten so that we can be more flexible on nest
  # structures. This is equivalent to asserting that `len(policies) ==
  # len(actions)`, which is sufficient for what we're doing here.
  nest.assert_same_structure(policies, actions)
  for policies_, actions_ in zip(policies, actions):
    policies_.batch_shape.assert_has_rank(2)
    actions_.get_shape().assert_is_compatible_with(
        policies_.batch_shape.concatenate(policies_.event_shape))

  scoped_values = policy_vars + actions + [action_values]
  with tf.name_scope(name, values=scoped_values):
    # Loss for the policy gradient. Doesn't push additional gradients through
    # the action_values.
    policy_gradient_loss_sequence = tf.add_n([
        policy_gradient(policies_, actions_, action_values, pvars)
        for policies_, actions_, pvars
        in zip(policies, actions, policy_vars)])

    # Sum over the time dimension: [T, B] -> [B].
    return tf.reduce_sum(
        policy_gradient_loss_sequence, axis=[0],
        name="policy_gradient_loss")


def policy_entropy_loss(policies,
                        policy_vars=None,
                        scale_op=None,
                        name="policy_entropy_loss"):
  """Calculates entropy 'loss' for policies represented by distributions.

  Given a (possibly nested structure of) batch(es) of policies, this
  calculates the total entropy and corrects the sign so that minimizing the
  resulting loss op is equivalent to increasing entropy in the batch.

  This function accepts a nested structure of `policies` in order to allow for
  multiple distribution types or for multiple action dimensions in the case
  where there is no corresponding multivariate form available for a given
  univariate distribution. In this case, the loss is `sum_i(H(p_i, p_i))`
  where `p_i` are members of the `policies` nest. It can be shown that this is
  equivalent to calculating the entropy loss on the Cartesian product space
  over all the action dimensions, if the sampled actions are independent.

  The entropy loss is optionally scaled by some function of the policies.
  E.g. for Categorical distributions there exists such a scaling which maps
  the entropy loss into the range `[-1, 0]` in order to make it invariant to
  the size of the action space - specifically one can divide the loss by
  `sum_i(log(A_i))` where `A_i` is the number of categories in the i'th
  Categorical distribution in the `policies` nest).

  Args:
    policies: A (possibly nested structure of) batch distribution(s)
        supporting an `entropy` method that returns an N-D Tensor with shape
        equal to the `batch_shape` of the distribution, e.g. an instance of
        `tfp.distributions.Distribution`.
    policy_vars: An optional (possibly nested structure of) iterable(s) of
        Tensors used by `policies`. If provided is used in scope checks.
    scale_op: An optional op that takes `policies` as its only argument and
        returns a scalar Tensor that is used to scale the entropy loss.
        E.g. for Diag(sigma) Gaussian policies dividing by the number of
        dimensions makes entropy loss invariant to the action space dimension.
    name: Optional, name of this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape `[B1, B2, ...]`.
    * `extra`: a namedtuple with fields:
        * `entropy`: entropy of the policy, shape `[B1, B2, ...]`.
    where [B1, B2, ... ] == policy.batch_shape
  """
  flat_policy_vars = nest.flatten(policy_vars) if policy_vars else list()
  with tf.name_scope(name, values=flat_policy_vars):
    # We want a value that we can minimize along with other losses, and where
    # minimizing means driving the policy towards a uniform distribution over
    # the actions. We thus scale it by negative one so that it can be simply
    # added to other losses.
    scale = tf.constant(-1.0, dtype=tf.float32)
    if scale_op:
      # Note: scale_op is applied to the original (possibly nested) structure,
      # before flattening below.
      scale *= scale_op(policies)

    policies = nest.flatten(policies)
    entropy = tf.add_n(
        [policy.entropy() for policy in policies], name="entropy")
    loss = tf.multiply(scale, entropy, name="entropy_loss")
    return base_ops.LossOutput(loss, PolicyEntropyExtra(entropy))


def sequence_a2c_loss(policies,
                      baseline_values,
                      actions,
                      rewards,
                      pcontinues,
                      bootstrap_value,
                      policy_vars=None,
                      lambda_=1,
                      entropy_cost=None,
                      baseline_cost=1,
                      entropy_scale_op=None,
                      name="SequenceA2CLoss"):
  """Constructs a TensorFlow graph computing the A2C/GAE loss for sequences.

  This loss jointly learns the policy and the baseline. Therefore, gradients
  for this loss flow through each tensor in `policies` and through each tensor
  in `baseline_values`, but no other input tensors. The policy is learnt with
  the advantage actor-critic loss, plus an optional entropy term. The baseline
  is regressed towards the n-step bootstrapped returns given by the
  reward/pcontinue sequence. The `baseline_cost` parameter scales the
  gradients w.r.t the baseline relative to the policy gradient, i.e.
  `d(loss) / d(baseline) = baseline_cost * (n_step_return - baseline)`.

  This function is designed for batches of sequences of data. Tensors are
  assumed to be time major (i.e. the outermost dimension is time, the second
  outermost dimension is the batch dimension). We denote the sequence length in
  the shapes of the arguments with the variable `T`, the batch size with the
  variable `B`, neither of which needs to be known at construction time. Index
  `0` of the time dimension is assumed to be the start of the sequence.

  `rewards` and `pcontinues` are the sequences of data taken directly from the
  environment, possibly modulated by a discount. `baseline_values` are the
  sequences of (typically learnt) estimates of the values of the states
  visited along a batch of trajectories as observed by the agent given the
  sequences of one or more actions sampled from `policies`.

  The sequences in the tensors should be aligned such that an agent in a state
  with value `V` that takes an action `a` transitions into another state
  with value `V'`, receiving reward `r` and pcontinue `p`. Then `V`, `a`, `r`
  and `p` are all at the same index `i` in the corresponding tensors. `V'` is
  at index `i+1`, or in the `bootstrap_value` tensor if `i == T`.

  For n-dimensional action vectors, a multivariate distribution must be used
  for `policies`. In case there is no multivariate version for the desired
  univariate distribution, or in case the `actions` object is a nested
  structure (e.g. for multiple action types), this function also accepts a
  nested structure  of `policies`. In this case, the loss is given by
  `sum_i(loss(p_i, a_i))` where `p_i` are members of the `policies` nest, and
  `a_i` are members of the `actions` nest. We assume that a single baseline is
  used across all action dimensions for each timestep.

  Args:
    policies: A (possibly nested structure of) distribution(s) supporting
        `batch_shape` and `event_shape` properties & `log_prob` and `entropy`
        methods (e.g. an instance of `tfp.distributions.Distribution`),
        with `batch_shape` equal to `[T, B]`. E.g. for a (non-nested) diagonal
        multivariate gaussian with dimension `A` this would be:
        `policies = tfp.distributions.MultivariateNormalDiag(mus, sigmas)`
        where `mus` and `sigmas` have shape `[T, B, A]`.
    baseline_values: 2-D Tensor containing an estimate of the state value with
        shape `[T, B]`.
    actions: A (possibly nested structure of) N-D Tensor(s) with shape
        `[T, B, ...]` where the final dimensions are the `event_shape` of the
        corresponding distribution in the nested structure (the shape can be
        just `[T, B]` if the `event_shape` is scalar).
    rewards: 2-D Tensor with shape `[T, B]`.
    pcontinues: 2-D Tensor with shape `[T, B]`.
    bootstrap_value: 1-D Tensor with shape `[B]`.
    policy_vars: An optional (possibly nested structure of) iterables of
        Tensors used by `policies`. If provided is used in scope checks. For
        the multivariate normal example above this would be `[mus, sigmas]`.
    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]` for
        Generalised Advantage Estimation as per
        https://arxiv.org/abs/1506.02438.
    entropy_cost: optional scalar cost that pushes the policy to have high
        entropy, larger values cause higher entropies.
    baseline_cost: scalar cost that scales the derivatives of the baseline
        relative to the policy gradient.
    entropy_scale_op: An optional op that takes `policies` as its only
        argument and returns a scalar Tensor that is used to scale the entropy
        loss. E.g. for Diag(sigma) Gaussian policies dividing by the number of
        dimensions makes entropy loss invariant to the action space dimension.
        See `policy_entropy_loss` for more info.
    name: Customises the name_scope for this op.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the total loss, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `entropy`: total loss per sequence, shape `[B]`.
        * `entropy_loss`: scaled entropy loss per sequence, shape `[B]`.
        * `baseline_loss`: scaled baseline loss per sequence, shape `[B]`.
        * `policy_gradient_loss`: policy gradient loss per sequence,
            shape `[B]`.
        * `advantages`: advantage estimates per timestep, shape `[T, B]`.
        * `discounted_returns`: discounted returns per timestep,
            shape `[T, B]`.
  """
  flat_policy_vars = nest.flatten(policy_vars) if policy_vars else list()
  scoped_values = (flat_policy_vars + nest.flatten(actions) +
                   [baseline_values, rewards, pcontinues, bootstrap_value])
  with tf.name_scope(name, values=scoped_values):
    # Loss for the baseline, summed over the time dimension.
    baseline_loss_td, td_lambda = value_ops.td_lambda(
        baseline_values, rewards, pcontinues, bootstrap_value, lambda_)

    # The TD error provides an estimate of the advantages of the actions.
    advantages = td_lambda.temporal_differences
    baseline_loss = tf.multiply(
        tf.convert_to_tensor(baseline_cost, dtype=tf.float32),
        baseline_loss_td,
        name="baseline_loss")

    # Loss for the policy. Doesn't push additional gradients through
    # the advantages.
    pg_loss = policy_gradient_loss(
        policies, actions, advantages, policy_vars,
        name="policy_gradient_loss")

    total_loss = tf.add(pg_loss, baseline_loss, name="total_loss")

    if entropy_cost is not None:
      # Per-timestep entropy is summed over time to give a per-sequence value.
      loss, extra = policy_entropy_loss(policies, policy_vars,
                                        entropy_scale_op)
      entropy = tf.reduce_sum(extra.entropy, axis=0, name="entropy")  # [B].
      entropy_loss = tf.multiply(
          tf.convert_to_tensor(entropy_cost, dtype=tf.float32),
          tf.reduce_sum(loss, axis=0),
          name="scaled_entropy_loss")  # [B].
      total_loss = tf.add(total_loss, entropy_loss,
                          name="total_loss_with_entropy")
    else:
      entropy = None
      entropy_loss = None

    extra = SequenceA2CExtra(
        entropy=entropy,
        entropy_loss=entropy_loss,
        baseline_loss=baseline_loss,
        policy_gradient_loss=pg_loss,
        advantages=advantages,
        discounted_returns=td_lambda.discounted_returns)
    return base_ops.LossOutput(total_loss, extra)