import tensorflow as tf

from rltf.models import DDQN
from rltf.tf_utils import BLR, tf_utils


class BDQN(DDQN):
  """Bayesian Double DQN"""

  def __init__(self, sigma_e, tau, mode="mean", **kwargs):
    """
    Args:
      obs_shape: list. Shape of the observation tensor
      n_actions: int. Number of possible actions
      opt_conf: rltf.optimizers.OptimizerConf. Configuration for the optimizer
      gamma: float. Discount factor
      sigma_e: float. Standard deviation of the observation noise for BLR
      tau: float. Standard deviation of the weight prior in BLR
      huber_loss: bool. Whether to use Huber loss or not
    """
    super().__init__(**kwargs)

    self.agent_blr  = [BLR(tau=tau, sigma_e=sigma_e, mode=mode)   for _ in range(self.n_actions)]
    self.target_blr = [BLR(tau=tau, sigma_e=sigma_e, mode="mean") for _ in range(self.n_actions)]

    # Custom TF Tensors and Ops
    self._target   = None   # BLR target
    self._phi      = None   # BLR features
    self.train_blr = None   # Op for updating the BLR weight posterior
    self.reset_blr = None   # Op for resetting the BLR to initial weights
    self.a_var     = None   # Tensor with the BLR variance

  def build(self):
    super().build()
    self.reset_blr = tf.group(*[blr.reset_op for blr in self.agent_blr], name="reset_blr")

  def _conv_nn(self, x):
    """Build the DQN architecture - as described in the original paper

    Args:
      x: tf.Tensor. Tensor for the input
    Returns:
      `tf.Tensor` of shape `[batch_size, n_actions]`. Contains the Q-function for each action
    """
    with tf.variable_scope("conv_net"):
      # Original architecture
      x = tf.layers.conv2d(x, filters=32, kernel_size=8, strides=4, padding="SAME", activation=tf.nn.relu)
      x = tf.layers.conv2d(x, filters=64, kernel_size=4, strides=2, padding="SAME", activation=tf.nn.relu)
      x = tf.layers.conv2d(x, filters=64, kernel_size=3, strides=1, padding="SAME", activation=tf.nn.relu)
    x = tf.layers.flatten(x)
    with tf.variable_scope("action_value"):
      x = tf.layers.dense(x, units=512, activation=tf.nn.relu)

      # Normalize features
      # if self.phi_norm:
      #   x = tf.layers.batch_normalization(x, axis=-1, training=tf.not_equal(tf.shape(x)[0], 1))

      blrs = self.agent_blr if "agent_net" in tf.get_variable_scope().name else self.target_blr

      # Compute the mean and std prediction from BLR
      blr_out = [blr.apply(x) for blr in blrs]

      # Remember phi and the variances
      if "agent_net" in tf.get_variable_scope().name and self._phi is None:
        self._phi  = x
        self.a_var = tf.concat([var for (_, var) in blr_out], axis=-1)

      # Group the mean predictions
      x = [mean for (mean, _) in blr_out]
      x = tf.concat(x, axis=-1)
    return x

  def _build_train_blr_op(self, phi, target, name):
    """Build the Bayesian Linear Regression ops and estimates

    Args:
      phi: tf.Tensor, shape `[None, dim_phi]`. The feature tensor
      target: tf.Tensor, shape `[None]`. As returned by `self._compute_target()`
    Returns:
      tf.Op: The train Op for BLR
    """
    target = tf.expand_dims(target, axis=-1)

    def train_blr(blr, a):
      """Given a BLR instance, select only the examples for the corresponding action"""
      mask = tf.expand_dims(tf.equal(self.act_t_ph, a), axis=-1)
      mask = tf.cast(mask, tf.float32)    # out shape: [None, 1]
      X = phi * mask                      # out shape: [None, dim_phi]
      y = target * mask                   # out shape: [None, 1]
      return blr.train(X, y)

    w_updates = [train_blr(blr, i) for i, blr in enumerate(self.agent_blr)]
    return tf.group(*w_updates, name=name)

  def _compute_target(self, target_net):
    target = super()._compute_target(target_net)
    self._target = target
    return target

  def _build_train_op(self, optimizer, loss, agent_vars, name):
    self.train_blr = self._build_train_blr_op(self._phi, self._target, name="train_blr")
    return super()._build_train_op(optimizer, loss, agent_vars, name)
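
# Illustrative sketch, not part of rltf: the closed-form posterior update that
# a Bayesian Linear Regression such as `BLR` above is typically built on,
# shown only to make the roles of `tau` (prior std) and `sigma_e`
# (observation-noise std) concrete. The actual `BLR` class lives in
# `rltf.tf_utils` and may differ in detail; `_blr_posterior_sketch` is a
# hypothetical helper, not used by the model.
def _blr_posterior_sketch(X, y, tau, sigma_e):
  """Posterior p(w | X, y) = N(mean, cov) for y = X w + eps, with prior
  w ~ N(0, tau^2 I) and observation noise eps ~ N(0, sigma_e^2 I).

  X: np.ndarray `[n, d]`, y: np.ndarray `[n, 1]`.
  """
  import numpy as np  # local import: keeps the sketch out of module deps
  d = X.shape[1]
  prec = X.T @ X / sigma_e**2 + np.eye(d) / tau**2  # posterior precision
  cov  = np.linalg.inv(prec)                        # posterior covariance
  mean = cov @ (X.T @ y) / sigma_e**2               # posterior mean
  return mean, cov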


class BDQN_TS(BDQN):
  """Bayesian Double DQN with Thompson Sampling exploration policy"""

  def __init__(self, **kwargs):
    super().__init__(mode="ts", **kwargs)

    # Custom TF Tensors and Ops
    self.reset_ts = None  # Op that resamples the parameters for TS

  def build(self):
    super().build()
    agent_w  = [blr.w for blr in self.agent_blr]
    target_w = [blr.resample_w() for blr in self.target_blr]
    self.reset_ts = tf_utils.assign_vars(agent_w, target_w, name="reset_ts")

  def reset(self, sess):
    sess.run(self.reset_ts)


class BDQN_UCB(BDQN):
  """Bayesian Double DQN with UCB exploration policy"""

  def __init__(self, n_stds, **kwargs):
    super().__init__(mode="mean", **kwargs)
    self.n_stds = n_stds  # Scale constant for computing uncertainty

  def _act_train(self, agent_net, name):
    mean = agent_net
    std  = tf.sqrt(self.a_var)
    action = tf.argmax(mean + self.n_stds * std, axis=-1, output_type=tf.int32, name=name)

    # Add debug histograms
    tf.summary.histogram("debug/a_std",  std)
    tf.summary.histogram("debug/a_mean", mean)

    return dict(action=action)
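
# Illustrative sketch, not part of rltf: the UCB rule from
# `BDQN_UCB._act_train` above, replicated in NumPy for a single state.
# `mean` and `var` stand for the per-action BLR predictions; this hypothetical
# helper exists only to show the arithmetic.
def _ucb_action_sketch(mean, var, n_stds):
  """mean, var: np.ndarray of shape [n_actions]. Returns the UCB action."""
  import numpy as np
  return int(np.argmax(mean + n_stds * np.sqrt(var)))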


class BDQN_IDS(BDQN):
  """Bayesian Double DQN with IDS exploration policy"""

  def __init__(self, n_stds, **kwargs):
    super().__init__(mode="mean", **kwargs)
    self.n_stds = n_stds  # Scale constant for computing uncertainty
    self.rho = 1.0

  def _act_train(self, agent_net, name):
    mean = agent_net
    var  = self.a_var
    std  = tf.sqrt(var)

    regret    = tf.reduce_max(mean + self.n_stds * std, axis=-1, keepdims=True)
    regret    = regret - (mean - self.n_stds * std)
    regret_sq = tf.square(regret)
    info_gain = tf.log(1 + var / self.rho**2) + 1e-5
    ids_score = tf.div(regret_sq, info_gain)
    ids_score = tf.check_numerics(ids_score, "IDS score is NaN or Inf")
    action    = tf.argmin(ids_score, axis=-1, output_type=tf.int32, name=name)

    # Add debug histograms
    tf.summary.histogram("debug/a_mean",   mean)
    tf.summary.histogram("debug/a_std",    std)
    tf.summary.histogram("debug/a_regret", regret)
    tf.summary.histogram("debug/a_info",   info_gain)
    tf.summary.histogram("debug/a_ids",    ids_score)

    # Set the plottable tensors for video. Use only the first action in the batch
    p_a    = tf.identity(action[0],    name="plot/train/a")
    p_mean = tf.identity(mean[0],      name="plot/train/mean")
    p_std  = tf.identity(std[0],       name="plot/train/std")
    p_ids  = tf.identity(ids_score[0], name="plot/train/ids")

    train_actions = {
      "a_mean": dict(height=p_mean, a=p_a),
      "a_std":  dict(height=p_std,  a=p_a),
      "a_ids":  dict(height=p_ids,  a=p_a),
    }
    self.plot_conf.set_train_spec(dict(train_actions=train_actions))

    return dict(action=action)

  def _act_eval(self, agent_net, name):
    action = super()._act_eval(agent_net, name)

    # Set the plottable tensors for eval. Use only the first action in the batch
    p_a    = tf.identity(action["action"][0], name="plot/eval/a")
    p_mean = tf.identity(agent_net[0],        name="plot/eval/mean")

    # Set the plottable tensors for episode recordings
    eval_actions = {
      "a_mean": dict(height=p_mean, a=p_a),
    }
    self.plot_conf.set_eval_spec(dict(eval_actions=eval_actions))

    return action
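
# Illustrative sketch, not part of rltf: the IDS rule from
# `BDQN_IDS._act_train` above, replicated in NumPy for a single state. It
# picks the action minimizing squared estimated regret over information gain;
# `_ids_action_sketch` is a hypothetical helper, not used by the model.
def _ids_action_sketch(mean, var, n_stds, rho=1.0):
  """mean, var: np.ndarray of shape [n_actions]. Returns the IDS action."""
  import numpy as np
  std = np.sqrt(var)
  regret = np.max(mean + n_stds * std) - (mean - n_stds * std)
  info_gain = np.log(1.0 + var / rho**2) + 1e-5
  return int(np.argmin(regret**2 / info_gain))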