# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Importance Weighted Actor-Learner Architecture goal-driven navigation agent.

Note that this is a modification of code previously published by Lasse Espeholt
under an Apache license at:
https://github.com/deepmind/scalable_agent
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import functools
from six.moves import range
from six.moves import zip
import sonnet as snt
import tensorflow as tf

import streetlearn.python.agents.locale_pathway as locale_pathway
from tensorflow.contrib import framework as contrib_framework

nest = contrib_framework.nest

AgentOutput = collections.namedtuple(
    "AgentOutput", "action policy_logits baseline heading xy target_xy")


class GoalNavAgent(snt.RNNCore):
  """Agent with a simple convnet and LSTM core."""

  def __init__(self,
               num_actions,
               observation_names,
               goal_type='target_latlng',
               heading_stop_gradient=False,
               heading_num_hiddens=256,
               heading_num_bins=16,
               xy_stop_gradient=True,
               xy_num_hiddens=256,
               xy_num_bins_lat=32,
               xy_num_bins_lng=32,
               target_xy_stop_gradient=True,
               dropout=0.5,
               lstm_num_hiddens=256,
               feed_action_and_reward=True,
               max_reward=1.0,
               name="streetlearn_core"):
    """Initializes an agent core designed to be used with A3C/IMPALA.

    Supports a single visual observation tensor and a goal instruction tensor,
    and outputs a single, scalar discrete action with policy logits and a
    baseline value, as well as the agent heading prediction.

    Args:
      num_actions: Number of actions available.
      observation_names: String with observation names separated by semicolons.
      goal_type: String with the name of the target observation field; can be
        `target_latlng` or `target_landmarks`.
      heading_stop_gradient: Boolean for stopping the gradient between the LSTM
        core and the heading prediction MLP.
      heading_num_hiddens: Number of hiddens in the heading prediction MLP.
      heading_num_bins: Number of outputs in the heading prediction MLP.
      xy_stop_gradient: Boolean for stopping the gradient between the LSTM core
        and the XY position prediction MLP.
      xy_num_hiddens: Number of hiddens in the XY position prediction MLP.
      xy_num_bins_lat: Number of lat outputs in the XY position prediction MLP.
      xy_num_bins_lng: Number of lng outputs in the XY position prediction MLP.
      target_xy_stop_gradient: Boolean for stopping the gradient between the
        LSTM core and the target XY position prediction MLP.
      dropout: Dropout probability after the locale pathway.
      lstm_num_hiddens: Number of hiddens in the LSTM core.
      feed_action_and_reward: If True, the last action (one-hot) and last
        reward (scalar) will be concatenated to the torso output.
      max_reward: If `feed_action_and_reward` is True, the last reward will be
        clipped to `[-max_reward, max_reward]`. If `max_reward` is None, no
        clipping will be applied. N.B., this is different from reward clipping
        during gradient descent, or reward clipping by the environment.
      name: Optional name for the module.
""" super(GoalNavAgent, self).__init__(name='agent') # Policy config self._num_actions = num_actions tf.logging.info('Agent trained on %d-action policy', self._num_actions) # Append last reward (clipped) and last action? self._feed_action_and_reward = feed_action_and_reward self._max_reward = max_reward # Policy LSTM core config self._lstm_num_hiddens = lstm_num_hiddens # Extract the observation names observation_names = observation_names.split(';') self._idx_frame = observation_names.index('view_image') tf.logging.info('Looking for goal of type %s', goal_type) self._idx_goal = observation_names.index(goal_type) with self._enter_variable_scope(): # Convnet self._convnet = snt.nets.ConvNet2D( output_channels=(16, 32), kernel_shapes=(8, 4), strides=(4, 2), paddings=[snt.VALID], activation=tf.nn.relu, activate_final=True) # Recurrent LSTM core of the agent. tf.logging.info('Locale pathway LSTM core with %d hiddens', self._lstm_num_hiddens) self._locale_pathway = locale_pathway.LocalePathway( heading_stop_gradient, heading_num_hiddens, heading_num_bins, xy_stop_gradient, xy_num_hiddens, xy_num_bins_lat, xy_num_bins_lng, target_xy_stop_gradient, lstm_num_hiddens, dropout) def initial_state(self, batch_size): """Return initial state with zeros, for a given batch size and data type.""" tf.logging.info("Initial state consists of the LSTM core initial state.") return self._locale_pathway.initial_state(batch_size) def _torso(self, input_): """Processing of all the visual and language inputs to the LSTM core.""" # Extract the inputs last_action, env_output = input_ last_reward, _, _, observation = env_output frame = observation[self._idx_frame] goal = observation[self._idx_goal] goal = tf.to_float(goal) # Convert to image to floats and normalise. frame = tf.to_float(frame) frame = snt.FlattenTrailingDimensions(dim_from=3)(frame) frame /= 255.0 # Feed image through convnet. with tf.variable_scope('convnet'): # Convolutional layers. conv_out = self._convnet(frame) # Fully connected layer. conv_out = snt.BatchFlatten()(conv_out) conv_out = snt.Linear(256)(conv_out) conv_out = tf.nn.relu(conv_out) # Concatenate outputs of the visual and instruction pathways. if self._feed_action_and_reward: # Append clipped last reward and one hot last action. tf.logging.info('Append last reward clipped to: %f', self._max_reward) clipped_last_reward = tf.expand_dims( tf.clip_by_value(last_reward, -self._max_reward, self._max_reward), -1) tf.logging.info('Append last action (one-hot of %d)', self._num_actions) one_hot_last_action = tf.one_hot(last_action, self._num_actions) tf.logging.info('Append goal:') tf.logging.info(goal) action_and_reward = tf.concat([clipped_last_reward, one_hot_last_action], axis=1) else: action_and_reward = tf.constant([0], dtype=tf.float32) return conv_out, action_and_reward, goal def _core(self, core_input, core_state): """Assemble the recurrent core network components.""" (conv_output, action_reward, goal) = core_input locale_input = tf.concat([conv_output, action_reward], axis=1) core_output, core_state = self._locale_pathway((locale_input, goal), core_state) return core_output, core_state def _head(self, policy_input, heading, xy, target_xy): """Build the head of the agent: linear policy and value function, and pass the auxiliary outputs through. """ # Linear policy and value function. policy_logits = snt.Linear( self._num_actions, name='policy_logits')(policy_input) baseline = tf.squeeze(snt.Linear(1, name='baseline')(policy_input), axis=-1) # Sample an action from the policy. 
    new_action = tf.multinomial(
        policy_logits, num_samples=1, output_dtype=tf.int32)
    new_action = tf.squeeze(new_action, 1, name='new_action')

    return AgentOutput(
        new_action, policy_logits, baseline, heading, xy, target_xy)

  def _build(self, input_, core_state):
    """Assembles the network components."""
    action, env_output = input_
    actions, env_outputs = nest.map_structure(lambda t: tf.expand_dims(t, 0),
                                              (action, env_output))
    outputs, core_state = self.unroll(actions, env_outputs, core_state)
    return nest.map_structure(lambda t: tf.squeeze(t, 0), outputs), core_state

  @snt.reuse_variables
  def unroll(self, actions, env_outputs, core_state):
    """Manual implementation of the network unroll."""
    _, _, done, _ = env_outputs

    torso_outputs = snt.BatchApply(self._torso)((actions, env_outputs))
    tf.logging.info(torso_outputs)
    conv_outputs, actions_and_rewards, goals = torso_outputs

    # Note, in this implementation we can't use CuDNN RNN to speed things up
    # due to the state reset. This can be XLA-compiled (LSTMBlockCell needs to
    # be changed to implement snt.LSTMCell).
    initial_core_state = self.initial_state(tf.shape(actions)[1])
    policy_input_list = []
    heading_output_list = []
    xy_output_list = []
    target_xy_output_list = []
    for torso_output_, action_and_reward_, goal_, done_ in zip(
        tf.unstack(conv_outputs), tf.unstack(actions_and_rewards),
        tf.unstack(goals), tf.unstack(done)):
      # If the episode ended, the core state should be reset before the next
      # step.
      core_state = nest.map_structure(
          functools.partial(tf.where, done_), initial_core_state, core_state)
      core_output, core_state = self._core(
          (torso_output_, action_and_reward_, goal_), core_state)
      policy_input_list.append(core_output[0])
      heading_output_list.append(core_output[1])
      xy_output_list.append(core_output[2])
      target_xy_output_list.append(core_output[3])
    head_output = snt.BatchApply(self._head)(tf.stack(policy_input_list),
                                             tf.stack(heading_output_list),
                                             tf.stack(xy_output_list),
                                             tf.stack(target_xy_output_list))

    return head_output, core_state
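
# A minimal usage sketch (not part of the original module), assuming an
# IMPALA-style environment output tuple that is read here as
# (last_reward, _, done, observation), an `observation` list ordered like
# `observation_names`, and illustrative shapes (84x84x3 RGB frames, 2-D
# lat/lng goals). `num_actions=5`, `unroll_length` and `batch_size` are
# placeholders, not values fixed by this module:
#
#   agent = GoalNavAgent(
#       num_actions=5, observation_names='view_image;target_latlng')
#   initial_state = agent.initial_state(batch_size)
#   actions = tf.zeros([unroll_length, batch_size], dtype=tf.int32)
#   rewards = tf.zeros([unroll_length, batch_size], dtype=tf.float32)
#   infos = tf.zeros([unroll_length, batch_size], dtype=tf.float32)
#   dones = tf.zeros([unroll_length, batch_size], dtype=tf.bool)
#   frames = tf.zeros([unroll_length, batch_size, 84, 84, 3], dtype=tf.uint8)
#   goals = tf.zeros([unroll_length, batch_size, 2], dtype=tf.float32)
#   env_outputs = (rewards, infos, dones, [frames, goals])
#   agent_outputs, final_state = agent.unroll(actions, env_outputs,
#                                             initial_state)
#
# `agent_outputs` is an `AgentOutput` namedtuple with time-major tensors for
# the sampled action, policy logits, baseline, and the heading / XY /
# target-XY auxiliary predictions.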