# Copyright 2019 The PlaNet Authors. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """In-graph simulation step of a vectorized algorithm with environments.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import functools import tensorflow as tf from planet import tools from planet.control import batch_env from planet.control import in_graph_batch_env from planet.control import mpc_agent from planet.control import wrappers from planet.tools import streaming_mean def simulate( step, env_ctor, duration, num_agents, agent_config, isolate_envs='none', expensive_summaries=False, gif_summary=True, name='simulate'): summaries = [] with tf.variable_scope(name): return_, image, action, reward, cleanup = collect_rollouts( step=step, env_ctor=env_ctor, duration=duration, num_agents=num_agents, agent_config=agent_config, isolate_envs=isolate_envs) return_mean = tf.reduce_mean(return_) summaries.append(tf.summary.scalar('return', return_mean)) if expensive_summaries: summaries.append(tf.summary.histogram('return_hist', return_)) summaries.append(tf.summary.histogram('reward_hist', reward)) summaries.append(tf.summary.histogram('action_hist', action)) summaries.append(tools.image_strip_summary( 'image', image, max_length=duration)) if gif_summary: summaries.append(tools.gif_summary( 'animation', image, max_outputs=1, fps=20)) summary = tf.summary.merge(summaries) return summary, return_mean, cleanup def collect_rollouts( step, env_ctor, duration, num_agents, agent_config, isolate_envs): batch_env = define_batch_env(env_ctor, num_agents, isolate_envs) agent = mpc_agent.MPCAgent(batch_env, step, False, False, agent_config) cleanup = lambda: batch_env.close() def simulate_fn(unused_last, step): done, score, unused_summary = simulate_step( batch_env, agent, log=False, reset=tf.equal(step, 0)) with tf.control_dependencies([done, score]): image = batch_env.observ batch_action = batch_env.action batch_reward = batch_env.reward return done, score, image, batch_action, batch_reward initializer = ( tf.zeros([num_agents], tf.bool), tf.zeros([num_agents], tf.float32), 0 * batch_env.observ, 0 * batch_env.action, tf.zeros([num_agents], tf.float32)) done, score, image, action, reward = tf.scan( simulate_fn, tf.range(duration), initializer, parallel_iterations=1) score = tf.boolean_mask(score, done) image = tf.transpose(image, [1, 0, 2, 3, 4]) action = tf.transpose(action, [1, 0, 2]) reward = tf.transpose(reward) return score, image, action, reward, cleanup def define_batch_env(env_ctor, num_agents, isolate_envs): with tf.variable_scope('environments'): if isolate_envs == 'none': factory = lambda ctor: ctor() blocking = True elif isolate_envs == 'thread': factory = functools.partial(wrappers.Async, strategy='thread') blocking = False elif isolate_envs == 'process': factory = functools.partial(wrappers.Async, strategy='process') blocking = False else: raise NotImplementedError(isolate_envs) envs = [factory(env_ctor) for _ in range(num_agents)] env = batch_env.BatchEnv(envs, blocking) env = in_graph_batch_env.InGraphBatchEnv(env) return env def simulate_step(batch_env, algo, log=True, reset=False): """Simulation step of a vectorized algorithm with in-graph environments. Integrates the operations implemented by the algorithm and the environments into a combined operation. Args: batch_env: In-graph batch environment. algo: Algorithm instance implementing required operations. log: Tensor indicating whether to compute and return summaries. reset: Tensor causing all environments to reset. Returns: Tuple of tensors containing done flags for the current episodes, possibly intermediate scores for the episodes, and a summary tensor. """ def _define_begin_episode(agent_indices): """Reset environments, intermediate scores and durations for new episodes. Args: agent_indices: Tensor containing batch indices starting an episode. Returns: Summary tensor, new score tensor, and new length tensor. """ assert agent_indices.shape.ndims == 1 zero_scores = tf.zeros_like(agent_indices, tf.float32) zero_durations = tf.zeros_like(agent_indices) update_score = tf.scatter_update(score_var, agent_indices, zero_scores) update_length = tf.scatter_update( length_var, agent_indices, zero_durations) reset_ops = [ batch_env.reset(agent_indices), update_score, update_length] with tf.control_dependencies(reset_ops): return algo.begin_episode(agent_indices), update_score, update_length def _define_step(): """Request actions from the algorithm and apply them to the environments. Increments the lengths of all episodes and increases their scores by the current reward. After stepping the environments, provides the full transition tuple to the algorithm. Returns: Summary tensor, new score tensor, and new length tensor. """ prevob = batch_env.observ + 0 # Ensure a copy of the variable value. agent_indices = tf.range(len(batch_env)) action, step_summary = algo.perform(agent_indices, prevob) action.set_shape(batch_env.action.shape) with tf.control_dependencies([batch_env.step(action)]): add_score = score_var.assign_add(batch_env.reward) inc_length = length_var.assign_add(tf.ones(len(batch_env), tf.int32)) with tf.control_dependencies([add_score, inc_length]): agent_indices = tf.range(len(batch_env)) experience_summary = algo.experience( agent_indices, prevob, batch_env.action, batch_env.reward, batch_env.done, batch_env.observ) summary = tf.summary.merge([step_summary, experience_summary]) return summary, add_score, inc_length def _define_end_episode(agent_indices): """Notify the algorithm of ending episodes. Also updates the mean score and length counters used for summaries. Args: agent_indices: Tensor holding batch indices that end their episodes. Returns: Summary tensor. """ assert agent_indices.shape.ndims == 1 submit_score = mean_score.submit(tf.gather(score, agent_indices)) submit_length = mean_length.submit( tf.cast(tf.gather(length, agent_indices), tf.float32)) with tf.control_dependencies([submit_score, submit_length]): return algo.end_episode(agent_indices) def _define_summaries(): """Reset the average score and duration, and return them as summary. Returns: Summary string. """ score_summary = tf.cond( tf.logical_and(log, tf.cast(mean_score.count, tf.bool)), lambda: tf.summary.scalar('mean_score', mean_score.clear()), str) length_summary = tf.cond( tf.logical_and(log, tf.cast(mean_length.count, tf.bool)), lambda: tf.summary.scalar('mean_length', mean_length.clear()), str) return tf.summary.merge([score_summary, length_summary]) with tf.name_scope('simulate'): log = tf.convert_to_tensor(log) reset = tf.convert_to_tensor(reset) with tf.variable_scope('simulate_temporary'): score_var = tf.get_variable( 'score', (len(batch_env),), tf.float32, tf.constant_initializer(0), trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES]) length_var = tf.get_variable( 'length', (len(batch_env),), tf.int32, tf.constant_initializer(0), trainable=False, collections=[tf.GraphKeys.LOCAL_VARIABLES]) mean_score = streaming_mean.StreamingMean((), tf.float32, 'mean_score') mean_length = streaming_mean.StreamingMean((), tf.float32, 'mean_length') agent_indices = tf.cond( reset, lambda: tf.range(len(batch_env)), lambda: tf.cast(tf.where(batch_env.done)[:, 0], tf.int32)) begin_episode, score, length = tf.cond( tf.cast(tf.shape(agent_indices)[0], tf.bool), lambda: _define_begin_episode(agent_indices), lambda: (str(), score_var, length_var)) with tf.control_dependencies([begin_episode]): step, score, length = _define_step() with tf.control_dependencies([step]): agent_indices = tf.cast(tf.where(batch_env.done)[:, 0], tf.int32) end_episode = tf.cond( tf.cast(tf.shape(agent_indices)[0], tf.bool), lambda: _define_end_episode(agent_indices), str) with tf.control_dependencies([end_episode]): summary = tf.summary.merge([ _define_summaries(), begin_episode, step, end_episode]) with tf.control_dependencies([summary]): score = 0.0 + score done = batch_env.done return done, score, summary