# coding=utf-8
# Copyright 2020 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""PPO learner."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math
import os

from tensor2tensor.layers import common_layers
from tensor2tensor.models.research.rl import get_policy
from tensor2tensor.rl import ppo
from tensor2tensor.rl.envs.tf_atari_wrappers import StackWrapper
from tensor2tensor.rl.envs.tf_atari_wrappers import WrapperBase
from tensor2tensor.rl.policy_learner import PolicyLearner
from tensor2tensor.rl.restarter import Restarter
from tensor2tensor.utils import trainer_lib

import tensorflow.compat.v1 as tf
import tensorflow_probability as tfp


class PPOLearner(PolicyLearner):
  """PPO for policy learning."""

  def __init__(self, frame_stack_size, base_event_dir, agent_model_dir,
               total_num_epochs, **kwargs):
    super(PPOLearner, self).__init__(
        frame_stack_size, base_event_dir, agent_model_dir, total_num_epochs)
    self._num_completed_iterations = 0
    self._lr_decay_start = None
    self._distributional_size = kwargs.get("distributional_size", 1)
    self._distributional_subscale = kwargs.get("distributional_subscale", 0.04)
    self._distributional_threshold = kwargs.get(
        "distributional_threshold", 0.0)

  def train(self,
            env_fn,
            hparams,
            simulated,
            save_continuously,
            epoch,
            sampling_temp=1.0,
            num_env_steps=None,
            env_step_multiplier=1,
            eval_env_fn=None,
            report_fn=None,
            model_save_fn=None):
    assert sampling_temp == 1.0 or hparams.learning_rate == 0.0, \
        "Sampling with non-1 temperature does not make sense during training."

    if not save_continuously:
      # We do not save the model, as that resets frames that we need at
      # restarts. But we need to save at the last step, so we set it very high.
      hparams.save_models_every_epochs = 1000000

    if simulated:
      simulated_str = "sim"
    else:
      simulated_str = "real"
    name_scope = "ppo_{}{}".format(simulated_str, epoch + 1)
    event_dir = os.path.join(self.base_event_dir, "ppo_summaries",
                             str(epoch) + simulated_str)

    with tf.Graph().as_default():
      with tf.name_scope(name_scope):
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
          env = env_fn(in_graph=True)
          (train_summary_op, eval_summary_op, initializers) = (
              _define_train(
                  env,
                  hparams,
                  eval_env_fn,
                  sampling_temp,
                  distributional_size=self._distributional_size,
                  distributional_subscale=self._distributional_subscale,
                  distributional_threshold=self._distributional_threshold,
                  epoch=epoch if simulated else -1,
                  frame_stack_size=self.frame_stack_size,
                  force_beginning_resets=simulated))

        if num_env_steps is None:
          iteration_increment = hparams.epochs_num
        else:
          iteration_increment = int(
              math.ceil(
                  num_env_steps / (env.batch_size * hparams.epoch_length)))
        iteration_increment *= env_step_multiplier

        self._num_completed_iterations += iteration_increment

        restarter = Restarter(
            "policy", self.agent_model_dir, self._num_completed_iterations
        )
        if restarter.should_skip:
          return

        if hparams.lr_decay_in_final_epoch:
          if epoch != self.total_num_epochs - 1:
            # Extend the warmup period to the end of this epoch.
            hparams.learning_rate_warmup_steps = restarter.target_global_step
          else:
            if self._lr_decay_start is None:
              # Stop the warmup at the beginning of this epoch.
              self._lr_decay_start = \
                  restarter.target_global_step - iteration_increment
            hparams.learning_rate_warmup_steps = self._lr_decay_start

        _run_train(
            hparams,
            event_dir,
            self.agent_model_dir,
            restarter,
            train_summary_op,
            eval_summary_op,
            initializers,
            epoch,
            report_fn=report_fn,
            model_save_fn=model_save_fn)

  def evaluate(self, env_fn, hparams, sampling_temp):
    with tf.Graph().as_default():
      with tf.name_scope("rl_eval"):
        eval_env = env_fn(in_graph=True)
        (collect_memory, _, collect_init) = _define_collect(
            eval_env,
            hparams,
            "ppo_eval",
            eval_phase=True,
            frame_stack_size=self.frame_stack_size,
            force_beginning_resets=False,
            sampling_temp=sampling_temp,
            distributional_size=self._distributional_size,
        )

        model_saver = tf.train.Saver(
            tf.global_variables(hparams.policy_network + "/.*")
            # tf.global_variables("clean_scope.*")  # Needed for sharing params.
        )

        with tf.Session() as sess:
          sess.run(tf.global_variables_initializer())
          collect_init(sess)
          trainer_lib.restore_checkpoint(self.agent_model_dir, model_saver,
                                         sess)
          sess.run(collect_memory)


def _define_train(
    train_env,
    ppo_hparams,
    eval_env_fn=None,
    sampling_temp=1.0,
    distributional_size=1,
    distributional_subscale=0.04,
    distributional_threshold=0.0,
    epoch=-1,
    **collect_kwargs
):
  """Define the training setup."""
  memory, collect_summary, train_initialization = (
      _define_collect(
          train_env,
          ppo_hparams,
          "ppo_train",
          eval_phase=False,
          sampling_temp=sampling_temp,
          distributional_size=distributional_size,
          **collect_kwargs))
  ppo_summary = ppo.define_ppo_epoch(
      memory, ppo_hparams, train_env.action_space, train_env.batch_size,
      distributional_size=distributional_size,
      distributional_subscale=distributional_subscale,
      distributional_threshold=distributional_threshold,
      epoch=epoch)
  train_summary = tf.summary.merge([collect_summary, ppo_summary])

  if ppo_hparams.eval_every_epochs:
    # TODO(koz4k): Do we need this at all?
    assert eval_env_fn is not None
    eval_env = eval_env_fn(in_graph=True)
    (_, eval_collect_summary, eval_initialization) = (
        _define_collect(
            eval_env,
            ppo_hparams,
            "ppo_eval",
            eval_phase=True,
            sampling_temp=0.0,
            distributional_size=distributional_size,
            **collect_kwargs))
    return (train_summary, eval_collect_summary,
            (train_initialization, eval_initialization))
  else:
    return (train_summary, None, (train_initialization,))


def _run_train(ppo_hparams,
               event_dir,
               model_dir,
               restarter,
               train_summary_op,
               eval_summary_op,
               initializers,
               epoch,
               report_fn=None,
               model_save_fn=None):
  """Train."""
  summary_writer = tf.summary.FileWriter(
      event_dir, graph=tf.get_default_graph(), flush_secs=60)

  model_saver = tf.train.Saver(
      tf.global_variables(ppo_hparams.policy_network + "/.*") +
      tf.global_variables("training/" + ppo_hparams.policy_network + "/.*") +
      # tf.global_variables("clean_scope.*") +  # Needed for sharing params.
      tf.global_variables("global_step") +
      tf.global_variables("losses_avg.*") +
      tf.global_variables("train_stats.*")
  )

  global_step = tf.train.get_or_create_global_step()
  with tf.control_dependencies([tf.assign_add(global_step, 1)]):
    train_summary_op = tf.identity(train_summary_op)

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for initializer in initializers:
      initializer(sess)
    trainer_lib.restore_checkpoint(model_dir, model_saver, sess)

    num_target_iterations = restarter.target_local_step
    num_completed_iterations = num_target_iterations - restarter.steps_to_go
    with restarter.training_loop():
      for epoch_index in range(num_completed_iterations,
                               num_target_iterations):
        summary = sess.run(train_summary_op)
        if summary_writer:
          summary_writer.add_summary(summary, epoch_index)

        if (ppo_hparams.eval_every_epochs and
            epoch_index % ppo_hparams.eval_every_epochs == 0):
          eval_summary = sess.run(eval_summary_op)
          if summary_writer:
            summary_writer.add_summary(eval_summary, epoch_index)
          if report_fn:
            summary_proto = tf.Summary()
            summary_proto.ParseFromString(eval_summary)
            for elem in summary_proto.value:
              if "mean_score" in elem.tag:
                report_fn(elem.simple_value, epoch_index)
                break

        if (model_saver and ppo_hparams.save_models_every_epochs and
            (epoch_index % ppo_hparams.save_models_every_epochs == 0 or
             (epoch_index + 1) == num_target_iterations)):
          ckpt_name = "model.ckpt-{}".format(
              tf.train.global_step(sess, global_step)
          )
          # Keep the last checkpoint from each epoch in a separate directory.
          epoch_dir = os.path.join(model_dir, "epoch_{}".format(epoch))
          tf.gfile.MakeDirs(epoch_dir)
          for ckpt_dir in (model_dir, epoch_dir):
            model_saver.save(sess, os.path.join(ckpt_dir, ckpt_name))
          if model_save_fn:
            model_save_fn(model_dir)


def _rollout_metadata(batch_env, distributional_size=1):
  """Metadata for rollouts."""
  batch_env_shape = batch_env.observ.get_shape().as_list()
  batch_size = [batch_env_shape[0]]
  value_size = batch_size
  if distributional_size > 1:
    value_size = batch_size + [distributional_size]
  shapes_types_names = [
      # TODO(piotrmilos): possibly retrieve the observation type for batch_env
      (batch_size + batch_env_shape[1:], batch_env.observ_dtype,
       "observation"),
      (batch_size, tf.float32, "reward"),
      (batch_size, tf.bool, "done"),
      (batch_size + list(batch_env.action_shape), batch_env.action_dtype,
       "action"),
      (batch_size, tf.float32, "pdf"),
      (value_size, tf.float32, "value_function"),
  ]
  return shapes_types_names


class _MemoryWrapper(WrapperBase):
  """Memory wrapper."""

  def __init__(self, batch_env):
    super(_MemoryWrapper, self).__init__(batch_env)
    infinity = 10000000
    meta_data = list(zip(*_rollout_metadata(batch_env)))
    # In the memory wrapper we do not collect pdfs or value functions,
    # thus we only need the first 4 entries of meta_data.
    shapes = meta_data[0][:4]
    dtypes = meta_data[1][:4]
    self.speculum = tf.FIFOQueue(infinity, shapes=shapes, dtypes=dtypes)
    observs_shape = batch_env.observ.shape
    # TODO(piotrmilos): possibly retrieve the observation type for batch_env
    self._observ = tf.Variable(
        tf.zeros(observs_shape, self.observ_dtype), trainable=False)

  def __str__(self):
    return "MemoryWrapper(%s)" % str(self._batch_env)

  def simulate(self, action):
    # There is a subtlety here. We need to collect data in the order
    # obs, action = policy(obs), done, reward = env(obs, action).
    # Thus we need to enqueue the data before assigning the new observation.
    reward, done = self._batch_env.simulate(action)
    with tf.control_dependencies([reward, done]):
      enqueue_op = self.speculum.enqueue(
          [self._observ.read_value(), reward, done, action])
      with tf.control_dependencies([enqueue_op]):
        assign = self._observ.assign(self._batch_env.observ)
        with tf.control_dependencies([assign]):
          return tf.identity(reward), tf.identity(done)


def _define_collect(batch_env, ppo_hparams, scope, frame_stack_size,
                    eval_phase, sampling_temp, force_beginning_resets,
                    distributional_size=1):
  """Collect trajectories.

  Args:
    batch_env: Batch environment.
    ppo_hparams: PPO hparams, defined in tensor2tensor.models.research.rl.
    scope: var scope.
    frame_stack_size: Number of last observations to feed into the policy.
    eval_phase: TODO(koz4k): Write docstring.
    sampling_temp: Sampling temperature for the policy.
    force_beginning_resets: Whether to reset at the beginning of each episode.
    distributional_size: optional, number of buckets in distributional RL.

  Returns:
    Memory (observations, rewards, dones, actions, pdfs, value_functions)
    containing a rollout of the environment from the nested wrapped structure.
  """
  epoch_length = ppo_hparams.epoch_length

  to_initialize = []
  with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
    num_agents = batch_env.batch_size

    to_initialize.append(batch_env)
    wrappers = [(StackWrapper, {
        "history": frame_stack_size
    }), (_MemoryWrapper, {})]
    rollout_metadata = None
    speculum = None
    for w in wrappers:
      tf.logging.info("Applying wrapper %s(%s) to env %s." %
                      (str(w[0]), str(w[1]), str(batch_env)))
      batch_env = w[0](batch_env, **w[1])
      to_initialize.append(batch_env)

    rollout_metadata = _rollout_metadata(batch_env, distributional_size)
    speculum = batch_env.speculum

    def initialization_lambda(sess):
      for batch_env in to_initialize:
        batch_env.initialize(sess)

    memory = [
        tf.get_variable(  # pylint: disable=g-complex-comprehension
            "collect_memory_%d_%s" % (epoch_length, name),
            shape=[epoch_length] + shape,
            dtype=dtype,
            initializer=tf.zeros_initializer(),
            trainable=False) for (shape, dtype, name) in rollout_metadata
    ]

    cumulative_rewards = tf.get_variable(
        "cumulative_rewards", len(batch_env), trainable=False)

    eval_phase_t = tf.convert_to_tensor(eval_phase)
    should_reset_var = tf.Variable(True, trainable=False)
    zeros_tensor = tf.zeros(len(batch_env))

  force_beginning_resets = tf.convert_to_tensor(force_beginning_resets)

  def reset_ops_group():
    return tf.group(
        batch_env.reset(tf.range(len(batch_env))),
        tf.assign(cumulative_rewards, zeros_tensor))

  reset_op = tf.cond(
      tf.logical_or(should_reset_var.read_value(), force_beginning_resets),
      reset_ops_group, tf.no_op)

  with tf.control_dependencies([reset_op]):
    reset_once_op = tf.assign(should_reset_var, False)

  with tf.control_dependencies([reset_once_op]):

    def step(index, scores_sum, scores_num):
      """Single step."""
      index %= epoch_length  # Only needed in eval runs.
      # Note: the only way to ensure making a copy of a tensor is to run a
      # simple operation. We are waiting for tf.copy:
      # https://github.com/tensorflow/tensorflow/issues/11186
      obs_copy = batch_env.observ + 0

      value_fun_shape = (num_agents,)
      if distributional_size > 1:
        value_fun_shape = (num_agents, distributional_size)

      def env_step(arg1, arg2, arg3):  # pylint: disable=unused-argument
        """Step of the environment."""

        (logits, value_function) = get_policy(
            obs_copy, ppo_hparams, batch_env.action_space, distributional_size
        )
        action = common_layers.sample_with_temperature(logits, sampling_temp)
        action = tf.cast(action, tf.int32)
        action = tf.reshape(action, shape=(num_agents,))

        reward, done = batch_env.simulate(action)

        pdf = tfp.distributions.Categorical(logits=logits).prob(action)
        pdf = tf.reshape(pdf, shape=(num_agents,))
        value_function = tf.reshape(value_function, shape=value_fun_shape)
        done = tf.reshape(done, shape=(num_agents,))

        with tf.control_dependencies([reward, done]):
          return tf.identity(pdf), tf.identity(value_function), \
                 tf.identity(done)

      # TODO(piotrmilos): while_body is executed at most once,
      # thus should be replaced with tf.cond.
      pdf, value_function, top_level_done = tf.while_loop(
          lambda _1, _2, _3: tf.equal(speculum.size(), 0),
          env_step,
          [
              tf.constant(0.0, shape=(num_agents,)),
              tf.constant(0.0, shape=value_fun_shape),
              tf.constant(False, shape=(num_agents,))
          ],
          parallel_iterations=1,
          back_prop=False,
      )
      with tf.control_dependencies([pdf, value_function]):
        obs, reward, done, action = speculum.dequeue()
        to_save = [obs, reward, done, action, pdf, value_function]
        save_ops = [
            tf.scatter_update(memory_slot, index, value)
            for memory_slot, value in zip(memory, to_save)
        ]
        cumulate_rewards_op = cumulative_rewards.assign_add(reward)

        agent_indices_to_reset = tf.where(top_level_done)[:, 0]
      with tf.control_dependencies([cumulate_rewards_op]):
        # TODO(piotrmilos): possibly we need cumulative_rewards.read_value()
        scores_sum_delta = tf.reduce_sum(
            tf.gather(cumulative_rewards.read_value(),
                      agent_indices_to_reset))
        scores_num_delta = tf.count_nonzero(done, dtype=tf.int32)
      with tf.control_dependencies(save_ops +
                                   [scores_sum_delta, scores_num_delta]):
        reset_env_op = batch_env.reset(agent_indices_to_reset)
        reset_cumulative_rewards_op = tf.scatter_update(
            cumulative_rewards, agent_indices_to_reset,
            tf.gather(zeros_tensor, agent_indices_to_reset))
      with tf.control_dependencies([reset_env_op,
                                    reset_cumulative_rewards_op]):
        return [
            index + 1, scores_sum + scores_sum_delta,
            scores_num + scores_num_delta
        ]

    def stop_condition(i, _, resets):
      return tf.cond(eval_phase_t, lambda: resets < num_agents,
                     lambda: i < epoch_length)

    init = [tf.constant(0), tf.constant(0.0), tf.constant(0)]
    index, scores_sum, scores_num = tf.while_loop(
        stop_condition, step, init, parallel_iterations=1, back_prop=False)

  # We handle force_beginning_resets differently. We assume that all envs are
  # reset at the end of an episode (though it actually happens at the
  # beginning of the next one).
  scores_num = tf.cond(force_beginning_resets,
                       lambda: scores_num + len(batch_env),
                       lambda: scores_num)

  with tf.control_dependencies([scores_sum]):
    scores_sum = tf.cond(
        force_beginning_resets,
        lambda: scores_sum + tf.reduce_sum(cumulative_rewards.read_value()),
        lambda: scores_sum)

  mean_score = tf.cond(
      tf.greater(scores_num, 0),
      lambda: scores_sum / tf.cast(scores_num, tf.float32),
      lambda: 0.)
  printing = tf.Print(0, [mean_score, scores_sum, scores_num], "mean_score: ")
  with tf.control_dependencies([index, printing]):
    memory = [mem.read_value() for mem in memory]
    # When generating real data together with PPO training we must use a
    # single agent. For PPO to work we reshape the history, as if it was
    # generated by real_ppo_effective_num_agents.
    if ppo_hparams.effective_num_agents is not None and not eval_phase:
      new_memory = []
      effective_num_agents = ppo_hparams.effective_num_agents
      assert epoch_length % ppo_hparams.effective_num_agents == 0, (
          "The rollout of ppo_hparams.epoch_length will be distributed "
          "amongst effective_num_agents of agents.")
      new_epoch_length = int(epoch_length / effective_num_agents)
      for mem, info in zip(memory, rollout_metadata):
        shape, _, name = info
        new_shape = [effective_num_agents, new_epoch_length] + shape[1:]
        perm = list(range(len(shape) + 1))
        perm[0] = 1
        perm[1] = 0
        mem = tf.transpose(mem, perm=perm)
        mem = tf.reshape(mem, shape=new_shape)
        mem = tf.transpose(
            mem,
            perm=perm,
            name="collect_memory_%d_%s" % (new_epoch_length, name))
        new_memory.append(mem)
      memory = new_memory

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
      mean_score_summary = tf.cond(
          tf.greater(scores_num, 0),
          lambda: tf.summary.scalar("mean_score_this_iter", mean_score), str)
      summaries = tf.summary.merge([
          mean_score_summary,
          tf.summary.scalar("episodes_finished_this_iter", scores_num)
      ])

    return memory, summaries, initialization_lambda
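

# The effective_num_agents reshaping above (transpose -> reshape -> transpose)
# can be hard to follow in graph code. Below is a minimal NumPy sketch of the
# same trick, assuming a single real agent whose rollout of `epoch_length`
# steps is re-split into `effective_num_agents` shorter, contiguous rollouts.
# It is illustrative only and not used by the library; all names in it are
# local to the example.
if __name__ == "__main__":
  import numpy as np  # Used only for this standalone illustration.

  epoch_length = 8
  num_agents = 1  # Real data generation uses a single agent.
  effective_num_agents = 4
  new_epoch_length = epoch_length // effective_num_agents

  # A fake time-major rollout of shape [epoch_length, num_agents].
  rollout = np.arange(epoch_length * num_agents).reshape(
      epoch_length, num_agents)

  # [epoch_length, num_agents] -> [num_agents, epoch_length].
  mem = rollout.transpose(1, 0)
  # Cut the long time axis into effective_num_agents contiguous segments.
  mem = mem.reshape(effective_num_agents, new_epoch_length)
  # Back to time-major: [new_epoch_length, effective_num_agents]. Column j now
  # holds the j-th contiguous segment of the original single-agent rollout.
  mem = mem.transpose(1, 0)
  print(mem)  # [[0 2 4 6]
              #  [1 3 5 7]]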