# Copyright 2017 reinforce.io. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """ The `Model` class coordinates the creation and execution of all TensorFlow operations within a model. It implements the `reset`, `act` and `update` functions, which give the interface the `Agent` class communicates with, and which should not need to be overwritten. Instead, the following TensorFlow functions need to be implemented: * `tf_actions_and_internals(states, internals, deterministic)` returning the batch of actions and successor internal states. * `tf_loss_per_instance(states, internals, actions, terminal, reward)` returning the loss per instance for a batch. Further, the following TensorFlow functions should be extended accordingly: * `initialize(custom_getter)` defining TensorFlow placeholders/functions and adding internal states. * `get_variables()` returning the list of TensorFlow variables (to be optimized) of this model. * `tf_regularization_losses(states, internals)` returning a dict of regularization losses. * `get_optimizer_kwargs(states, internals, actions, terminal, reward)` returning a dict of potential arguments (argument-free functions) to the optimizer. Finally, the following TensorFlow functions can be useful in some cases: * `preprocess_states(states)` for state preprocessing, returning the processed batch of states. 
* `action_exploration(action, exploration, action_spec)` for action postprocessing (e.g. exploration), returning the processed batch of actions. * `preprocess_reward(states, internals, terminal, reward)` for reward preprocessing (e.g. reward normalization), returning the processed batch of rewards. * `create_output_operations(states, internals, actions, terminal, reward, deterministic)` for further output operations, similar to the two above for `Model.act` and `Model.update`. * `tf_optimization(states, internals, actions, terminal, reward)` for further optimization operations (e.g. the baseline update in a `PGModel` or the target network update in a `QModel`), returning a single grouped optimization operation. """ from __future__ import absolute_import from __future__ import print_function from __future__ import division from copy import deepcopy import os import numpy as np import tensorflow as tf from tensorflow.contrib.learn.python.learn.utils import saved_model_export_utils from tensorflow.contrib.session_bundle.exporter import Exporter from tensorflow.contrib.session_bundle.exporter import generic_signature from tensorflow.python.platform import gfile from tensorforce import TensorforceError, util from tensorforce.core.explorations import Exploration from tensorforce.core.optimizers import Optimizer, GlobalOptimizer from tensorforce.core.preprocessing import PreprocessorStack from tensorforce.core.optimizers.lr_decay import tf_schedules class Model(object): """ Base class for all (TensorFlow-based) models. 
""" def __init__( self, states_spec, actions_spec, device, session_config, scope, saver_spec, summary_spec, distributed_spec, optimizer, discount, variable_noise, states_preprocessing_spec, explorations_spec, reward_preprocessing_spec, batch_data ): # States and actions specifications self.states_spec = states_spec self.actions_spec = actions_spec # I/O specification self.io_spec = None if batch_data is not None: self.io_spec = dict(table=True) if isinstance(batch_data, dict): self.io_spec['tensor'] = True if 'states' in batch_data and 'actions' not in batch_data: self.io_spec['interactive'] = True # TensorFlow device and scope self.device = device self.session_config = session_config self.scope = scope # Saver/summary/distributed specifications self.saver_spec = saver_spec self.summary_spec = summary_spec self.distributed_spec = distributed_spec # TensorFlow summaries if summary_spec is None: self.summary_labels = set() else: self.summary_labels = set(summary_spec.get('labels', ())) # Optimizer self.optimizer = optimizer # Discount factor self.discount = discount # Variable noise assert variable_noise is None or variable_noise > 0.0 self.variable_noise = variable_noise # Preprocessing and exploration self.states_preprocessing_spec = states_preprocessing_spec self.explorations_spec = explorations_spec self.reward_preprocessing_spec = reward_preprocessing_spec # Setup TensorFlow graph and session self.setup(batch_data) def setup(self, batch_data): """ Sets up the TensorFlow model graph and initializes the TensorFlow session. 
""" default_graph = None if self.distributed_spec is None: self.global_model = None self.graph = tf.get_default_graph()# consistent with the distributed mode default_graph = self.graph.as_default() default_graph.__enter__() elif self.distributed_spec.get('ps'): if self.distributed_spec.get('replica_model'): raise TensorforceError("Invalid config value for distributed mode.") self.server = tf.train.Server( server_or_cluster_def=self.distributed_spec['cluster_spec'], job_name='ps', task_index=self.distributed_spec['task_index'], protocol=self.distributed_spec.get('protocol'), config=None, start=True ) # Param server does nothing actively self.server.join() return elif self.distributed_spec.get('replica_model'): self.device = tf.train.replica_device_setter( worker_device=self.device, cluster=self.distributed_spec['cluster_spec'] ) self.global_model = None # Replica model is part of its parent model's graph, hence no new graph here. self.graph = tf.get_default_graph() else: # So that haoye can construct table reader before the instantiation of the 'model' # class without passing his graph into this code block graph = tf.get_default_graph() default_graph = graph.as_default() default_graph.__enter__() # Global model. 
self.global_model = deepcopy(self) self.global_model.distributed_spec['replica_model'] = True self.global_model.setup(batch_data) self.graph = graph with tf.device(device_name_or_function=self.device): # Variables and summaries self.variables = dict() self.all_variables = dict() self.registered_variables = set() self.summaries = list() def custom_getter(getter, name, registered=False, second=False, **kwargs): if registered: self.registered_variables.add(name) elif name in self.registered_variables: registered = True variable = getter(name=name, **kwargs) # Top-level, hence no 'registered' if not registered: self.all_variables[name] = variable if kwargs.get('trainable', True) and not name.startswith('optimization'): self.variables[name] = variable if 'variables' in self.summary_labels: summary = tf.summary.histogram(name=name, values=variable) self.summaries.append(summary) return variable # Episode collection = self.graph.get_collection(name='episode') if len(collection) == 0: self.episode = tf.Variable( name='episode', dtype=util.tf_dtype('int'), trainable=False, initial_value=0 ) self.graph.add_to_collection(name='episode', value=self.episode) else: assert len(collection) == 1 self.episode = collection[0] # Timestep collection = self.graph.get_collection(name='timestep') if len(collection) == 0: self.timestep = tf.Variable( name='timestep', dtype=util.tf_dtype('int'), trainable=False, initial_value=0 ) self.graph.add_to_collection(name='timestep', value=self.timestep) self.graph.add_to_collection(name=tf.GraphKeys.GLOBAL_STEP, value=self.timestep) else: assert len(collection) == 1 self.timestep = collection[0] if self.distributed_spec is not None: self.given_sync_value = tf.placeholder( dtype=util.tf_dtype('bool'), shape=(), name='given_sync_value' ) collection = self.graph.get_collection(name='distributed_sync') if len(collection) == 0: self.sync_flags = tf.Variable( name='sync_flags', dtype=util.tf_dtype('bool'), trainable=False, 
initial_value=self.distributed_spec.get("cluster_spec").num_tasks("worker") * [False] ) self.graph.add_to_collection(name='distributed_sync', value=self.sync_flags) else: assert len(collection) == 1 self.sync_flags = collection[0] # Create I/O tensors if necessary if (self.io_spec is not None and self.io_spec.get("table")) and (self.distributed_spec is None or self.distributed_spec.get('replica_model')): self.batch_data = batch_data elif self.global_model: self.batch_data = self.global_model.batch_data else: # Set batch_data to None when data is not loaded from "odps table" self.batch_data = None # Create placeholders, tf functions, internals, etc self.initialize(custom_getter=custom_getter) # Input tensors states = {name: tf.identity(input=state) for name, state in self.states_input.items()} states = self.fn_preprocess_states(states=states) states = {name: tf.stop_gradient(input=state) for name, state in states.items()} internals = [tf.identity(input=internal) for internal in self.internals_input] actions = {name: tf.identity(input=action) for name, action in self.actions_input.items()} terminal = tf.identity(input=self.terminal_input) reward = tf.identity(input=self.reward_input) reward = self.fn_preprocess_reward(states=states, internals=internals, terminal=terminal, reward=reward) reward = tf.stop_gradient(input=reward) # Optimizer kwargs_opt = dict() kwargs_opt['summaries'] =self.summaries kwargs_opt['summary_labels'] =self.summary_labels if self.optimizer is None: pass else: # Propertional learning rate decay self.optimizer = tf_schedules.add_lr_decay(spec=self.optimizer, global_step=self.timestep) if self.distributed_spec is not None and \ not self.distributed_spec.get('parameter_server') and \ not self.distributed_spec.get('replica_model'): # If not internal global model self.optimizer = GlobalOptimizer(optimizer=self.optimizer) else: # Create optimizer instance self.optimizer = Optimizer.from_spec(spec=self.optimizer, kwargs=kwargs_opt) # Create output 
fetch operations self.create_output_operations( states=states, internals=internals, actions=actions, terminal=terminal, reward=reward, update=self.update_input, deterministic=self.deterministic_input ) for name, action in self.actions_output.items(): if name in self.explorations: self.actions_output[name] = tf.cond( pred=self.deterministic_input, true_fn=(lambda: action), false_fn=(lambda: self.fn_action_exploration( action=action, exploration=self.explorations[name], action_spec=self.actions_spec[name] )) ) if any(k in self.summary_labels for k in ['inputs','states','actions','rewards']): if any(k in self.summary_labels for k in ['inputs','states']): for name, state in states.items(): summary = tf.summary.histogram(name=(self.scope + '/inputs/states/' + name), values=state) self.summaries.append(summary) if any(k in self.summary_labels for k in ['inputs','actions']): for name, action in actions.items(): summary = tf.summary.histogram(name=(self.scope + '/inputs/actions/' + name), values=action) self.summaries.append(summary) if any(k in self.summary_labels for k in ['inputs','rewards']): summary = tf.summary.histogram(name=(self.scope + '/inputs/rewards'), values=reward) self.summaries.append(summary) if self.distributed_spec is not None and self.distributed_spec.get('replica_model'): return # Global and local variables initialize operations if self.distributed_spec is None: global_variables = self.get_variables(include_non_trainable=True) + [self.episode, self.timestep] init_op = tf.variables_initializer(var_list=global_variables) ready_op = tf.report_uninitialized_variables(var_list=global_variables) # TODO TensorFlow template hotfix global_variables = list(set(global_variables)) ready_for_local_init_op = None local_init_op = None if self.io_spec and self.io_spec.get("table"): odps_io_vars_table = tf.contrib.framework.get_variables(scope="table_env", collection=tf.GraphKeys.LOCAL_VARIABLES) odps_io_vars_oss = tf.contrib.framework.get_variables(scope="oss_env", 
collection=tf.GraphKeys.LOCAL_VARIABLES) odps_io_vars = odps_io_vars_table + odps_io_vars_oss ready_for_local_init_op = tf.report_uninitialized_variables(var_list=global_variables) local_init_op = tf.group(*([v.initializer for v in odps_io_vars])) else: if self.io_spec and self.io_spec.get("table"): odps_io_vars_table = tf.contrib.framework.get_variables(scope="table_env", collection=tf.GraphKeys.LOCAL_VARIABLES) odps_io_vars_oss = tf.contrib.framework.get_variables(scope="oss_env", collection=tf.GraphKeys.LOCAL_VARIABLES) odps_io_vars = odps_io_vars_table + odps_io_vars_oss else: odps_io_vars = list() global_variables = self.global_model.get_variables(include_non_trainable=True) + [self.episode, self.timestep] + ([self.sync_flags] if self.distributed_spec else []) local_variables = self.get_variables(include_non_trainable=True) + [self.episode, self.timestep] + ([self.sync_flags] if self.distributed_spec else []) init_op = tf.variables_initializer(var_list=global_variables) ready_op = tf.report_uninitialized_variables(var_list=(global_variables + local_variables)) ready_for_local_init_op = tf.report_uninitialized_variables(var_list=global_variables) local_init_op = tf.group(*([local_var.assign(value=global_var) for local_var, global_var in zip( local_variables, global_variables)] + [v.initializer for v in odps_io_vars])) def init_fn(scaffold, session): if self.saver_spec is not None and self.saver_spec.get('load', True): directory = self.saver_spec['directory'] file = self.saver_spec.get('file') if file is None: file = tf.train.latest_checkpoint( checkpoint_dir=directory, # Corresponds to argument of saver.save() in Model.save(). 
latest_filename=None ) elif not os.path.isfile(file): file = os.path.join(directory, file) if file is not None: scaffold.saver.restore(sess=session, save_path=file) # Summary operation summaries = self.get_summaries() if len(summaries) > 0: summary_op = tf.summary.merge(inputs=summaries) else: summary_op = None # TensorFlow saver object saver = tf.train.Saver( var_list=global_variables, # should be given? reshape=False, sharded=False, # should be true? max_to_keep=5, keep_checkpoint_every_n_hours=10000.0, name=None, restore_sequentially=False, saver_def=None, builder=None, defer_build=False, allow_empty=True, write_version=tf.train.SaverDef.V2, pad_step_number=False, save_relative_paths=True # filename=None ) # TensorFlow scaffold object self.scaffold = tf.train.Scaffold( init_op=init_op, init_feed_dict=None, init_fn=init_fn, ready_op=ready_op, ready_for_local_init_op=ready_for_local_init_op, local_init_op=local_init_op, summary_op=summary_op, saver=saver, copy_from_scaffold=None ) hooks = list() # Checkpoint saver hook if self.saver_spec is not None and (self.distributed_spec is None or self.distributed_spec['task_index'] == 0): self.saver_directory = self.saver_spec['directory'] hooks.append(tf.train.CheckpointSaverHook( checkpoint_dir=self.saver_directory, save_secs=self.saver_spec.get('seconds', None if 'steps' in self.saver_spec else 600), save_steps=self.saver_spec.get('steps'), # Either one or the other has to be set. saver=None, # None since given via 'scaffold' argument. 
checkpoint_basename=self.saver_spec.get('basename', 'model.ckpt'), scaffold=self.scaffold, listeners=None )) else: self.saver_directory = None # Summary saver hook if self.summary_spec is None: self.summary_writer_hook = None else: # TensorFlow summary writer object self.summary_writer = tf.summary.FileWriter( logdir=self.summary_spec['directory'], graph=self.graph, max_queue=10, flush_secs=120, filename_suffix=None ) self.summary_writer_hook = util.UpdateSummarySaverHook( update_input=self.update_input, save_steps=self.summary_spec.get('steps'), # Either one or the other has to be set. save_secs=self.summary_spec.get('seconds', None if 'steps' in self.summary_spec else 120), output_dir=None, # None since given via 'summary_writer' argument. summary_writer=self.summary_writer, scaffold=self.scaffold, summary_op=None # None since given via 'scaffold' argument. ) hooks.append(self.summary_writer_hook) if self.distributed_spec is None: # TensorFlow non-distributed monitored session object self.monitored_session = tf.train.SingularMonitoredSession( hooks=hooks, scaffold=self.scaffold, # Default value. master='', # always the same? 
config=self.session_config, checkpoint_dir=None ) else: self.server = tf.train.Server( server_or_cluster_def=self.distributed_spec['cluster_spec'], job_name='worker', task_index=self.distributed_spec['task_index'], protocol=self.distributed_spec.get('protocol'), config=self.session_config, start=True ) if self.distributed_spec['task_index'] == 0: # TensorFlow chief session creator object session_creator = tf.train.ChiefSessionCreator( scaffold=self.scaffold, master=self.server.target, config=self.session_config, checkpoint_dir=None, checkpoint_filename_with_path=None ) else: # TensorFlow worker session creator object session_creator = tf.train.WorkerSessionCreator( scaffold=self.scaffold, master=self.server.target, config=self.session_config, ) # TensorFlow monitored session object self.monitored_session = tf.train.MonitoredSession( session_creator=session_creator, hooks=hooks, stop_grace_period_secs=120 # Default value. ) if default_graph: default_graph.__exit__(None, None, None) self.graph.finalize() self.monitored_session.__enter__() self.session = self.monitored_session._tf_sess() def get_session(self): return self.monitored_session def close(self, reset_graph=True): if self.saver_directory is not None and (self.distributed_spec is None or self.distributed_spec.get("task_index")==0): path = self.save(append_timestep=True) print("saved the model to %s" % path) self.monitored_session.close() if self.saver_spec is not None and self.saver_spec.get('export_directory') is not None and (self.distributed_spec is None or self.distributed_spec.get("task_index")==0) and (self.io_spec is None or self.io_spec.get("tensor") is not True): if self.saver_spec.get("saved_model"): path = self.export() print("exported saved_model to %s" % path) else: path = self.export_session_bundle() print("exported session bundle to %s" % path) if reset_graph: tf.reset_default_graph() def initialize(self, custom_getter): """ Creates the TensorFlow placeholders and functions for this model. 
Moreover adds the internal state placeholders and initialization values to the model. Args: custom_getter: The `custom_getter_` object to use for `tf.make_template` when creating TensorFlow functions. """ # States preprocessing self.states_preprocessing = dict() if self.states_preprocessing_spec is None: for name, state in self.states_spec.items(): state['processed_shape'] = state['shape'] elif isinstance(self.states_preprocessing_spec, list): for name, state in self.states_spec.items(): preprocessing = PreprocessorStack.from_spec(spec=self.states_preprocessing_spec) self.states_preprocessing[name] = preprocessing state['processed_shape'] = preprocessing.processed_shape(shape=state['shape']) else: for name, state in self.states_spec.items(): if self.states_preprocessing_spec.get(name) is not None: preprocessing = PreprocessorStack.from_spec(spec=self.states_preprocessing_spec[name]) self.states_preprocessing[name] = preprocessing state['processed_shape'] = preprocessing.processed_shape(shape=state['shape']) else: state['processed_shape'] = state['shape'] # States if self.io_spec and self.io_spec.get("tensor"): self.states_input = self.batch_data["states"] else: self.states_input = dict() for name, state in self.states_spec.items(): self.states_input[name] = tf.placeholder( dtype=util.tf_dtype(state['type']), shape=(None,) + tuple(state['shape']), name=name ) # Actions if self.io_spec and self.io_spec.get("tensor") and not self.io_spec.get("interactive"): self.actions_input = self.batch_data["actions"] else: self.actions_input = dict() for name, action in self.actions_spec.items(): self.actions_input[name] = tf.placeholder( dtype=util.tf_dtype(action['type']), shape=(None,) + tuple(action['shape']), name=name ) # Explorations self.explorations = dict() if self.explorations_spec is None: pass elif isinstance(self.explorations_spec, list): for name, state in self.actions_spec.items(): self.explorations[name] = Exploration.from_spec(spec=self.explorations_spec) # 
single spec for all components of our action space elif "type" in self.explorations_spec: for name, state in self.actions_spec.items(): self.explorations[name] = Exploration.from_spec(spec=self.explorations_spec) # different spec for different components of our action space else: for name, state in self.actions_spec.items(): if self.explorations_spec.get(name) is not None: self.explorations[name] = Exploration.from_spec(spec=self.explorations_spec[name]) # Terminal if self.io_spec and self.io_spec.get("tensor") and not self.io_spec.get("interactive"): self.terminal_input = self.batch_data["terminal"] else: self.terminal_input = tf.placeholder(dtype=util.tf_dtype('bool'), shape=(None,), name='terminal') # Reward preprocessing if self.reward_preprocessing_spec is None: self.reward_preprocessing = None else: self.reward_preprocessing = PreprocessorStack.from_spec(spec=self.reward_preprocessing_spec) if self.reward_preprocessing.processed_shape(shape=()) != (): raise TensorforceError("Invalid reward preprocessing!") # Reward if self.io_spec and self.io_spec.get("tensor") and not self.io_spec.get("interactive"): self.reward_input = self.batch_data["reward"] else: self.reward_input = tf.placeholder(dtype=util.tf_dtype('float'), shape=(None,), name='reward') # Internal states self.internals_input = list() self.internals_init = list() # Deterministic action flag self.deterministic_input = tf.placeholder(dtype=util.tf_dtype('bool'), shape=(), name='deterministic') # Update flag self.update_input = tf.placeholder(dtype=util.tf_dtype('bool'), shape=(), name='update') # TensorFlow functions self.fn_discounted_cumulative_reward = tf.make_template( name_=(self.scope + '/discounted-cumulative-reward'), func_=self.tf_discounted_cumulative_reward, custom_getter_=custom_getter ) self.fn_actions_and_internals = tf.make_template( name_=(self.scope + '/actions-and-internals'), func_=self.tf_actions_and_internals, custom_getter_=custom_getter ) self.fn_loss_per_instance = 
tf.make_template( name_=(self.scope + '/loss-per-instance'), func_=self.tf_loss_per_instance, custom_getter_=custom_getter ) self.fn_regularization_losses = tf.make_template( name_=(self.scope + '/regularization-losses'), func_=self.tf_regularization_losses, custom_getter_=custom_getter ) self.fn_loss = tf.make_template( name_=(self.scope + '/loss'), func_=self.tf_loss, custom_getter_=custom_getter ) self.fn_optimization = tf.make_template( name_=(self.scope + '/optimization'), func_=self.tf_optimization, custom_getter_=custom_getter ) self.fn_preprocess_states = tf.make_template( name_=(self.scope + '/preprocess-states'), func_=self.tf_preprocess_states, custom_getter_=custom_getter ) self.fn_action_exploration = tf.make_template( name_=(self.scope + '/action-exploration'), func_=self.tf_action_exploration, custom_getter_=custom_getter ) self.fn_preprocess_reward = tf.make_template( name_=(self.scope + '/preprocess-reward'), func_=self.tf_preprocess_reward, custom_getter_=custom_getter ) self.summary_configuration_op = None if self.summary_spec and 'meta_param_recorder_class' in self.summary_spec: self.summary_configuration_op = self.summary_spec['meta_param_recorder_class'].build_metagraph_list() # self.fn_summarization = tf.make_template( # name_='summarization', # func_=self.tf_summarization, # custom_getter_=custom_getter # ) def tf_preprocess_states(self, states): """ Applies optional pre-processing to the states. """ for name, state in states.items(): if name in self.states_preprocessing: states[name] = self.states_preprocessing[name].process(tensor=state) else: states[name] = tf.identity(input=state) return states def tf_action_exploration(self, action, exploration, action_spec): """ Applies optional exploration to the action. 
""" action_shape = tf.shape(input=action) exploration_value = exploration.tf_explore( episode=self.episode, timestep=self.timestep, action_shape=action_shape ) if action_spec['type'] == 'bool': action = tf.where( condition=(tf.random_uniform(shape=action_shape[0]) < exploration_value), x=(tf.random_uniform(shape=action_shape) < 0.5), y=action ) elif action_spec['type'] == 'int': action = tf.where( condition=(tf.random_uniform(shape=action_shape) < exploration_value), x=tf.random_uniform(shape=action_shape, maxval=action_spec['num_actions'], dtype=util.tf_dtype('int')), y=action ) elif action_spec['type'] == 'float': action += tf.reshape(tensor=exploration_value, shape=tuple(1 for _ in range(action_shape.get_shape().as_list()[0]))) if 'min_value' in action_spec: action = tf.clip_by_value( t=action, clip_value_min=action_spec['min_value'], clip_value_max=action_spec['max_value'] ) return action def tf_preprocess_reward(self, states, internals, terminal, reward): """ Applies optional pre-processing to the reward. """ if self.reward_preprocessing is None: reward = tf.identity(input=reward) else: reward = self.reward_preprocessing.process(tensor=reward) return reward def tf_discounted_cumulative_reward(self, terminal, reward, discount, final_reward=0.0): """ Creates the TensorFlow operations for calculating the discounted cumulative rewards for a given sequence of rewards. Args: terminal: Terminal boolean tensor. reward: Reward tensor. discount: Discount factor. final_reward: Last reward value in the sequence. Returns: Discounted cumulative reward tensor. 
""" # TODO: n-step cumulative reward (particularly for envs without terminal) def cumulate(cumulative, reward_and_terminal): rew, term = reward_and_terminal return tf.where( condition=term, x=rew, y=(rew + cumulative * discount) ) # Reverse since reward cumulation is calculated right-to-left, but tf.scan only works left-to-right reward = tf.reverse(tensor=reward, axis=(0,)) terminal = tf.reverse(tensor=terminal, axis=(0,)) reward = tf.scan(fn=cumulate, elems=(reward, terminal), initializer=final_reward) return tf.reverse(tensor=reward, axis=(0,)) def tf_actions_and_internals(self, states, internals, update, deterministic): """ Creates the TensorFlow operations for retrieving the actions (and posterior internal states) in reaction to the given input states (and prior internal states). Args: states: Dict of state tensors. internals: List of prior internal state tensors. update: Boolean tensor indicating whether this call happens during an update. deterministic: Boolean tensor indicating whether action should be chosen deterministically. Returns: Actions and list of posterior internal state tensors. """ raise NotImplementedError def tf_loss_per_instance(self, states, internals, actions, terminal, reward, update): """ Creates the TensorFlow operations for calculating the loss per batch instance of the given input states and actions. Args: states: Dict of state tensors. internals: List of prior internal state tensors. actions: Dict of action tensors. terminal: Terminal boolean tensor. reward: Reward tensor. update: Boolean tensor indicating whether this call happens during an update. Returns: Loss tensor. """ raise NotImplementedError def tf_regularization_losses(self, states, internals, update): """ Creates the TensorFlow operations for calculating the regularization losses for the given input states. Args: states: Dict of state tensors. internals: List of prior internal state tensors. update: Boolean tensor indicating whether this call happens during an update. 
Returns: Dict of regularization loss tensors. """ return dict() def tf_loss(self, states, internals, actions, terminal, reward, update): # Mean loss per instance loss_per_instance = self.fn_loss_per_instance( states=states, internals=internals, actions=actions, terminal=terminal, reward=reward, update=update ) loss = tf.reduce_mean(input_tensor=loss_per_instance, axis=0) # Loss without regularization summary if 'losses' in self.summary_labels: summary = tf.summary.scalar(name='loss-without-regularization', tensor=loss) self.summaries.append(summary) # Regularization losses losses = self.fn_regularization_losses(states=states, internals=internals, update=update) if len(losses) > 0: loss += tf.add_n(inputs=list(losses.values())) if 'regularization' in self.summary_labels: for name, loss_val in losses.items(): summary = tf.summary.scalar(name="regularization/" + name, tensor=loss_val) self.summaries.append(summary) # Total loss summary if 'losses' in self.summary_labels or 'total-loss' in self.summary_labels: summary = tf.summary.scalar(name='total-loss', tensor=loss) self.summaries.append(summary) return loss def get_optimizer_kwargs(self, states, internals, actions, terminal, reward, update): """ Returns the optimizer arguments including the time, the list of variables to optimize, and various argument-free functions (in particular `fn_loss` returning the combined 0-dim batch loss tensor) which the optimizer might require to perform an update step. Args: states: Dict of state tensors. internals: List of prior internal state tensors. actions: Dict of action tensors. terminal: Terminal boolean tensor. reward: Reward tensor. update: Boolean tensor indicating whether this call happens during an update. Returns: Loss tensor of the size of the batch. 
""" kwargs = dict() kwargs['time'] = self.timestep kwargs['variables'] = self.get_variables() kwargs['fn_loss'] = ( lambda: self.fn_loss(states=states, internals=internals, actions=actions, terminal=terminal, reward=reward, update=update) ) if self.global_model is not None: kwargs['global_variables'] = self.global_model.get_variables() return kwargs def tf_optimization(self, states, internals, actions, terminal, reward, update): """ Creates the TensorFlow operations for performing an optimization update step based on the given input states and actions batch. Args: states: Dict of state tensors. internals: List of prior internal state tensors. actions: Dict of action tensors. terminal: Terminal boolean tensor. reward: Reward tensor. update: Boolean tensor indicating whether this call happens during an update. Returns: The optimization operation. """ if self.optimizer is None: return tf.no_op() else: optimizer_kwargs = self.get_optimizer_kwargs( states=states, internals=internals, actions=actions, terminal=terminal, reward=reward, update=update ) return self.optimizer.minimize(**optimizer_kwargs) def create_output_operations(self, states, internals, actions, terminal, reward, update, deterministic): """ Calls all the relevant TensorFlow functions for this model and hence creates all the TensorFlow operations involved. Args: states: Dict of state tensors. internals: List of prior internal state tensors. actions: Dict of action tensors. terminal: Terminal boolean tensor. reward: Reward tensor. update: Boolean tensor indicating whether this call happens during an update. deterministic: Boolean tensor indicating whether action should be chosen deterministically. """ # Create graph by calling the functions corresponding to model.act() / model.update(), to initialize variables. # TODO: Could call reset here, but would have to move other methods below reset. 
        # NOTE(review): this chunk starts inside an enclosing graph-construction
        # method whose `def` lies above this excerpt; `states`, `internals`,
        # `actions`, `terminal`, `reward`, `update` and `deterministic` are
        # tensors/placeholders local to that method.

        # Return values of these two calls are discarded — presumably invoked so
        # that all template-created variables exist before get_variables() is
        # consulted below. TODO(review): confirm against the template mechanism.
        self.fn_actions_and_internals(
            states=states,
            internals=internals,
            update=update,
            deterministic=deterministic
        )
        self.fn_loss_per_instance(
            states=states,
            internals=internals,
            actions=actions,
            terminal=terminal,
            reward=reward,
            update=update
        )

        # Tensor fetched for model.act()
        operations = list()
        if self.variable_noise is not None and self.variable_noise > 0.0:
            # Add variable noise: perturb every trainable variable with Gaussian
            # noise before computing actions (parameter-space exploration).
            noise_deltas = list()
            for variable in self.get_variables():
                noise_delta = tf.random_normal(shape=util.shape(variable), mean=0.0, stddev=self.variable_noise)
                noise_deltas.append(noise_delta)
                operations.append(variable.assign_add(delta=noise_delta))

        # Retrieve actions and internals (only after noise, if any, was applied)
        with tf.control_dependencies(control_inputs=operations):
            self.actions_output, self.internals_output = self.fn_actions_and_internals(
                states=states,
                internals=internals,
                update=update,
                deterministic=deterministic
            )

        # Increment timestep by the batch size (first dim of an arbitrary state input)
        increment_timestep = tf.shape(input=next(iter(states.values())))[0]
        increment_timestep = self.timestep.assign_add(delta=increment_timestep)
        operations = [increment_timestep]

        # Subtract variable noise again so the perturbation does not persist
        if self.variable_noise is not None and self.variable_noise > 0.0:
            for variable, noise_delta in zip(self.get_variables(), noise_deltas):
                operations.append(variable.assign_sub(delta=noise_delta))

        with tf.control_dependencies(control_inputs=operations):
            # Trivial operation to enforce control dependency
            self.timestep_output = self.timestep + 0

        # Tensor fetched for model.observe(): the episode counter advances by the
        # number of terminal flags set in the batch.
        increment_episode = tf.count_nonzero(input_tensor=terminal, dtype=util.tf_dtype('int'))
        increment_episode = self.episode.assign_add(delta=increment_episode)
        with tf.control_dependencies(control_inputs=(increment_episode,)):
            # Trivial operation to enforce control dependency
            self.increment_episode = self.episode + 0

        # TODO: add up rewards per episode and add summary_label 'episode-reward'

        # Tensor(s) fetched for model.update()
        self.optimization = self.fn_optimization(
            states=states,
            internals=internals,
            actions=actions,
            terminal=terminal,
            reward=reward,
            update=update
        )
        self.loss_per_instance = self.fn_loss_per_instance(
            states=states, internals=internals, actions=actions, terminal=terminal, reward=reward, update=update
        )
        if self.distributed_spec is not None:
            # Distributed mode: each worker writes its own entry in `sync_flags`;
            # `is_synced` is true once every worker reports the same sync value.
            self.set_sync_flag = tf.assign(ref=self.sync_flags[self.distributed_spec.get("task_index")], value=self.given_sync_value)
            self.is_synced = tf.reduce_all(tf.equal(self.sync_flags, self.given_sync_value))

    def get_variables(self, include_non_trainable=False):
        """
        Returns the TensorFlow variables used by the model.

        Args:
            include_non_trainable: If true, also includes preprocessing,
                exploration and optimizer variables (and whatever else is
                registered in `all_variables`, e.g. timestep/episode counters).

        Returns:
            List of variables.
        """
        # optimizer variables and timestep/episode only included if 'include_non_trainable' set
        if include_non_trainable:
            # Sorted by key for a deterministic ordering.
            model_variables = [self.all_variables[key] for key in sorted(self.all_variables)]

            states_preprocessing_variables = [
                variable for name in self.states_preprocessing.keys() for variable in self.states_preprocessing[name].get_variables()
            ]

            explorations_variables = [
                variable for name in self.explorations.keys() for variable in self.explorations[name].get_variables()
            ]

            if self.reward_preprocessing is not None:
                reward_preprocessing_variables = self.reward_preprocessing.get_variables()
            else:
                reward_preprocessing_variables = list()

            if self.optimizer is None:
                optimizer_variables = list()
            else:
                optimizer_variables = self.optimizer.get_variables()

            # Deduplicate while preserving order (a plain set would lose it).
            variables = model_variables
            variables.extend([v for v in states_preprocessing_variables if v not in variables])
            variables.extend([v for v in explorations_variables if v not in variables])
            variables.extend([v for v in reward_preprocessing_variables if v not in variables])
            variables.extend([v for v in optimizer_variables if v not in variables])

            return variables

        else:
            return [self.variables[key] for key in sorted(self.variables)]

    def get_summaries(self):
        """
        Returns the TensorFlow summaries reported by the model

        Returns:
            List of summaries
        """
        return self.summaries

    def sync(self, sync_value):
        """
        Blocks until every distributed worker has reported `sync_value`.
        No-op when the model is not running in distributed mode.

        Args:
            sync_value: Value written into this worker's sync flag and compared
                against all workers' flags.
        """
        if self.distributed_spec is not None:
            self.monitored_session.run(self.set_sync_flag,
                feed_dict={self.given_sync_value:sync_value})
            # NOTE(review): busy-wait polling loop with no sleep/backoff; the
            # `print` below looks like left-over debugging output.
            is_synced = self.monitored_session.run(self.is_synced,
                feed_dict={self.given_sync_value:sync_value})
            while not is_synced:
                is_synced = self.monitored_session.run(self.is_synced,
                    feed_dict={self.given_sync_value:sync_value})
            print("synced")

    def reset(self):
        """
        Resets the model to its initial state on episode start.

        Returns:
            Current episode and timestep counter, and a list containing the internal states initializations.
        """
        # TODO preprocessing reset call moved from agent
        episode, timestep = self.monitored_session.run(fetches=(self.episode, self.timestep))
        return episode, timestep, list(self.internals_init)

    def act(self, states, internals, deterministic=False):
        """
        Computes actions (and successor internal states) for a single state or a
        batch of states.

        Args:
            states: Dict of state values; batched-ness is inferred by comparing
                a value's rank against the state spec's shape.
            internals: List of internal state values.
            deterministic: If true, the deterministic version of the actions is
                fetched (passed through to the graph's deterministic input).

        Returns:
            Tuple of actions dict, internals list and current timestep; results
            are unbatched again when the input was unbatched.
        """
        fetches = [self.actions_output, self.internals_output, self.timestep_output]

        feed_dict = {}
        if self.io_spec and self.io_spec.get("tensor") and self.io_spec.get("interactive"):
            # Interactive tensor I/O mode: only internals are fed here.
            # NOTE(review): presumably states arrive via graph tensors — confirm.
            batched = True
            feed_dict.update({internal_input: internals[n] for n, internal_input in enumerate(self.internals_input)})
        else:
            # Infer batching from the rank of an arbitrary state value.
            name = next(iter(self.states_spec))
            batched = (np.asarray(states[name]).ndim != len(self.states_spec[name]['shape']))
            if batched:
                feed_dict.update({state_input: states[name] for name, state_input in self.states_input.items()})
                feed_dict.update({internal_input: internals[n] for n, internal_input in enumerate(self.internals_input)})
            else:
                # Wrap single instances into a batch of size one.
                feed_dict.update({state_input: (states[name],) for name, state_input in self.states_input.items()})
                feed_dict.update({internal_input: (internals[n],) for n, internal_input in enumerate(self.internals_input)})

        feed_dict[self.deterministic_input] = deterministic
        feed_dict[self.update_input] = False

        actions, internals, timestep = self.monitored_session.run(fetches=fetches, feed_dict=feed_dict)

        if not batched:
            # Strip the batch dimension again.
            actions = {name: action[0] for name, action in actions.items()}
            internals = [internal[0] for internal in internals]

        if self.summary_configuration_op is not None:
            summary_values = self.session.run(self.summary_configuration_op)
self.summary_writer.add_summary(summary_values) self.summary_writer.flush() # Only do this operation once to reduce duplicate data in Tensorboard self.summary_configuration_op = None return actions, internals, timestep def observe(self, terminal, reward): fetches = self.increment_episode terminal = np.asarray(terminal) batched = (terminal.ndim == 1) if batched: feed_dict = {self.terminal_input: terminal, self.reward_input: reward, } else: feed_dict = {self.terminal_input: (terminal,), self.reward_input: (reward,)} feed_dict[self.update_input] = False episode = self.monitored_session.run(fetches=fetches, feed_dict=feed_dict) return episode def update(self, states, internals, actions, terminal, reward, return_loss_per_instance=False): """ model update entry, must be implemented in child classes. """ raise NotImplementedError def save(self, directory=None, append_timestep=True): """ Save TensorFlow model. If no checkpoint directory is given, the model's default saver directory is used. Optionally appends current timestep to prevent overwriting previous checkpoint files. Turn off to be able to load model from the same given path argument as given here. Args: directory: Optional checkpoint directory. append_timestep: Appends the current timestep to the checkpoint file if true. Returns: Checkpoint path were the model was saved. """ if self.summary_writer_hook is not None: self.summary_writer_hook._summary_writer.flush() return self.scaffold.saver.save( sess=self.session, save_path=(self.saver_directory if directory is None else directory), global_step=(self.timestep if append_timestep else None), # latest_filename=None, # Defaults to 'checkpoint'. meta_graph_suffix='meta', write_meta_graph=True, write_state=True ) def signature(self): # jones TO DO: internals_input is also necessary for computing actions_output if lstm network is used # in this case, no state can be named as 'deterministic'/'update'... 
inputs = dict([(name, tf.saved_model.utils.build_tensor_info(ts)) for name, ts in self.states_input.items()] + [('deterministic', tf.saved_model.utils.build_tensor_info(self.deterministic_input))] + [('update', tf.saved_model.utils.build_tensor_info(self.update_input))] ) self.states_input_var_name = dict([(name, inputs[name].name) for name in self.states_input.keys()]) self.deterministic_input_var_name = inputs['deterministic'].name self.update_input_var_name = inputs['update'].name outputs = dict([(name, tf.saved_model.utils.build_tensor_info(ts)) for name, ts in self.actions_output.items()]) self.actions_output_var_name = dict([(name, tsinfo.name) for name, tsinfo in outputs.items()]) return tf.saved_model.signature_def_utils.build_signature_def( inputs=inputs, outputs=outputs, method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME) def export(self): ''' export the saved_model to oss bucket please call this method after the monitored session has been closed ''' export_dir_base = self.saver_spec.get('export_directory') if not export_dir_base: print("export_directory is None") checkpoint = tf.train.latest_checkpoint(self.saver_directory) if not checkpoint: raise NotFittedError("Couldn't find trained model at %s." 
% self.saver_directory) export_dir = saved_model_export_utils.get_timestamped_export_dir(export_dir_base) temp_export_dir = export_dir builder = tf.saved_model.builder.SavedModelBuilder(temp_export_dir) signature_def_map = {"predict_actions":self.signature()} if self.distributed_spec: sess = tf.Session(target=self.server.target, graph=self.graph, config=self.session_config) else: sess = tf.Session(graph=self.graph) self.scaffold.saver.restore(sess, checkpoint) builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING], signature_def_map=signature_def_map, clear_devices=True) builder.save() return export_dir def export_session_bundle(self): export_dir_base = self.saver_spec.get('export_directory') if not export_dir_base: print("export_directory is None") checkpoint = tf.train.latest_checkpoint(self.saver_directory) if not checkpoint: raise NotFittedError("Couldn't find trained model at %s." % self.saver_directory) export_dir = saved_model_export_utils.get_timestamped_export_dir(export_dir_base) if self.distributed_spec: sess = tf.Session(target=self.server.target, graph=self.graph, config=self.session_config) else: sess = tf.Session(graph=self.graph) self.scaffold.saver.restore(sess, checkpoint) signature = {name: ts for name, ts in self.states_input.items()} signature["deterministic"] = self.deterministic_input signature["update"] = self.update_input exporter = Exporter(self.scaffold.saver) exporter.init(self.graph.as_graph_def(), clear_devices=True, default_graph_signature=generic_signature(signature)) exporter.export(export_dir_base=export_dir, global_step_tensor=self.timestep, sess=sess) return export_dir def load(self, path): ''' load the saved_model and make inference with it ''' if self.distributed_spec: self.session = tf.Session(self.server.target, config=self.session_config, graph=tf.Graph()) else: self.session = tf.Session(graph=tf.Graph()) self.session.__enter__() tf.saved_model.loader.load(self.session, 
[tf.saved_model.tag_constants.SERVING], path) self.graph = tf.get_default_graph() self.states_input = dict([(name, self.graph.get_tensor_by_name(var_name)) for name, var_name in self.states_input_var_name.items()]) self.deterministic_input = self.graph.get_tensor_by_name(self.deterministic_input_var_name) self.update_input = self.graph.get_tensor_by_name(self.update_input_var_name) self.actions_output = dict([(name, self.graph.get_tensor_by_name(var_name)) for name, var_name in self.actions_output_var_name.items()]) def predict(self, states, deterministic=True, update=False): feed_dict = {state_input: states[name] for name, state_input in self.states_input.items()} feed_dict[self.deterministic_input] = deterministic feed_dict[self.update_input] = update return self.session.run(self.actions_output, feed_dict) def restore(self, directory=None, file=None): """ Restore TensorFlow model. If no checkpoint file is given, the latest checkpoint is restored. If no checkpoint directory is given, the model's default saver directory is used (unless file specifies the entire path). Args: directory: Optional checkpoint directory. file: Optional checkpoint file, or path if directory not given. """ if file is None: file = tf.train.latest_checkpoint( checkpoint_dir=(self.saver_directory if directory is None else directory), ) elif directory is None: file = os.path.join(self.saver_directory, file) elif not os.path.isfile(file): file = os.path.join(directory, file) self.scaffold.saver.restore(sess=self.session, save_path=file)