python source code of dqn_loss

rlgraph-master
- readthedocs.yaml
- examples
  - impala_openai_gym_with_lstm.py
  - ppo_cartpole.py
  - impala_distributed_dmlab.py
  - actor_critic_cartpole.py
  - impala_cartpole.py
  - sac_pendulum.py
  - dqn_cartpole_with_tf_summaries.py
  - distributed_ppo_pendulum.py
  - configs
    - ppo_mlagents_walker.json
    - sac_pendulum.json
    - dqn_cartpole.json
    - impala_cartpole.json
    - impala_distributed_dmlab.json
    - impala_openai_gym_with_lstm.json
    - impala_distributed_clusterspec.json
    - ppo_mlagents_banana_collector.json
    - ppo_mlagents_3dball_hard.json
    - sac_mlagents_3dball_hard.json
    - actor_critic_cartpole.json
    - sac_mlagents_banana_collector.json
    - distributed_ppo_pendulum.json
    - apex_pong.json
    - ppo_cartpole.json
  - apex_pong.py
  - train_agent_openai.py
  - ppo_or_sac_on_mlagents.py
- LICENSE
- rlgraph
  - graphs
    - tensorflow_executor.py
    - graph_builder.py
    - graph_executor.py
    - meta_graph.py
    - meta_graph_builder.py
    - __init__.py
    - pytorch_executor.py
  - utils
    - decorators.py
    - debug_util.py
    - util.py
    - op_records.py
    - model_util.py
    - ops.py
    - config_util.py
    - define_by_run_ops.py
    - specifiable.py
    - specifiable_server.py
    - numpy.py
    - __init__.py
    - initializer.py
    - visualization_util.py
    - rlgraph_errors.py
    - tf_util.py
    - pytorch_util.py
    - input_parsing.py
  - version.py
  - components
    - action_adapters
      - action_adapter.py
      - bernoulli_distribution_adapter.py
      - beta_distribution_adapter.py
      - normal_distribution_adapter.py
      - __init__.py
      - categorical_distribution_adapter.py
      - action_adapter_utils.py
      - gumbel_softmax_distribution_adapter.py
      - squashed_normal_distribution_adapter.py
    - layers
      - layer.py
      - nn
        maxpool2d_layer.py
        residual_layer.py
        nn_layer.py
        conv2d_layer.py
        conv2d_transpose_layer.py
        __init__.py
        local_response_normalization_layer.py
        activation_functions.py
        multi_lstm_layer.py
        concat_layer.py
        dense_layer.py
        lstm_layer.py
      - strings
        embedding_lookup.py
        __init__.py
        string_to_hash_bucket.py
        string_layer.py
      - __init__.py
      - preprocessing
        moving_standardize.py
        rank_reinterpreter.py
        image_resize.py
        normalize.py
        preprocess_layer.py
        clip.py
        image_binary.py
        grayscale.py
        convert_type.py
        multiply_divide.py
        container_splitter.py
        __init__.py
        image_crop.py
        transpose.py
        concat.py
        reshape.py
        sequence.py
    - policies
      - policy.py
      - dueling_policy.py
      - __init__.py
      - shared_value_function_policy.py
      - dynamic_batching_policy.py
    - memories
      - replay_memory.py
      - fifo_queue.py
      - ring_buffer.py
      - queue_runner.py
      - mem_prioritized_replay.py
      - prioritized_replay.py
      - memory.py
      - __init__.py
    - distributions
      - beta.py
      - mixture_distribution.py
      - multivariate_normal.py
      - joint_cumulative_distribution.py
      - normal.py
      - bernoulli.py
      - distribution.py
      - gumbel_softmax.py
      - squashed_normal.py
      - __init__.py
      - categorical.py
    - helpers
      - softmax.py
      - sequence_helper.py
      - v_trace_function.py
      - mem_segment_tree.py
      - dynamic_batching.py
      - generalized_advantage_estimation.py
      - __init__.py
      - clipping.py
      - batcher.cc
      - segment_tree.py
    - models
      - supervised_model.py
      - model.py
      - __init__.py
      - intrinsic_curiosity_world_option_model.py
    - common
      - noise_components.py
      - sampler.py
      - container_merger.py
      - staging_area.py
      - decay_components.py
      - repeater_stack.py
      - batch_apply.py
      - softmax.py
      - synchronizable.py
      - batch_splitter.py
      - multi_gpu_synchronizer.py
      - slice.py
      - time_dependent_parameters.py
      - iterative_optimization.py
      - __init__.py
      - environment_stepper.py
    - neural_networks
      - variational_auto_encoder.py
      - sac
        sac_networks.py
        __init__.py
      - value_function.py
      - impala
        impala_networks.py
        __init__.py
      - actor_component.py
      - dict_preprocessor_stack.py
      - __init__.py
      - stack.py
      - multi_input_stream_neural_network.py
      - preprocessor_stack.py
      - neural_network.py
    - queues
      - __init__.py
    - component.py
    - __init__.py
    - loss_functions
      - categorical_cross_entropy_loss.py
      - dqn_loss_function.py
      - dqfd_loss_function.py
      - euclidian_distance_loss.py
      - supervised_loss_function.py
      - neg_log_likelihood_loss.py
      - container_loss_function.py
      - actor_critic_loss_function.py
      - __init__.py
      - sac_loss_function.py
      - impala_loss_function.py
      - ppo_loss_function.py
      - loss_function.py
    - optimizers
      - optimizer.py
      - horovod_optimizer.py
      - __init__.py
      - local_optimizers.py
    - explorations
      - exploration.py
      - __init__.py
      - epsilon_exploration.py
  - agents
    - dqn_agent.py
    - apex_agent.py
    - random_agent.py
    - dqfd_agent.py
    - __init__.py
    - ppo_agent.py
    - agent.py
    - impala_agents.py
    - sac_agent.py
    - actor_critic_agent.py
  - execution
    - environment_sample.py
    - ray
      - ray_util.py
      - ray_policy_worker.py
      - ray_actor.py
      - ray_executor.py
      - __init__.py
      - sync_batch_executor.py
      - ray_value_worker.py
      - apex
        ray_memory_actor.py
        apex_executor.py
        __init__.py
        apex_memory.py
    - single_threaded_worker.py
    - distributed_tf
      - impala
        __init__.py
        impala_worker.py
      - __init__.py
    - __init__.py
    - worker.py
  - __init__.py
  - tests
    - dummy_components.py
    - components
      - test_reshape_preprocessor.py
      - test_stack.py
      - test_noise_components.py
      - test_multi_input_stream_nn.py
      - test_prioritized_replay.py
      - test_neural_networks.py
      - test_batch_apply.py
      - test_batch_splitter.py
      - test_ring_buffer.py
      - test_dqn_loss_functions.py
      - test_container_splitter.py
      - test_time_dependent_parameters.py
      - test_string_layers.py
      - test_slice.py
      - test_container_merger.py
      - test_ppo_loss_functions.py
      - images
        4x4x3_image_resized.bmp
        16x16x3_image.bmp
        8x12x3_image_cropped.bmp
      - test_sac_loss_function.py
      - test_v_trace_function.py
      - test_policies.py
      - test_preprocess_layers.py
      - test_softmax.py
      - test_dict_preprocessor_stack.py
      - test_nn_layers.py
      - test_policies_on_container_actions.py
      - test_impala_loss_function.py
      - test_actor_components.py
      - test_supervised_loss_functions.py
      - test_synchronizable.py
      - test_fifo_queue.py
      - test_generalized_advantage_estimation.py
      - test_explorations.py
      - test_sequence_preprocessor.py
      - __init__.py
      - test_neural_networks_keras_style_assembly.py
      - test_local_optimizers.py
      - test_environment_stepper.py
      - test_decay_components.py
      - test_variational_auto_encoders.py
      - test_epsilon_exploration.py
      - test_python_prioritized_replay.py
      - test_action_adapters.py
      - test_sampler_component.py
      - test_distributions.py
      - test_replay_memory.py
      - test_component_copy.py
      - test_staging_area.py
      - test_sequence_helper.py
      - test_preprocessor_stacks.py
    - test_util.py
    - agent_test.py
    - agent_learning
      - short_tasks
        test_ppo_agent_short_task_learning.py
        test_dqn_agent_short_task_learning.py
        __init__.py
        test_impala_agent_short_task_learning.py
        test_sac_agent_short_task_learning.py
        test_actor_critic_agent_short_task_learning.py
      - __init__.py
      - long_tasks
        test_impala_agent_long_task_learning.py
        __init__.py
        test_dqn_agent_long_task_learning.py
        test_apex_agent_long_task_learning.py
    - execution
      - test_apex_executor.py
      - test_gpu_strategies.py
      - test_single_threaded_worker.py
      - __init__.py
      - test_sync_batch_executor.py
      - test_ray_value_worker.py
      - test_ray_policy_worker.py
    - dummy_components_with_sub_components.py
    - configs
      - multi_gpu_dqn_for_2x2_gridworld.json
      - multi_gpu_ppo_for_2x2_gridworld.json
      - impala_agent_for_2x2_gridworld.json
      - test_lstm_nn.json
      - sync_batch_ppo_cartpole.json
      - sac_agent_for_pong.json
      - apex_agent_for_random_env.json
      - dqn_agent_for_4x4_gridworld.json
      - apex_agent_for_2x2_gridworld_with_container_actions.json
      - test_3x_cnn_nn.json
      - test_vae_encoder_network.json
      - impala_agent_for_breakout.json
      - sac_agent_for_functionality_test.json
      - backend_performance_dqn_pong.json
      - sac_agent_for_pendulum.json
      - actor_critic_agent_for_cartpole.json
      - test_simple_nn.json
      - ppo_agent_for_2x2_gridworld.json
      - sac_agent_for_gaussian_density_env.json
      - dqn_agent_for_2x2_gridworld_single_to_container.json
      - dqfd_agent_for_cartpole.json
      - multi_gpu_ray_apex_for_pong.json
      - apex_agent_for_2x2_gridworld.json
      - actor_critic_agent_for_2x2_gridworld.json
      - dqn_agent_for_2x2_gridworld.json
      - ppo_agent_for_long_chain_gridworld.json
      - sac_agent_for_2x2_gridworld_with_container_actions.json
      - actor_critic_agent_for_pong.json
      - dqn_agent_for_cartpole.json
      - backend_performance_dqn_cartpole.json
      - sac_cartpole_on_apex.json
      - dqn_agent_for_functionality_test.json
      - impala_agent_for_cartpole.json
      - ppo_agent_for_4_room_gridworld.json
      - test_complex_multi_stream_nn_with_lstm.json
      - dqn_pytorch_test.json
      - dqn_agent_for_random_env.json
      - ppo_agent_for_random_env_with_container_spaces.json
      - test_vae_decoder_network.json
      - sync_batch_ppo_gridworld_with_container_actions.json
      - multi_gpu_dqn_for_random_env.json
      - ppo_agent_for_cartpole.json
      - dqn_vector_env.json
      - ppo_agent_for_2x2_gridworld_with_container_actions.json
      - ray_sac_pong_test.json
      - sac_component_for_fake_env_test.json
      - ppo_agent_for_pendulum.json
      - ray_apex_for_pong.json
      - dqfd_container.json
      - sac_agent_for_cartpole.json
      - dqn_agent_for_2x2_gridworld_with_container_actions.json
      - sync_batch_ppo_pendulum.json
      - test_dense_to_lstm_nn.json
      - dqn_agent_for_pong.json
      - test_large_dense_nn.json
      - ppo_agent_for_4x4_gridworld.json
      - test_lrelu_nn.json
      - ppo_agent_for_pong.json
      - impala_agent_for_deepmind_lab_env.json
      - apex_agent_cartpole.json
      - impala_agent_for_random_env.json
    - __init__.py
    - visualization
      - test_visualizations.py
      - __init__.py
    - core
      - test_api_methods.py
      - test_device_placements.py
      - test_input_incomplete_build.py
      - test_spaces.py
      - test_single_components.py
      - test_input_space_checking.py
      - test_specifiables.py
      - test_pytorch_backend.py
      - test_graph_fns.py
      - __init__.py
      - test_pytorch_util.py
      - test_specifiable_server.py
    - environments
      - test_deepmind_lab.py
      - test_random_env.py
      - test_openai_gym_atari.py
      - test_readme_example.py
      - test_grid_world.py
      - test_ml_agents_env.py
      - test_deterministic_env.py
      - __init__.py
      - test_sequential_vector_env.py
    - component_test.py
    - performance
      - test_python_memory_performance.py
      - test_time_rank_folding_performance.py
      - test_backends.py
      - test_single_threaded_dqn.py
      - test_multi_gpu_updates.py
      - __init__.py
      - test_vector_env.py
      - test_tf_memory_performance.py
    - agent_functionality
      - test_apex_agent_functionality.py
      - test_all_compile.py
      - test_sac_agent_functionality.py
      - test_base_agent_functionality.py
      - __init__.py
      - test_impala_agent_functionality.py
      - test_dqfd_agent_functionality.py
      - test_ppo_agent_functionality.py
      - test_dqn_agent_functionality.py
  - spaces
    - bool_box.py
    - float_box.py
    - int_box.py
    - box_space.py
    - text_box.py
    - containers.py
    - space.py
    - __init__.py
    - space_utils.py
  - environments
    - mlagents_env.py
    - random_env.py
    - grid_world.py
    - deepmind_lab.py
    - images
    - sequential_vector_env.py
    - gaussian_density_as_reward_env.py
    - deterministic_env.py
    - openai_gym.py
    - __init__.py
    - vizdoom.py
    - environment.py
    - vector_env.py
- setup.py
- docker
  - travis
    - Dockerfile
  - minimal
    - Dockerfile
  - full_from_tf
    - Dockerfile
  - README.md
  - full
    - Dockerfile
- FAQ.md
- .travis.yml
- README.md
- RELEASE_NOTES.md
- .gitignore
- Jenkinsfile
- docs
  - rlgraphs_testing_system.rst
  - components.rst
  - Makefile
  - make.bat
  - intro.rst
  - spaces.rst
  - images
    - image_ppt.pptx
  - environments.rst
  - agents.rst
  - op_records_and_data_ops.rst
  - reference
    - components
      - explorations_reference.rst
      - neural_networks_reference.rst
      - action_adapters_reference.rst
      - papers_reference.rst
      - optimizers_reference.rst
      - layers
        string_layers.rst
        preprocessing_layers.rst
        base_class.rst
        neural_network_layers.rst
      - queues_reference.rst
      - component_base.rst
      - distributions_reference.rst
      - loss_functions_reference.rst
      - layers_reference.rst
      - helpers_reference.rst
      - index.rst
      - common_components_reference.rst
      - memories_reference.rst
    - agents
      - index.rst
    - core
      - index.rst
    - index.rst
    - spaces
      - index.rst
    - environments
      - index.rst
  - __init__.py
  - how_to_build_an_algorithm_with_rlgraph.rst
  - index.rst
  - conf.py
  - readme_link.rst
  - how_to_write_our_own_component.rst
  - pypi_instructions.txt
  - _static
    - rlgraph_sphinx.css
  - complete_custom_component.rst
- contrib
  - bitflip_env
    - rlgraph
      - examples
        random_bitflip.py
        configs
        random_bitflip.json
      - tests
        environments
        test_openai_gym_atari.py
      - environments
        __init__.py
        custom
        __init__.py
        openai
        __init__.py
        envs
        bit_flip.py
        __init__.py
    - Bitflip_README.md
  - README.md

# Copyright 2018/2019 The RLgraph authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from rlgraph import get_backend
from rlgraph.components.loss_functions import LossFunction
from rlgraph.spaces import IntBox
from rlgraph.spaces.space_utils import sanity_check_space
from rlgraph.utils import pytorch_one_hot
from rlgraph.utils.decorators import rlgraph_api, graph_fn
from rlgraph.utils.pytorch_util import pytorch_tile
from rlgraph.utils.util import get_rank

if get_backend() == "tf":
    import tensorflow as tf
elif get_backend() == "pytorch":
    import torch


class DQNLossFunction(LossFunction):
    """
    The classic 2015 DQN Loss Function [1] with options for "double" Q-losses [2], Huber loss [3], and container
    actions [4]:

    L = Expectation-over-uniform-batch(r + gamma x max_a'Qt(s',a') - Qn(s,a))^2
    Where Qn is the "normal" Q-network and Qt is the "target" net (which is a little behind Qn for stability purposes).

    [1] Human-level control through deep reinforcement learning. Mnih, Kavukcuoglu, Silver et al. - 2015
    [2] Deep Reinforcement Learning with Double Q-learning. v. Hasselt, Guez, Silver - 2015
    [3] https://en.wikipedia.org/wiki/Huber_loss
    [4] Action Branching Architectures for Deep Reinforcement Learning. Tavakoli, Pardo, and Kormushev - 2017
    """
    def __init__(self, double_q=False, huber_loss=False, importance_weights=False, n_step=1,
                 shared_container_action_target=True, scope="dqn-loss-function", **kwargs):
        """
        Args:
            double_q (bool): Whether to use the double DQN loss function ([2]).
            huber_loss (bool): Whether to apply a huber loss correction ([3]).
            importance_weights (bool): Whether to use importance weights from a prioritized replay.
            n_step (int): n-step adjustment to discounting. Must be >= 1.

            shared_container_action_target (bool): Whether - only in the case of container actions - the target term
                should be shared (average) over all action components' single loss terms. Default: True.

        Keyword Args:
            huber_delta (float): The clip value (delta) used by the Huber loss ([3]). Default: 1.0.
                This key is consumed here and is NOT forwarded to the parent constructor.

            All remaining keyword arguments are passed on to the parent constructor together with `scope`.
        """
        self.double_q = double_q
        self.huber_loss = huber_loss
        assert n_step >= 1, "Number of steps for n-step learning must be >= 1, is {}".format(n_step)
        # TODO reward must be preprocessed to work correctly for n-step.
        # For Apex, this is done in the worker - do we want to move this as an in-graph option too?
        self.n_step = n_step
        self.shared_container_action_target = shared_container_action_target

        # Clip value, see: https://en.wikipedia.org/wiki/Huber_loss
        # Use `pop` (not `get`) so that this loss-specific option is removed from kwargs before they are
        # handed to the parent constructor, which has no use for it.
        self.huber_delta = kwargs.pop("huber_delta", 1.0)
        self.importance_weights = importance_weights

        super(DQNLossFunction, self).__init__(scope=scope, **kwargs)

        # Both are filled in by `check_input_spaces` once the action space is known.
        self.flat_action_space = None
        self.ranks_to_reduce = 0  # How many ranks do we have to reduce to get down to the final loss per batch item?

    def check_input_spaces(self, input_spaces, action_space=None):
        """
        Validates the action Space and derives the settings that depend on it.

        Stores the given `action_space` and its flattened version on this component, checks that it is made up
        of IntBox(es) with categories, and computes how many (non-batch) ranks need to be reduced to end up
        with one loss value per batch item.

        Args:
            input_spaces: The input Spaces of this component (not used by this method).
            action_space: The action Space to check. Must not be None.
        """
        assert action_space is not None

        self.action_space = action_space
        self.flat_action_space = action_space.flatten()

        # Only IntBox(es) that define a number of categories are supported.
        sanity_check_space(
            self.action_space,
            must_have_categories=True,
            allowed_sub_types=[IntBox]
        )

        # Everything but the batch rank has to be reduced.
        shape_with_batch_rank = self.action_space.get_shape(with_batch_rank=True)
        self.ranks_to_reduce = len(shape_with_batch_rank) - 1

    @rlgraph_api
    def loss(self, q_values_s, actions, rewards, terminals, qt_values_sp, q_values_sp=None, importance_weights=None):
        """
        API-method computing both the reduced loss and the loss per batch item.

        All arguments are passed through unchanged (and in the same order) to `loss_per_item`; the per-item result
        is then handed to `self.loss_average` to obtain the total loss.

        Returns:
            tuple:
                - The total loss (output of `self.loss_average`).
                - The loss per item (output of `self.loss_per_item`).
        """
        per_item = self.loss_per_item(
            q_values_s, actions, rewards, terminals, qt_values_sp, q_values_sp, importance_weights
        )
        return self.loss_average(per_item), per_item

    @rlgraph_api
    def loss_per_item(self, q_values_s, actions, rewards, terminals, qt_values_sp, q_values_sp=None,
                      importance_weights=None):
        """
        API-method chaining the single graph functions that produce the loss per batch item.

        Steps (in this order):
        1) TD-targets are computed from `rewards`, `terminals`, `qt_values_sp` and (optionally) `q_values_sp`.
        2) Only if `self.shared_container_action_target` is True: The targets are averaged over the container keys.
        3) The (optionally importance-weighted) TD-deltas are computed from the targets, `q_values_s` and `actions`.
        4) The deltas are averaged over the container keys.
        5) The squared (or Huber) loss is applied to the deltas.

        Returns:
            The output of `_graph_fn_apply_huber_loss_if_necessary` for the averaged deltas.
        """
        # 1) + 2) Targets (per action component), optionally shared across all components.
        targets = self._graph_fn_get_td_targets(rewards, terminals, qt_values_sp, q_values_sp)
        if self.shared_container_action_target is True:
            targets = self._graph_fn_average_over_container_keys(targets)

        # 3) + 4) Deltas per action component, then merged into one value per item.
        deltas = self._graph_fn_loss_per_item(targets, q_values_s, actions, importance_weights)
        deltas = self._graph_fn_average_over_container_keys(deltas)

        # 5) Final loss shape (quadratic or Huber).
        return self._graph_fn_apply_huber_loss_if_necessary(deltas)

    @graph_fn(flatten_ops=True, split_ops=True, add_auto_key_as_first_param=True)
    def _graph_fn_get_td_targets(self, key, rewards, terminals, qt_values_sp, q_values_sp=None):
        """
        Computes the TD-targets: r + (discount ** n_step) * Qt(s',a'), where the Qt(s',a') term is replaced by 0.0
        for terminal s'. a' is the argmax of the target net's Q-values or - if `self.double_q` is True - the argmax
        of the main net's Q-values (`q_values_sp`).

        Args:
            key: The auto-key of the (flattened) action component currently processed. Used to look up the
                number of categories in `self.flat_action_space`.
            rewards (SingleDataOp): The batch of rewards that we received after having taken a in s (from a memory).
            terminals (SingleDataOp): The batch of terminal signals that we received after having taken a in s
                (from a memory).
            qt_values_sp (SingleDataOp): The batch of Q-values representing the expected accumulated discounted
                returns (estimated by the target net) when in s' and taking different actions a'.
            q_values_sp (Optional[SingleDataOp]): If `self.double_q` is True: The batch of Q-values representing the
                expected accumulated discounted returns (estimated by the (main) policy net) when in s' and taking
                different actions a'.

        Returns:
            SingleDataOp: The target values vector.
        """
        qt_sp_ap_values = None

        # Numpy backend primarily for testing purposes.
        if self.backend == "python" or get_backend() == "python":
            from rlgraph.utils.numpy import one_hot
            if self.double_q:
                a_primes = np.argmax(q_values_sp, axis=-1)
                a_primes_one_hot = one_hot(a_primes, depth=self.flat_action_space[key].num_categories)
                qt_sp_ap_values = np.sum(qt_values_sp * a_primes_one_hot, axis=-1)
            else:
                qt_sp_ap_values = np.max(qt_values_sp, axis=-1)

            # Make sure rewards AND terminals (one value per batch item each) broadcast correctly against
            # (possibly higher-rank) Q-values. Without expanding terminals as well, `np.where` below cannot
            # broadcast a (batch,) condition against (batch, ...) values.
            for _ in range(qt_sp_ap_values.ndim - 1):
                rewards = np.expand_dims(rewards, axis=1)
                terminals = np.expand_dims(terminals, axis=1)

            # Use 0.0 as the state-action value for terminal s'.
            qt_sp_ap_values = np.where(terminals, np.zeros_like(qt_sp_ap_values), qt_sp_ap_values)

        elif get_backend() == "tf":
            # Make sure the target policy's outputs are treated as constant when calculating gradients.
            qt_values_sp = tf.stop_gradient(qt_values_sp)

            if self.double_q:
                # For double-Q, we no longer use the max(a')Qt(s'a') value.
                # Instead, the a' used to get the Qt(s'a') is given by argmax(a') Q(s',a') <- Q=q-net, not target net!
                a_primes = tf.argmax(input=q_values_sp, axis=-1)

                # Now lookup Q(s'a') with the calculated a'.
                one_hot = tf.one_hot(indices=a_primes, depth=self.flat_action_space[key].num_categories)
                qt_sp_ap_values = tf.reduce_sum(input_tensor=(qt_values_sp * one_hot), axis=-1)
            else:
                # Qt(s',a') -> Use the max(a') value (from the target network).
                qt_sp_ap_values = tf.reduce_max(input_tensor=qt_values_sp, axis=-1)

            # Make sure the rewards vector (batch) is broadcast correctly.
            for _ in range(get_rank(qt_sp_ap_values) - 1):
                rewards = tf.expand_dims(rewards, axis=1)

            # Ignore Q(s'a') values if s' is a terminal state. Instead use 0.0 as the state-action value for s'a'.
            # Note that in that case, the next_state (s') is not the correct next state and should be disregarded.
            # See Chapter 3.4 in "RL - An Introduction" (2017 draft) by A. Barto and R. Sutton for a detailed analysis.
            qt_sp_ap_values = tf.where(
                condition=terminals, x=tf.zeros_like(qt_sp_ap_values), y=qt_sp_ap_values
            )

        elif get_backend() == "pytorch":
            if not isinstance(terminals, torch.ByteTensor):
                terminals = terminals.byte()
            # Add batch dim in case of single sample.
            # The batch rank is the leading one (all reductions above/below leave axis 0 alone), hence insert
            # it at position 0: A single (num-actions,) Q-vector must become (1, num-actions), not
            # (num-actions, 1), or the max/argmax over the last axis would not reduce over the actions.
            if qt_values_sp.dim() == 1:
                rewards = rewards.unsqueeze(0)
                terminals = terminals.unsqueeze(0)
                # `q_values_sp` is optional (only needed for double-Q).
                if q_values_sp is not None:
                    q_values_sp = q_values_sp.unsqueeze(0)
                qt_values_sp = qt_values_sp.unsqueeze(0)

            # Make sure the target policy's outputs are treated as constant when calculating gradients.
            qt_values_sp = qt_values_sp.detach()
            if self.double_q:
                # For double-Q, we no longer use the max(a')Qt(s'a') value.
                # Instead, the a' used to get the Qt(s'a') is given by argmax(a') Q(s',a') <- Q=q-net, not target net!
                a_primes = torch.argmax(q_values_sp, dim=-1, keepdim=True)

                # Now lookup Q(s'a') with the calculated a'.
                one_hot = pytorch_one_hot(a_primes, depth=self.flat_action_space[key].num_categories)
                qt_sp_ap_values = torch.sum(qt_values_sp * one_hot.squeeze(), dim=-1)
            else:
                # Qt(s',a') -> Use the max(a') value (from the target network).
                qt_sp_ap_values = torch.max(qt_values_sp, -1)[0]

            # Make sure the rewards vector (batch) is broadcast correctly.
            for _ in range(get_rank(qt_sp_ap_values) - 1):
                rewards = torch.unsqueeze(rewards, dim=1)

            # Ignore Q(s'a') values if s' is a terminal state. Instead use 0.0 as the state-action value for s'a'.
            # Note that in that case, the next_state (s') is not the correct next state and should be disregarded.
            # See Chapter 3.4 in "RL - An Introduction" (2017 draft) by A. Barto and R. Sutton for a detailed analysis.
            # torch.where cannot broadcast here, so tile and reshape to same shape.
            if qt_sp_ap_values.dim() > 1:
                num_tiles = np.prod(qt_sp_ap_values.shape[1:])
                terminals = pytorch_tile(terminals, num_tiles, -1).reshape(qt_sp_ap_values.shape)
            qt_sp_ap_values = torch.where(
                terminals, torch.zeros_like(qt_sp_ap_values), qt_sp_ap_values
            )

        # r + gamma^n * Qt(s',a') (with the Q-term already zeroed out for terminal s').
        td_targets = (rewards + (self.discount ** self.n_step) * qt_sp_ap_values)
        return td_targets

    @graph_fn(flatten_ops=True, split_ops=True, add_auto_key_as_first_param=True)
    def _graph_fn_loss_per_item(self, key, td_targets, q_values_s, actions, importance_weights=None):
        """
        Computes the TD-deltas (target - Q(s,a)), reduces them (mean) over any additional action ranks and
        - if `self.importance_weights` is True - multiplies them once with the given importance weights.
        Note that the squaring/Huber step is NOT done here (see `_graph_fn_apply_huber_loss_if_necessary`).

        Args:
            key: The auto-key of the (flattened) action component currently processed. Used to look up the
                number of categories in `self.flat_action_space`.

            td_targets (SingleDataOp): The already calculated TD-target terms (r + gamma maxa'Qt(s',a')
                OR for double Q: r + gamma Qt(s',argmaxa'(Q(s',a'))))

            q_values_s (SingleDataOp): The batch of Q-values representing the expected accumulated discounted returns
                when in s and taking different actions a.

            actions (SingleDataOp): The batch of actions that were actually taken in states s (from a memory).

            importance_weights (Optional[SingleDataOp]): If 'self.importance_weights' is True: The batch of weights to
                apply to the losses.

        Returns:
            SingleDataOp: The (optionally importance-weighted) TD-delta values vector (one single value for each
                batch item).
        """
        # Numpy backend primarily for testing purposes.
        if self.backend == "python" or get_backend() == "python":
            from rlgraph.utils.numpy import one_hot

            actions_one_hot = one_hot(actions, depth=self.flat_action_space[key].num_categories)
            q_s_a_values = np.sum(q_values_s * actions_one_hot, axis=-1)

            td_delta = td_targets - q_s_a_values

            # Reduce over the composite actions, if any.
            # The importance weights are deliberately NOT applied here: They are applied exactly once at the
            # end of this method (same as for the other backends). `axis` must be a tuple for numpy.
            if td_delta.ndim > 1:
                td_delta = np.mean(td_delta, axis=tuple(range(1, self.ranks_to_reduce + 1)))

        elif get_backend() == "tf":
            # Q(s,a) -> Use the Q-value of the action actually taken before.
            one_hot = tf.one_hot(indices=actions, depth=self.flat_action_space[key].num_categories)
            q_s_a_values = tf.reduce_sum(input_tensor=(q_values_s * one_hot), axis=-1)

            # Calculate the TD-delta (target - current estimate).
            td_delta = td_targets - q_s_a_values

            # Reduce over the composite actions, if any.
            if get_rank(td_delta) > 1:
                td_delta = tf.reduce_mean(input_tensor=td_delta, axis=list(range(1, self.ranks_to_reduce + 1)))

        elif get_backend() == "pytorch":
            # Add batch dim in case of single sample.
            # The batch rank is the leading one, hence insert it at position 0: A single (num-actions,) Q-vector
            # must become (1, num-actions) to line up with the one-hot actions below.
            if q_values_s.dim() == 1:
                q_values_s = q_values_s.unsqueeze(0)
                actions = actions.unsqueeze(0)
                if self.importance_weights:
                    importance_weights = importance_weights.unsqueeze(0)

            # Q(s,a) -> Use the Q-value of the action actually taken before.
            one_hot = pytorch_one_hot(actions, depth=self.flat_action_space[key].num_categories)
            q_s_a_values = torch.sum((q_values_s * one_hot), -1)

            # Calculate the TD-delta (target - current estimate).
            td_delta = td_targets - q_s_a_values

            # Reduce over the composite actions, if any.
            if get_rank(td_delta) > 1:
                td_delta = torch.mean(td_delta, tuple(range(1, self.ranks_to_reduce + 1)), keepdim=False)

        # Apply importance-weights from a prioritized replay to the loss.
        if self.importance_weights:
            return importance_weights * td_delta
        else:
            return td_delta

    @graph_fn
    def _graph_fn_apply_huber_loss_if_necessary(self, td_delta):
        """
        Converts TD-deltas into loss values.

        If `self.huber_loss` is True, the Huber loss ([3]) with clip value `self.huber_delta` is used:
            0.5 * d^2                                   if |d| <= huber_delta
            huber_delta * (|d| - 0.5 * huber_delta)     otherwise
        Otherwise, the plain quadratic loss 0.5 * d^2 is returned.

        Args:
            td_delta (SingleDataOp): The TD-delta values (one per batch item).

        Returns:
            SingleDataOp: The loss values, same shape as `td_delta`.
        """
        if self.backend == "python" or get_backend() == "python":
            if self.huber_loss:
                # NOTE: `np.where` (and numpy ufuncs such as `np.square`) only accept their array arguments
                # positionally; passing `condition=`/`x=`/`y=` raises a TypeError.
                return np.where(
                    np.abs(td_delta) <= self.huber_delta,
                    0.5 * np.square(td_delta),
                    self.huber_delta * (np.abs(td_delta) - 0.5 * self.huber_delta)
                )
            else:
                return 0.5 * np.square(td_delta)
        elif get_backend() == "tf":
            if self.huber_loss:
                return tf.where(
                    condition=tf.abs(x=td_delta) <= self.huber_delta,
                    x=0.5 * tf.square(x=td_delta),
                    y=self.huber_delta * (tf.abs(x=td_delta) - 0.5 * self.huber_delta)
                )
            else:
                return 0.5 * tf.square(x=td_delta)
        elif get_backend() == "pytorch":
            if self.huber_loss:
                # Not certain if arithmetics need to be expressed via torch operators.
                return torch.where(
                    torch.abs(td_delta) <= self.huber_delta,
                    # PyTorch has no `square`
                    0.5 * torch.pow(td_delta, 2),
                    self.huber_delta * (torch.abs(td_delta) - 0.5 * self.huber_delta)
                )
            else:
                return 0.5 * td_delta * td_delta