python source code of utils

"""Utility and helper functions for the RL simulators."""

import matplotlib.pylab as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

###############################
# Define policies for RL agents
###############################


class Policy(nn.Module):
    """Base class for policies."""

    def __init__(self, env):
        super(Policy, self).__init__()
        self.ac_dim = env.action_space.n
        self.obs_dim = env.observation_space.shape[0]

    def sample_action(self, obs):
        """Sample an action for the given observation.

        Parameters
        ----------
        obs: A numpy array of shape [obs_dim].

        Returns 
        -------
        An integer, the action sampled.
        """
        raise NotImplementedError


class NNPolicy(Policy):
    """A neural network policy with one hidden layer."""

    def __init__(self, env, n_layers=1, hidden_dim=8, activation=nn.ReLU):
        super(NNPolicy, self).__init__(env)
        layers = []
        in_dim = self.obs_dim
        for _ in range(n_layers):
            layers.append(nn.BatchNorm1d(in_dim, affine=False))
            layers.append(nn.Linear(in_dim, hidden_dim))
            layers.append(activation())
            in_dim = hidden_dim
        layers.append(nn.Linear(in_dim, self.ac_dim))
        self.seq = nn.Sequential(*layers)
        # Start with completely random action.
        self.epsilon = 1.0

    def forward(self, obs):
        """Compute action logits from observation.
        
        Parameters
        ----------
        obs: A Tensor of shape [batch_size, obs_dim].
        
        Returns
        -------
        A Tensor of shape [batch_size, ac_dim].
        """
        return self.seq.double()((obs.double()))

    def log_prob(self, obs, action):
        """Compute the log probability of an action under the given observation.
        
        Parameters
        ----------
        obs: A Tensor of shape [batch_size, obs_dim].
        action: A Tensor of shape [batch_size, 1].
        
        Returns
        -------
        A Tensor of shape [batch_size, 1], the log probabilities of the actions.
        """
        log_probs = nn.functional.log_softmax(self.forward(obs), dim=1)[
            :,
        ]
        action_one_hot = nn.functional.one_hot(action, num_classes=self.ac_dim)
        return torch.sum(log_probs * action_one_hot, dim=1)

    def sample_action(self, obs):
        """Sample an action for the given observation.
        
        Parameters
        ----------
        obs: A numpy array of shape [obs_dim].
        
        Returns
        -------
        An integer, the action sampled.
        """
        if np.random.random() < self.epsilon:
            return np.random.randint(self.ac_dim)
        if len(obs.shape) != 1 or obs.shape[0] != self.obs_dim:
            raise ValueError(
                "Expected input observation shape [obs_dim], got %s" % str(obs.shape)
            )
        obs = torch.tensor(obs.reshape(1, -1), dtype=torch.float64)
        # When sampling use eval mode.
        return (
            torch.distributions.Categorical(logits=self.eval().double().forward(obs))
            .sample()
            .item()
        )


class PolicyGradientAgent:
    """An agent that learns a neural network policy through policy gradient."""

    def __init__(self, env, gamma=0.99, learning_rate=1e-3):
        # Discount factor gamma.
        self.gamma = gamma
        self.actor = NNPolicy(env)
        self.optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate)

    def train(self, trajectories):
        """Update the policy according to policy gradients.
        
        Parameters
        ----------
        trajectories: A list of dictionaries. Each dictionary has keys `observation`,
        `action`, `reward`, `next_observation`, `terminal`, each mapping to a numpy array
        of shape [num_steps, ?].
        
        Returns
        -------
        A scalar, training loss.
        """
        obs = np.concatenate([tau["observation"] for tau in trajectories], axis=0)
        acs = np.concatenate([tau["action"] for tau in trajectories], axis=0)
        actions_log_prob = self.actor.log_prob(
            torch.tensor(obs, dtype=torch.float64), torch.tensor(acs, dtype=torch.int64)
        )

        reward_to_go = np.concatenate(
            [self._reward_to_go(tau["reward"]) for tau in trajectories]
        )
        advantage = (reward_to_go - np.mean(reward_to_go)) / (
            np.std(reward_to_go) + 1e-8
        )
        if reward_to_go.shape[0] != acs.shape[0]:
            raise ValueError(
                "Array dimension mismatch, expected same number of rewards and "
                "actions. Observed %d rewards, %d actions."
                % (reward_to_go.shape[0], acs.shape[0])
            )

        loss = -torch.mean(
            actions_log_prob * torch.tensor(advantage, dtype=torch.float64)
        )
        # Update network weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def _reward_to_go(self, rewards):
        """Compute discounted reward to go.
        
        Parameters
        ----------
        rewards: A list of rewards {r_0, r_1, ..., r_t', ... r_{T-1}} from a single
        rollout of length T.
        
        Returns
        -------
        A numpy array where the entry at index t is sum_{t'=t}^{T-1} gamma^(t'-t) * r_{t'}.
        """
        all_discounted_cumsums = []
        # for loop over steps (t) of the given rollout
        for start_time_index in range(len(rewards)):
            indices = np.arange(start_time_index, len(rewards))
            discounts = self.gamma ** (indices - start_time_index)
            all_discounted_cumsums.append(sum(discounts * rewards[start_time_index:]))
        return np.array(all_discounted_cumsums)


###################################################
# Utilities for sampling trajectories and training.
###################################################


def run_training_loop(
    env, n_iter=200, max_episode_length=100, batch_size=512, learning_rate=1e-2
):
    """Trains a neural network policy using policy gradients.

    Parameters
    ----------
    n_iter: number of training iterations
    max_episode_length: episode length, up to 400
    batch_size: number of steps used in each iteration
    learning_rate: learning rate for the Adam optimizer

    Returns
    -------
    A Policy instance, the trained policy.
    """
    total_timesteps = 0
    agent = PolicyGradientAgent(env=env, learning_rate=learning_rate)
    avg_rewards = np.zeros(n_iter)
    avg_episode_lengths = np.zeros(n_iter)
    loss = np.zeros(n_iter)
    for itr in range(n_iter):
        if itr % 10 == 0:
            print(f"*****Iteration {itr}*****")
        trajectories, timesteps_this_itr = sample_trajectories_by_batch_size(
            env, agent.actor, batch_size, max_episode_length
        )
        total_timesteps += timesteps_this_itr
        avg_rewards[itr] = np.mean(
            [get_trajectory_total_reward(tau) for tau in trajectories]
        )
        avg_episode_lengths[itr] = np.mean(
            [get_trajectory_len(tau) for tau in trajectories]
        )
        loss[itr] = agent.train(trajectories).item()
        # Update rule for epsilon s.t. after 100 iterations it's around 0.05.
        agent.actor.epsilon = np.maximum(0.05, agent.actor.epsilon * 0.97)

    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True, figsize=[9, 9])
    ax1.plot(avg_rewards)
    ax1.set_xlabel("number of iterations")
    ax1.set_ylabel("average total reward")
    ax1.set_ylim(avg_rewards.min(), avg_rewards.max())
    ax2.plot(loss)
    ax2.set_xlabel("number of iterations")
    ax2.set_ylabel("training loss")
    ax2.set_ylim(loss.min(), loss.max())
    ax3.plot(avg_episode_lengths)
    ax3.set_xlabel("number of iterations")
    ax3.set_ylabel("average episode length")
    ax3.set_ylim(avg_episode_lengths.min(), avg_episode_lengths.max())
    plt.show()

    agent.actor.epsilon = 0.0
    return agent.actor


def sample_n_trajectories(env, policy, n_trajectories, max_episode_length):
    """Sample n trajectories using the given policy.
    
    Parameters
    ----------
    env: A WhyNot gym environment.
    policy: An instance of Policy.
    n_trajectories: Number of trajectories.
    max_episode_length: Cap on max length for each episode.
    
    Returns
    -------
    A list of n_trajectories dictionaries, each dictionary maps keys `observation`, `action`,
    `reward`, `next_observation`, `terminal` to numpy arrays of size episode length.
    """
    trajectories = []
    for i in range(n_trajectories):
        # collect rollout
        trajectory = sample_trajectory(env, policy, max_episode_length)
        trajectories.append(trajectory)
    return trajectories


def sample_trajectories_by_batch_size(
    env, policy, min_timesteps_per_batch, max_episode_length
):
    """Sample multiple trajectories using the given policy to achieve total number of steps.
    
    Parameters
    ----------
    env: A WhyNot gym environment.
    policy: An instance of Policy.
    min_timesteps_per_batch: Desired number of timesteps in all trajectories combined.
    max_episode_length: Cap on max length for each episode.
    
    Returns
    -------
    A list of n dictionaries, each dictionary maps keys `observation`, `action`,
    `reward`, `next_observation`, `terminal` to numpy arrays of size episode length.
    """
    timesteps_this_batch = 0
    trajectories = []
    while timesteps_this_batch < min_timesteps_per_batch:
        trajectory = sample_trajectory(env, policy, max_episode_length)
        trajectories.append(trajectory)
        timesteps_this_batch += get_trajectory_len(trajectory)
    return trajectories, timesteps_this_batch


def sample_trajectory(env, policy, max_episode_length):
    """Sample one trajectories using the given policy.
    
    Parameters
    ----------
    env: A WhyNot gym environment.
    policy: An instance of Policy.
    max_episode_length: Cap on max length for each episode.
    
    Returns
    -------
    A  dictionary, each dictionary maps keys `observation`, `action`, `reward`,
    `next_observation`, `terminal` to numpy arrays of size episode length.
    """
    # initialize env for the beginning of a new rollout
    ob = env.reset()
    obs, acs, rewards, next_obs, terminals = [], [], [], [], []
    steps = 0
    while True:
        # use the most recent ob to decide what to do
        obs.append(ob)
        ac = policy.sample_action(ob)
        acs.append(ac)
        # take that action and record results
        ob, rew, done, _ = env.step(ac)
        # record result of taking that action
        steps += 1
        next_obs.append(ob)
        rewards.append(rew)
        # End the rollout if the rollout ended
        # Note that the rollout can end due to done, or due to max_episode_length
        if done or steps > max_episode_length:
            rollout_done = 1
        else:
            rollout_done = 0
        terminals.append(rollout_done)
        if rollout_done:
            break
    return wrap_trajectory(obs, acs, rewards, next_obs, terminals)


def wrap_trajectory(obs, acs, rewards, next_obs, terminals):
    """Encapsulate a full trajectory in a dictionary."""
    return {
        "observation": np.array(obs, dtype=np.float32),
        "reward": np.array(rewards, dtype=np.float32),
        "action": np.array(acs, dtype=np.float32),
        "next_observation": np.array(next_obs, dtype=np.float32),
        "terminal": np.array(terminals, dtype=np.float32),
    }


def get_trajectory_len(trajectory):
    """Get the total number of actions in the trajectory."""
    return len(trajectory["action"])


def get_trajectory_total_reward(trajectory):
    """Get the accumulated reward for the trajectory."""
    return np.sum(trajectory["reward"])


####################
# Plotting utilities
####################
def plot_sample_trajectory(env, policies, max_episode_length, state_names):
    """Plot sample trajectories from policies.
    
    Parameters
    ----------
        policies: A dictionary mapping policy names to policies.
        max_episode_length: Max number of steps in the trajectory.
    """
    fig, axes = plt.subplots(5, 2, sharex=True, figsize=[12, 12])
    axes = axes.flatten()

    for name, policy in policies.items():
        trajectory = sample_trajectory(env, policy, max_episode_length)
        obs = trajectory["observation"]
        # Plot state evolution
        for i in range(len(state_names)):
            y = np.log(obs[:, i])
            axes[i].plot(y, label=name)
            axes[i].set_ylabel("log " + state_names[i])
            ymin, ymax = axes[i].get_ylim()
            axes[i].set_ylim(np.minimum(ymin, y.min()), np.maximum(ymax, y.max()))

        # Plot actions
        action = np.array(trajectory["action"])
        epsilon_1 = np.logical_or((action == 2), (action == 3)).astype(float) * 0.7
        epsilon_2 = np.logical_or((action == 1), (action == 3)).astype(float) * 0.3
        axes[-3].plot(epsilon_1, label=name)
        axes[-3].set_ylabel("Treatment epsilon_1")
        axes[-3].set_ylim(-0.1, 1.0)
        axes[-2].plot(epsilon_2, label=name)
        axes[-2].set_ylabel("Treatment epsilon_2")
        axes[-2].set_ylim(-0.1, 1.0)

        # Plot reward
        reward = trajectory["reward"]
        axes[-1].plot(reward, label=name)
        axes[-1].set_ylabel("reward")
        axes[-1].ticklabel_format(scilimits=(-2, 2))
        ymin, ymax = axes[-1].get_ylim()
        axes[-1].set_ylim(
            np.minimum(ymin, reward.min()), np.maximum(ymax, reward.max())
        )

        print(f"Total reward for {name}: {np.sum(reward):.2f}")

    for ax in axes:
        ax.legend()
        ax.set_xlabel("time (days)")
    plt.show()