"""
Original code from John Schulman for CS294 Deep Reinforcement Learning Spring 2017
Adapted for CS294-112 Fall 2017 by Abhishek Gupta and Joshua Achiam
Adapted for CS294-112 Fall 2018 by Michael Chang and Soroush Nasiriany
Adapted for PyTorch version by Ning Dai
"""
import numpy as np
import torch
import gym
import logz
import scipy.signal
import os
import time
import inspect
from torch.multiprocessing import Process
from torch import nn, optim

# Utilities

#                           ----------PROBLEM 2----------
def build_mlp(input_size, output_size, n_layers, hidden_size, activation=nn.Tanh):
    """
        Builds a feedforward neural network

        arguments:
            input_size: size of the input layer
            output_size: size of the output layer
            n_layers: number of hidden layers
            hidden_size: dimension of the hidden layers
            activation: activation of the hidden layers (a module class,
                nn.Tanh by default)

        returns:
            an instance of nn.Sequential which contains the feedforward neural network

        raises:
            NotImplementedError: always, until the exercise below is completed.

        Hint: use nn.Linear

        Note: the signature has no 'output_activation' parameter, so there is
        no way for callers to request an activation on the output layer.
    """
    layers = []
    # YOUR_CODE_HERE: fill 'layers' with the modules of the network, then
    # remove the 'raise' so that the return statement below becomes reachable.
    raise NotImplementedError
    return nn.Sequential(*layers).apply(weights_init)

def weights_init(m):
    """Initialise the weight of a single module in place.

    Meant to be passed to nn.Module.apply (see build_mlp), which calls it once
    per sub-module. Modules without a 'weight' attribute (for example
    activation layers) are left untouched.

    arguments:
        m: the module visited by nn.Module.apply
    """
    if hasattr(m, 'weight'):
        # NOTE(review): the body of this branch was missing from the file;
        # Xavier/Glorot uniform initialisation is assumed here -- confirm
        # against the reference implementation.
        nn.init.xavier_uniform_(m.weight)

def pathlength(path):
    """Return the number of timesteps in a path, measured by its rewards.

    arguments:
        path: a mapping with a "reward" entry holding one reward per timestep
    """
    rewards = path["reward"]
    n_steps = len(rewards)
    return n_steps

def setup_logger(logdir, locals_):
    """Configure logging for a run and record its hyperparameters.

    arguments:
        logdir: directory the logger should write to
        locals_: mapping of local variable names to values, taken from the
            caller (train_PG passes locals()); used to look up the value of
            every train_PG argument

    Arguments of train_PG that are absent from 'locals_' are recorded as None.
    """
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    # inspect.getargspec was removed in Python 3.11; getfullargspec exposes the
    # same list of argument names.
    args = inspect.getfullargspec(train_PG).args
    hyperparams = {k: locals_.get(k) for k in args}
    logz.save_params(hyperparams)

class PolicyNet(nn.Module):
    """Policy network mapping observations to action-distribution parameters.

    This class is an exercise skeleton: define_model_components and forward
    raise NotImplementedError until they are filled in.
    """

    def __init__(self, neural_network_args):
        """
            arguments:
                neural_network_args: dict with the keys
                    'ob_dim': dimension of an observation
                    'ac_dim': dimension (or number) of actions
                    'discrete': whether the action space is discrete
                    'size': dimension of the hidden layers
                    'n_layers': number of hidden layers
        """
        super(PolicyNet, self).__init__()
        self.ob_dim = neural_network_args['ob_dim']
        self.ac_dim = neural_network_args['ac_dim']
        self.discrete = neural_network_args['discrete']
        self.hidden_size = neural_network_args['size']
        self.n_layers = neural_network_args['n_layers']

        # Nothing else in the file calls define_model_components, so without
        # this call the module would never own any parameters for the
        # optimizer in Agent.__init__ to collect.
        self.define_model_components()

    #                           ----------PROBLEM 2----------
    def define_model_components(self):
        """
            Define the parameters of policy network here.
            You can use any instance of nn.Module or nn.Parameter.

            Hint: use the 'build_mlp' function defined above
                In the discrete case, model should output logits of a categorical distribution
                    over the actions
                In the continuous case, model should output a tuple (mean, log_std) of a Gaussian
                    distribution over actions. log_std should just be a trainable
                    variable, not a network output.
        """
        # YOUR_CODE_HERE
        if self.discrete:
            raise NotImplementedError
        else:
            raise NotImplementedError

    #                           ----------PROBLEM 2----------
    # Notes on notation:
    #
    # Pytorch tensor variables have the prefix ts_, to distinguish them from the numpy array
    # variables that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size, observation dim)
    # _na - this tensor should have shape (batch size, action dim)
    # _n  - this tensor should have shape (batch size)
    #
    # Note: batch size is defined at runtime
    def forward(self, ts_ob_no):
        """
            Define forward pass for policy network.

            arguments:
                ts_ob_no: (batch_size, self.ob_dim)

            returns:
                the parameters of the policy.

                if discrete, the parameters are the logits of a categorical distribution
                    over the actions
                    ts_logits_na: (batch_size, self.ac_dim)

                if continuous, the parameters are a tuple (mean, log_std) of a Gaussian
                    distribution over actions. log_std should just be a trainable
                    variable, not a network output.
                    ts_mean: (batch_size, self.ac_dim)
                    ts_logstd: (self.ac_dim,)

            Hint: use the components you defined in self.define_model_components
        """
        raise NotImplementedError
        if self.discrete:
            # YOUR_CODE_HERE
            ts_logits_na = None
            return ts_logits_na
        else:
            # YOUR_CODE_HERE
            ts_mean = None
            ts_logstd = None
            return (ts_mean, ts_logstd)
# Policy Gradient

class Agent(object):
    """Policy-gradient agent: samples trajectories, estimates returns and
    advantages, and updates the policy (and optional baseline) networks.

    Several methods are exercise stubs that raise NotImplementedError until
    they are filled in; the statements after each 'raise' show the intended
    shape of the solution.
    """

    def __init__(self, neural_network_args, sample_trajectory_args, estimate_return_args):
        """
            arguments:
                neural_network_args: dict with keys 'ob_dim', 'ac_dim', 'discrete',
                    'size', 'n_layers' and 'learning_rate'
                sample_trajectory_args: dict with keys 'animate', 'max_path_length'
                    and 'min_timesteps_per_batch'
                estimate_return_args: dict with keys 'gamma', 'reward_to_go',
                    'nn_baseline' and 'normalize_advantages'
        """
        super(Agent, self).__init__()
        self.ob_dim = neural_network_args['ob_dim']
        self.ac_dim = neural_network_args['ac_dim']
        self.discrete = neural_network_args['discrete']
        self.hidden_size = neural_network_args['size']
        self.n_layers = neural_network_args['n_layers']
        self.learning_rate = neural_network_args['learning_rate']

        self.animate = sample_trajectory_args['animate']
        self.max_path_length = sample_trajectory_args['max_path_length']
        self.min_timesteps_per_batch = sample_trajectory_args['min_timesteps_per_batch']

        self.gamma = estimate_return_args['gamma']
        self.reward_to_go = estimate_return_args['reward_to_go']
        self.nn_baseline = estimate_return_args['nn_baseline']
        self.normalize_advantages = estimate_return_args['normalize_advantages']

        self.policy_net = PolicyNet(neural_network_args)
        params = list(self.policy_net.parameters())

        #                           ----------PROBLEM 6----------
        # Optional Baseline
        #
        # Define a neural network baseline.
        if self.nn_baseline:
            self.value_net = build_mlp(self.ob_dim, 1, self.n_layers, self.hidden_size)
            # A single optimizer updates both the policy and the baseline.
            params += list(self.value_net.parameters())

        self.optimizer = optim.Adam(params, lr=self.learning_rate)

    #                           ----------PROBLEM 2----------
    def sample_action(self, ob_no):
        """
            Build the method used for sampling action from the policy distribution

            arguments:
                ob_no: (batch_size, self.ob_dim)

            returns:
                sampled_ac:
                    if discrete: (batch_size)
                    if continuous: (batch_size, self.ac_dim)

            Hint: for the continuous case, use the reparameterization trick:
                 The output from a Gaussian distribution with mean 'mu' and std 'sigma' is

                      mu + sigma * z,         z ~ N(0, I)

                 This reduces the problem to just sampling z. (Hint: use torch.normal!)
        """
        ts_ob_no = torch.from_numpy(ob_no).float()

        raise NotImplementedError
        if self.discrete:
            ts_logits_na = self.policy_net(ts_ob_no)
            # YOUR_CODE_HERE
            ts_sampled_ac = None
        else:
            ts_mean, ts_logstd = self.policy_net(ts_ob_no)
            # YOUR_CODE_HERE
            ts_sampled_ac = None

        sampled_ac = ts_sampled_ac.numpy()
        return sampled_ac

    #                           ----------PROBLEM 2----------
    def get_log_prob(self, policy_parameters, ts_ac_na):
        """
            Build the method used for computing the log probability of a set of actions
            that were actually taken according to the policy

            arguments:
                policy_parameters
                    if discrete: logits of a categorical distribution over actions
                        ts_logits_na: (batch_size, self.ac_dim)
                    if continuous: (mean, log_std) of a Gaussian distribution over actions
                        ts_mean: (batch_size, self.ac_dim)
                        ts_logstd: (self.ac_dim,)

                ts_ac_na: (batch_size, self.ac_dim)

            returns:
                ts_logprob_n: (batch_size)

            Hint:
                For the discrete case, use the log probability under a categorical distribution.
                For the continuous case, use the log probability under a multivariate gaussian.
        """
        raise NotImplementedError
        if self.discrete:
            ts_logits_na = policy_parameters
            # YOUR_CODE_HERE
            ts_logprob_n = None
        else:
            ts_mean, ts_logstd = policy_parameters
            # YOUR_CODE_HERE
            ts_logprob_n = None
        return ts_logprob_n

    def sample_trajectories(self, itr, env):
        """
            Collect whole episodes until the batch holds more than
            self.min_timesteps_per_batch timesteps.

            arguments:
                itr: index of the current training iteration
                env: the environment to roll out in

            returns:
                paths: list of the paths returned by self.sample_trajectory
                timesteps_this_batch: total number of timesteps over all paths
        """
        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            # Only the first episode of every tenth iteration is rendered.
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and self.animate)
            path = self.sample_trajectory(env, animate_this_episode)
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > self.min_timesteps_per_batch:
                break
        return paths, timesteps_this_batch

    def sample_trajectory(self, env, animate_this_episode):
        """
            Roll out a single episode with the current policy.

            arguments:
                env: the environment to roll out in
                animate_this_episode: whether to render the environment at every step

            returns:
                path: dict of float32 numpy arrays with the keys
                    "observation", "reward" and "action"
        """
        ob = env.reset()
        obs, acs, rewards = [], [], []
        steps = 0
        while True:
            if animate_this_episode:
                env.render()
            obs.append(ob)
            #                           ----------PROBLEM 3----------
            raise NotImplementedError
            ac = None # YOUR CODE HERE
            ac = ac[0]
            acs.append(ac)
            ob, rew, done, _ = env.step(ac)
            rewards.append(rew)
            steps += 1
            if done or steps > self.max_path_length:
                break
        path = {"observation" : np.array(obs, dtype=np.float32),
                "reward" : np.array(rewards, dtype=np.float32),
                "action" : np.array(acs, dtype=np.float32)}
        return path

    #                           ----------PROBLEM 3----------
    def sum_of_rewards(self, re_n):
        """
            Monte Carlo estimation of the Q function.

            let sum_of_path_lengths be the sum of the lengths of the paths sampled from
                Agent.sample_trajectories
            let num_paths be the number of paths sampled from Agent.sample_trajectories

            arguments:
                re_n: length: num_paths. Each element in re_n is a numpy array
                    containing the rewards for the particular path

            returns:
                q_n: shape: (sum_of_path_lengths). A single vector for the estimated q values
                    whose length is the sum of the lengths of the paths

            ----------------------------------------------------------------------------------

            Your code should construct numpy arrays for Q-values which will be used to compute
            advantages (which will in turn be passed to Agent.update_parameters).

            Recall that the expression for the policy gradient PG is

                  PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]

            where

                  tau=(s_0, a_0, ...) is a trajectory,
                  Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
                  and b_t is a baseline which may depend on s_t.

            You will write code for two cases, controlled by the flag 'reward_to_go':

              Case 1: trajectory-based PG

                  (reward_to_go = False)

                  Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
                  entire trajectory (regardless of which time step the Q-value should be for).

                  For this case, the policy gradient estimator is

                      E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]

                  where

                      Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.

                  Thus, you should compute

                      Q_t = Ret(tau)

              Case 2: reward-to-go PG

                  (reward_to_go = True)

                  Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
                  from time step t. Thus, you should compute

                      Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}


            Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
            like the 'ob_no' and 'ac_na' built in train_PG.
        """
        # YOUR_CODE_HERE
        if self.reward_to_go:
            raise NotImplementedError
        else:
            raise NotImplementedError
        return q_n

    def compute_advantage(self, ob_no, q_n):
        """
            Computes advantages by (possibly) subtracting a baseline from the estimated Q values

            let sum_of_path_lengths be the sum of the lengths of the paths sampled from
                Agent.sample_trajectories
            let num_paths be the number of paths sampled from Agent.sample_trajectories

            arguments:
                ob_no: shape: (sum_of_path_lengths, ob_dim)
                q_n: shape: (sum_of_path_lengths). A single vector for the estimated q values
                    whose length is the sum of the lengths of the paths

            returns:
                adv_n: shape: (sum_of_path_lengths). A single vector for the estimated
                    advantages whose length is the sum of the lengths of the paths
        """
        #                           ----------PROBLEM 6----------
        # Computing Baselines
        if self.nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current batch of Q-values. (Goes with Hint
            # #bl2 in Agent.update_parameters.)
            raise NotImplementedError
            # YOUR CODE HERE
            b_n = None
            adv_n = q_n - b_n
        else:
            # Without a baseline the advantage is the Q estimate itself; copy so
            # that later changes to adv_n cannot alias q_n.
            adv_n = q_n.copy()
        return adv_n

    def estimate_return(self, ob_no, re_n):
        """
            Estimates the returns over a set of trajectories.

            let sum_of_path_lengths be the sum of the lengths of the paths sampled from
                Agent.sample_trajectories
            let num_paths be the number of paths sampled from Agent.sample_trajectories

            arguments:
                ob_no: shape: (sum_of_path_lengths, ob_dim)
                re_n: length: num_paths. Each element in re_n is a numpy array
                    containing the rewards for the particular path

            returns:
                q_n: shape: (sum_of_path_lengths). A single vector for the estimated q values
                    whose length is the sum of the lengths of the paths
                adv_n: shape: (sum_of_path_lengths). A single vector for the estimated
                    advantages whose length is the sum of the lengths of the paths
        """
        q_n = self.sum_of_rewards(re_n)
        adv_n = self.compute_advantage(ob_no, q_n)
        #                           ----------PROBLEM 3----------
        # Advantage Normalization
        if self.normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            raise NotImplementedError
            adv_n = None # YOUR_CODE_HERE
        return q_n, adv_n

    def update_parameters(self, ob_no, ac_na, q_n, adv_n):
        """
            Update the parameters of the policy and (possibly) the neural network baseline,
            which is trained to approximate the value function.

            arguments:
                ob_no: shape: (sum_of_path_lengths, ob_dim)
                ac_na: shape: (sum_of_path_lengths).
                q_n: shape: (sum_of_path_lengths). A single vector for the estimated q values
                    whose length is the sum of the lengths of the paths
                adv_n: shape: (sum_of_path_lengths). A single vector for the estimated
                    advantages whose length is the sum of the lengths of the paths

            returns:
                nothing
        """
        # convert numpy array to pytorch tensor
        ts_ob_no, ts_ac_na, ts_q_n, ts_adv_n = map(torch.from_numpy, [ob_no, ac_na, q_n, adv_n])

        # The policy takes in an observation and produces a distribution over the action space
        policy_parameters = self.policy_net(ts_ob_no)

        # We can compute the logprob of the actions that were actually taken by the policy
        # This is used in the loss function.
        ts_logprob_n = self.get_log_prob(policy_parameters, ts_ac_na)

        # clean the gradient for model parameters
        self.optimizer.zero_grad()

        #                           ----------PROBLEM 3----------
        # Loss Function for Policy Gradient
        raise NotImplementedError
        loss = None # YOUR CODE HERE

        #                           ----------PROBLEM 6----------
        # Optimizing Neural Network Baseline
        if self.nn_baseline:
            # If a neural network baseline is used, set up the targets and the output of the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # self.value_net you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 in
            # Agent.compute_advantage.)

            # YOUR_CODE_HERE
            raise NotImplementedError
            baseline_prediction = None
            ts_target_n = None
            baseline_loss = None

        #                           ----------PROBLEM 3----------
        # Performing the Policy Update

        # Call the optimizer to perform the policy gradient update based on the current batch
        # of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        # YOUR_CODE_HERE
        raise NotImplementedError

def train_PG(
        exp_name,
        env_name,
        n_iter,
        gamma,
        min_timesteps_per_batch,
        max_path_length,
        learning_rate,
        reward_to_go,
        animate,
        logdir,
        normalize_advantages,
        nn_baseline,
        seed,
        n_layers,
        size):
    """Train a policy-gradient agent on a gym environment and log diagnostics.

    NOTE(review): the parameter list was missing from the file and has been
    reconstructed from the names this function body uses; confirm the names
    and order against the callers.

    arguments:
        exp_name: name of the experiment (not read in this function body)
        env_name: id passed to gym.make
        n_iter: number of training iterations
        gamma: discount factor handed to the agent
        min_timesteps_per_batch: minimum number of timesteps collected per iteration
        max_path_length: maximum episode length; when falsy, the environment's
            env.spec.max_episode_steps is used instead
        learning_rate: learning rate handed to the agent
        reward_to_go: whether the agent uses reward-to-go Q estimates
        animate: whether the agent may render episodes
        logdir: directory the logger writes to
        normalize_advantages: whether the agent normalizes advantages
        nn_baseline: whether the agent uses a neural network baseline
        seed: random seed for torch, numpy and the environment
        n_layers: number of hidden layers of the networks
        size: dimension of the hidden layers
    """
    start = time.time()

    # Set Up Logger
    setup_logger(logdir, locals())

    # Set Up Env

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Initialize Agent
    neural_network_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_return_args = {
        'gamma': gamma,
        'reward_to_go': reward_to_go,
        'nn_baseline': nn_baseline,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(neural_network_args, sample_trajectory_args, estimate_return_args)

    # Training Loop

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)
        with torch.no_grad(): # use torch.no_grad to disable the gradient calculation
            paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = [path["reward"] for path in paths]

        # Returns and advantages are plain targets; no gradient flows through them.
        with torch.no_grad():
            q_n, adv_n = agent.estimate_return(ob_no, re_n)
        agent.update_parameters(ob_no, ac_na, q_n, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # NOTE(review): assumes logz buffers log_tabular entries until
        # dump_tabular is called -- confirm against the logz module.
        logz.dump_tabular()

def main():
    """Parse command-line arguments and run one training process per experiment.

    Each experiment uses the seed args.seed + 10*e and runs train_PG in its
    own process; all processes are joined before this function returns.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str)
    parser.add_argument('--exp_name', type=str, default='vpg')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--discount', type=float, default=1.0)
    parser.add_argument('--n_iter', '-n', type=int, default=100)
    parser.add_argument('--batch_size', '-b', type=int, default=1000)
    parser.add_argument('--ep_len', '-ep', type=float, default=-1.)
    parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)
    parser.add_argument('--reward_to_go', '-rtg', action='store_true')
    parser.add_argument('--dont_normalize_advantages', '-dna', action='store_true')
    parser.add_argument('--nn_baseline', '-bl', action='store_true')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_experiments', '-e', type=int, default=1)
    parser.add_argument('--n_layers', '-l', type=int, default=2)
    parser.add_argument('--size', '-s', type=int, default=64)
    args = parser.parse_args()

    # exist_ok avoids the check-then-create race of 'if not exists: makedirs'.
    os.makedirs('data', exist_ok=True)
    logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join('data', logdir)
    os.makedirs(logdir, exist_ok=True)

    # A non-positive --ep_len means "use the environment's own limit".
    max_path_length = args.ep_len if args.ep_len > 0 else None

    processes = []

    for e in range(args.n_experiments):
        seed = args.seed + 10*e
        print('Running experiment with seed %d'%seed)

        # 'seed' is bound as a default so that each closure keeps its own value
        # instead of whatever the loop variable holds when the function runs.
        def train_func(seed=seed):
            # NOTE(review): the body of train_func was missing from the file;
            # the keyword names below assume train_PG's parameters are named
            # after the values it uses -- confirm against train_PG.
            train_PG(
                exp_name=args.exp_name,
                env_name=args.env_name,
                n_iter=args.n_iter,
                gamma=args.discount,
                min_timesteps_per_batch=args.batch_size,
                max_path_length=max_path_length,
                learning_rate=args.learning_rate,
                reward_to_go=args.reward_to_go,
                animate=args.render,
                # one sub-directory per seed so experiments do not share a log dir
                logdir=os.path.join(logdir, '%d'%seed),
                normalize_advantages=not(args.dont_normalize_advantages),
                nn_baseline=args.nn_baseline,
                seed=seed,
                n_layers=args.n_layers,
                size=args.size,
            )
        p = Process(target=train_func, args=tuple())
        p.start()
        processes.append(p)
        # if you comment in the line below, then the loop will block
        # until this process finishes
        # p.join()

    for p in processes:
        p.join()

# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()