import numpy as np
import tensorflow as tf

from config import args, tf_config
from distributions import make_pd_type
import utils as U


class Policy:
    def __init__(self, ob_space, ac_space, batch, n_steps, reuse):
        ob_dim = (batch,) + ob_space.shape
        act_dim = ac_space.shape[0]
        self.ph_obs = tf.placeholder(tf.float32, ob_dim, name='ph_obs')

        with tf.variable_scope('policy', reuse=reuse):
            # Policy (actor) head: two tanh hidden layers.
            h1 = U.fc(self.ph_obs, out_dim=64, activation_fn=tf.nn.tanh,
                      init_scale=np.sqrt(2), scope='pi_fc1')
            h2 = U.fc(h1, out_dim=64, activation_fn=tf.nn.tanh,
                      init_scale=np.sqrt(2), scope='pi_fc2')
            pi = U.fc(h2, out_dim=act_dim, activation_fn=None,
                      init_scale=0.01, scope='pi')

            # Value (critic) head: a separate two-layer tanh MLP.
            h1 = U.fc(self.ph_obs, out_dim=64, activation_fn=tf.nn.tanh,
                      init_scale=np.sqrt(2), scope='vf_fc1')
            h2 = U.fc(h1, out_dim=64, activation_fn=tf.nn.tanh,
                      init_scale=np.sqrt(2), scope='vf_fc2')
            vf = U.fc(h2, out_dim=1, activation_fn=None, scope='vf')[:, 0]

            # State-independent log-std for the diagonal Gaussian policy.
            logstd = tf.get_variable(name='logstd', shape=[1, act_dim],
                                     initializer=tf.zeros_initializer())

        # Concatenate means and (broadcast) log-stds into flat pd parameters.
        pd_params = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pd_type = make_pd_type(ac_space)
        self.pd = self.pd_type.pdfromflat(pd_params)

        self.a_out = self.pd.sample()
        self.neglogp = self.pd.get_neglogp(self.a_out)
        self.v_out = vf
        self.pi = pi


class PPO:
    def __init__(self, env):
        self.sess = tf.Session(config=tf_config)
        ob_space = env.observation_space
        ac_space = env.action_space

        # One policy instance for acting (rollouts) and one for training;
        # they share parameters through variable-scope reuse.
        self.act_policy = Policy(ob_space, ac_space, env.num_envs,
                                 n_steps=1, reuse=False)
        self.train_policy = Policy(ob_space, ac_space, args.minibatch,
                                   n_steps=args.batch_steps, reuse=True)

        self._build_train()
        self.sess.run(tf.global_variables_initializer())

    def _build_train(self):
        # Build placeholders.
        self.ph_obs_train = self.train_policy.ph_obs
        self.ph_a = self.train_policy.pd_type.get_action_placeholder([None])
        self.ph_adv = tf.placeholder(tf.float32, [None])
        self.ph_r = tf.placeholder(tf.float32, [None])
        self.ph_old_neglogp = tf.placeholder(tf.float32, [None])
        self.ph_old_v = tf.placeholder(tf.float32, [None])
        self.ph_lr = tf.placeholder(tf.float32, [])
        self.ph_clip_range = tf.placeholder(tf.float32, [])

        # Build losses.
        self.neglogp = self.train_policy.pd.get_neglogp(self.ph_a)
        self.entropy = tf.reduce_mean(self.train_policy.pd.get_entropy())

        # Clipped value loss: penalize value predictions that move too far
        # from the old value estimate within one update.
        v = self.train_policy.v_out
        v_clipped = self.ph_old_v + tf.clip_by_value(
            v - self.ph_old_v, -self.ph_clip_range, self.ph_clip_range)
        v_loss1 = tf.square(v - self.ph_r)
        v_loss2 = tf.square(v_clipped - self.ph_r)
        self.v_loss = 0.5 * tf.reduce_mean(tf.maximum(v_loss1, v_loss2))

        # Importance ratio pi_new(a|s) / pi_old(a|s).
        # Equivalent, more numerically stable form:
        # ratio = tf.exp(self.ph_old_neglogp - self.neglogp)
        old_p = tf.exp(-self.ph_old_neglogp)
        new_p = tf.exp(-self.neglogp)
        ratio = new_p / old_p

        # Clipped surrogate policy loss.
        pg_loss1 = -self.ph_adv * ratio
        pg_loss2 = -self.ph_adv * tf.clip_by_value(
            ratio, 1.0 - self.ph_clip_range, 1.0 + self.ph_clip_range)
        self.pg_loss = tf.reduce_mean(tf.maximum(pg_loss1, pg_loss2))

        loss = self.pg_loss + args.v_coef * self.v_loss - \
            args.entropy_coef * self.entropy

        # Build train operation.
        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'policy')
        grads = tf.gradients(loss, params)
        if args.max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(
                grads, args.max_grad_norm)
        grads = list(zip(grads, params))
        self.train_op = tf.train.AdamOptimizer(
            learning_rate=self.ph_lr, epsilon=1e-5).apply_gradients(grads)

        # Train diagnostics.
        self.approxkl = 0.5 * tf.reduce_mean(
            tf.square(self.neglogp - self.ph_old_neglogp))
        self.clip_frac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), self.ph_clip_range)))
        self.avg_ratio = tf.reduce_mean(ratio)

    def step(self, obs, *_args, **_kwargs):
        # Sample actions, value estimates, and neg-log-probs for rollouts.
        feed_dict = {self.act_policy.ph_obs: obs}
        a, v, neglogp = self.sess.run(
            [self.act_policy.a_out, self.act_policy.v_out,
             self.act_policy.neglogp],
            feed_dict=feed_dict)
        return a, v, neglogp

    def get_value(self, obs, *_args, **_kwargs):
        feed_dict = {self.act_policy.ph_obs: obs}
        return self.sess.run(self.act_policy.v_out, feed_dict=feed_dict)

    def train(self, lr, clip_range, obs, returns, masks, actions, values,
              neglogps, advs):
        # Normalize advantages per minibatch.
        # advs = returns - values
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        feed_dict = {self.ph_obs_train: obs,
                     self.ph_a: actions,
                     self.ph_adv: advs,
                     self.ph_r: returns,
                     self.ph_old_neglogp: neglogps,
                     self.ph_old_v: values,
                     self.ph_lr: lr,
                     self.ph_clip_range: clip_range}
        self.loss_names = ['loss_policy', 'loss_value', 'avg_ratio',
                           'policy_entropy', 'approxkl', 'clipfrac']
        # Run the train op and return diagnostics (drop the train op output).
        return self.sess.run(
            [self.pg_loss, self.v_loss, self.avg_ratio, self.entropy,
             self.approxkl, self.clip_frac, self.train_op],
            feed_dict=feed_dict)[:-1]
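

# Minimal usage sketch (illustrative only). It assumes a Gym-style vectorized
# environment exposing `observation_space`, `action_space`, `num_envs`,
# `reset()` and `step()`, and that `args` in config.py supplies minibatch,
# batch_steps, v_coef, entropy_coef and max_grad_norm. Rollout collection and
# advantage estimation (e.g. GAE) are omitted; only the step/train call
# pattern of the classes above is shown, with hypothetical `mb_*` minibatch
# arrays.
#
#   model = PPO(env)
#   obs = env.reset()
#   actions, values, neglogps = model.step(obs)
#   obs, rewards, dones, _ = env.step(actions)
#   # ... gather a batch of (obs, returns, masks, actions, values,
#   # neglogps, advs), then update on shuffled minibatches:
#   losses = model.train(lr=3e-4, clip_range=0.2,
#                        obs=mb_obs, returns=mb_returns, masks=mb_masks,
#                        actions=mb_actions, values=mb_values,
#                        neglogps=mb_neglogps, advs=mb_advs)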