# Deep Q-learning agent with q-value approximation
# Following paper: Playing Atari with Deep Reinforcement Learning
#     https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf
#
# ---
# @author Yiren Lu
# @email luyiren [at] seas [dot] upenn [dot] edu
#
# MIT License


import gym
import numpy as np
import random
import tensorflow as tf
import tf_utils  # local helper module providing the fc() fully-connected layer used below


class DQNAgent(object):
  """
  DQN Agent with a 2-hidden-layer fully-connected q-network that acts epsilon-greedily.
  """

  def __init__(self,
    session,
    epsilon=0.5,
    epsilon_anneal=0.01,
    end_epsilon=0.1,
    lr=0.5,
    gamma=0.99,
    state_size=4,
    action_size=2,
    scope="dqn",
    n_hidden_1=20,
    n_hidden_2=20,
    ):
    """
    args
      epsilon           exploration rate
      epsilon_anneal    linear decay rate per call of epsilon_decay() function
      end_epsilon       lowest exploration rate
      lr                learning rate
      gamma             discount factor
      state_size        network input size
      action_size       network output size
    """
    self.epsilon = epsilon
    self.epsilon_anneal = epsilon_anneal
    self.end_epsilon = end_epsilon
    self.lr = lr
    self.gamma = gamma
    self.state_size = state_size
    self.action_size = action_size
    self.scope = scope
    self.n_hidden_1 = n_hidden_1
    self.n_hidden_2 = n_hidden_2
    self._build_qnet()
    self.sess = session

  def _build_qnet(self):
    """
    Build q-network
    """
    with tf.variable_scope(self.scope):
      self.state_input = tf.placeholder(tf.float32, [None, self.state_size])
      self.action = tf.placeholder(tf.int32, [None])
      self.target_q = tf.placeholder(tf.float32, [None])

      fc1 = tf_utils.fc(self.state_input, n_output=self.n_hidden_1, activation_fn=tf.nn.relu)
      fc2 = tf_utils.fc(fc1, n_output=self.n_hidden_2, activation_fn=tf.nn.relu)
      self.q_values = tf_utils.fc(fc2, self.action_size, activation_fn=None)

      action_mask = tf.one_hot(self.action, self.action_size, 1.0, 0.0)
      q_value_pred = tf.reduce_sum(self.q_values * action_mask, 1)

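      # the loss below is the mean squared TD error between the Bellman target
      # (fed in through self.target_q by learn_batch) and the predicted q-value
      # of the action actually taken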
      self.loss = tf.reduce_mean(tf.square(tf.subtract(self.target_q, q_value_pred)))
      self.optimizer = tf.train.AdamOptimizer(self.lr)
      self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())

  def get_action_values(self, state):
    """Returns the q-values of all actions for the given state."""
    q_values = self.sess.run(self.q_values, feed_dict={self.state_input: [state]})
    return q_values

  def get_optimal_action(self, state):
    """Returns the greedy action (argmax of the q-values) for the given state."""
    q_values = self.sess.run(self.q_values, feed_dict={self.state_input: [state]})
    return q_values.argmax()

  def get_action(self, state):
    """
    Epsilon-greedy action

    args
      state           current state      
    returns
      an action to take given the state
    """
    if np.random.random() < self.epsilon:
      # act randomly
      return np.random.randint(0, self.action_size)
    else:
      return self.get_optimal_action(state)

  def epsilon_decay(self):
    """
    Anneal epsilon linearly by epsilon_anneal until it reaches end_epsilon,
    e.g. with the defaults (epsilon=0.5, epsilon_anneal=0.01, end_epsilon=0.1)
    the floor is reached after 40 calls.
    """
    if self.epsilon > self.end_epsilon:
      self.epsilon = self.epsilon - self.epsilon_anneal

  def learn_epoch(self, exprep, num_steps):
    """
    Deep Q-learing: train qnetwork for num_steps, for each step, sample a batch from exprep

    Args
      exprep:         experience replay
      num_steps:      num of steps
    """
    for i in xrange(num_steps):
      self.learn_batch(exprep.sample())

  def learn_batch(self, batch_steps):
    """
    Deep Q-learing: train qnetwork with the input batch
    Args
      batch_steps:    a batch of sampled namedtuple Step, where Step.cur_step and 
                      Step.next_step are of shape {self.state_size}
      sess:           tf session
    Returns 
      batch loss (-1 if input is empty)
    """
    if len(batch_steps) == 0:
      return -1

    # q-values of the next states, estimated with the same online q-network
    next_state_batch = [s.next_step for s in batch_steps]
    q_values = self.sess.run(self.q_values, feed_dict={self.state_input: next_state_batch})

    max_q_values = q_values.max(axis=1)
    # Bellman target: r + gamma * max_a' Q(s', a'); the bootstrap term is zeroed for terminal steps
    target_q = np.array([s.reward + self.gamma*max_q_values[i]*(1-s.done) for i,s in enumerate(batch_steps)])
    target_q = target_q.reshape([len(batch_steps)])
    
    # minimize the TD-error
    cur_state_batch = [s.cur_step for s in batch_steps]
    actions = [s.action for s in batch_steps]
    loss, _ = self.sess.run([self.loss, self.train_op], feed_dict={self.state_input: cur_state_batch,
                                                                   self.target_q: target_q,
                                                                   self.action: actions})
    return loss
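

# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original module).
# It assumes the classic gym API where env.step() returns (obs, reward, done,
# info), and uses a toy replay buffer defined here; the Step namedtuple fields
# match what learn_batch() expects (cur_step, action, next_step, reward, done).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
  import collections

  Step = collections.namedtuple("Step", ["cur_step", "action", "next_step", "reward", "done"])

  class SimpleReplay(object):
    """Toy experience replay: stores Steps and samples a random batch."""

    def __init__(self, capacity=10000, batch_size=32):
      self.buffer = collections.deque(maxlen=capacity)
      self.batch_size = batch_size

    def add(self, step):
      self.buffer.append(step)

    def sample(self):
      n = min(self.batch_size, len(self.buffer))
      return random.sample(list(self.buffer), n)

  env = gym.make("CartPole-v0")
  exprep = SimpleReplay()

  with tf.Session() as sess:
    # lr=0.001 is a common Adam setting; the class default of 0.5 is likely too
    # large for this environment
    agent = DQNAgent(sess,
                     lr=0.001,
                     state_size=env.observation_space.shape[0],
                     action_size=env.action_space.n)
    sess.run(tf.global_variables_initializer())

    for episode in range(200):
      state = env.reset()
      done = False
      total_reward = 0
      while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        exprep.add(Step(cur_step=state, action=action, next_step=next_state,
                        reward=reward, done=done))
        state = next_state
        total_reward += reward
      agent.learn_epoch(exprep, num_steps=10)
      agent.epsilon_decay()
      print("episode %d  reward %.1f  epsilon %.2f" % (episode, total_reward, agent.epsilon))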