```"""This file includes a collection of utility functions that are useful for
implementing DQN."""
import gym
import tensorflow as tf
import numpy as np
import random

def huber_loss(x, delta=1.0):
    """Elementwise Huber loss: quadratic for |x| < delta, linear beyond.

    https://en.wikipedia.org/wiki/Huber_loss
    """
    abs_x = tf.abs(x)
    quadratic = 0.5 * tf.square(x)
    linear = delta * (abs_x - 0.5 * delta)
    return tf.where(abs_x < delta, quadratic, linear)

def sample_n_unique(sampling_f, n):
    """Call `sampling_f` repeatedly until `n` distinct values are collected.

    `sampling_f` must return comparable objects; duplicates are discarded,
    so the function only terminates if it can eventually produce `n`
    distinct values.
    """
    samples = []
    while len(samples) < n:
        value = sampling_f()
        if value not in samples:
            samples.append(value)
    return samples

class Schedule(object):
    """Abstract base class for a time-dependent scalar schedule."""

    def value(self, t):
        """Return the value of the schedule at time `t`."""
        raise NotImplementedError()

class ConstantSchedule(object):
    """Schedule whose value never changes over time."""

    def __init__(self, value):
        """
        Parameters
        ----------
        value: float
            Constant value of the schedule
        """
        self._v = value

    def value(self, t):
        """See Schedule.value"""
        return self._v

def linear_interpolation(l, r, alpha):
    """Linearly interpolate between `l` and `r` by fraction `alpha` in [0, 1]."""
    return l + alpha * (r - l)

class PiecewiseSchedule(object):
    def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None):
        """Piecewise schedule.
        endpoints: [(int, int)]
            list of pairs `(time, value)` meaning that schedule should output
            `value` when `t==time`. All the values for time must be sorted in
            an increasing order. When t is between two times, e.g. `(time_a, value_a)`
            and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs
            `interpolation(value_a, value_b, alpha)` where alpha is a fraction of
            time passed between `time_a` and `time_b` for time `t`.
        interpolation: lambda float, float, float: float
            a function that takes value to the left and to the right of t according
            to the `endpoints`. Alpha is the fraction of distance from left endpoint to
            right endpoint that t has covered. See linear_interpolation for example.
        outside_value: float
            if the value is requested outside of all the intervals specified in
            `endpoints` this value is returned. If None then AssertionError is
            raised when outside value is requested.
        """
        # Bug fix: validate that the *times* are sorted. The original compared
        # the full (time, value) pairs, which would also (incorrectly) compare
        # values whenever two endpoints shared the same time.
        idxes = [e[0] for e in endpoints]
        assert idxes == sorted(idxes)
        self._interpolation = interpolation
        self._outside_value = outside_value
        self._endpoints = endpoints

    def value(self, t):
        """See Schedule.value"""
        # Scan consecutive endpoint pairs for the half-open interval
        # [l_t, r_t) containing t and interpolate inside it.
        for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if l_t <= t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return self._interpolation(l, r, alpha)

        # t does not belong to any of the pieces, so doom.
        assert self._outside_value is not None
        return self._outside_value

class LinearSchedule(object):
    """Schedule that anneals linearly from `initial_p` to `final_p`."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        """Linear interpolation between initial_p and final_p over
        schedule_timesteps. After this many timesteps pass final_p is
        returned.
        Parameters
        ----------
        schedule_timesteps: int
            Number of timesteps for which to linearly anneal initial_p
            to final_p
        initial_p: float
            initial output value
        final_p: float
            final output value
        """
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        """See Schedule.value"""
        # Fraction of the annealing period elapsed, saturating at 1.0.
        frac = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + frac * (self.final_p - self.initial_p)

def compute_exponential_averages(variables, decay):
    """Given a list of tensorflow scalar variables
    create ops corresponding to their exponential
    averages
    Parameters
    ----------
    variables: [tf.Tensor]
        List of scalar tensors.
    Returns
    -------
    averages: [tf.Tensor]
        List of scalar tensors corresponding to averages
        of all the `variables` (in order)
    apply_op: tf.runnable
        Op to be run to update the averages with current value
        of variables.
    """
    ema = tf.train.ExponentialMovingAverage(decay=decay)
    update_op = ema.apply(variables)
    averages = [ema.average(v) for v in variables]
    return averages, update_op

def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
    """Minimize `objective` using `optimizer` w.r.t. variables in
    `var_list` while ensuring the norm of the gradients for each
    variable is clipped to `clip_val`.

    Parameters
    ----------
    optimizer: tf.train.Optimizer
        Optimizer used to compute and apply the gradients.
    objective: tf.Tensor
        Scalar loss tensor to minimize.
    var_list: [tf.Variable]
        Variables to differentiate with respect to.
    clip_val: float
        Maximum norm allowed for each variable's gradient.

    Returns
    -------
    tf.Operation
        Op that applies the clipped gradients when run.
    """
    # Bug fix: the original function consisted of only a docstring and
    # implicitly returned None. This is the standard TF1 implementation:
    # compute per-variable gradients, clip each one by norm, then apply.
    gradients = optimizer.compute_gradients(objective, var_list=var_list)
    for i, (grad, var) in enumerate(gradients):
        # Gradients can be None for variables not reachable from `objective`.
        if grad is not None:
            gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
    return optimizer.apply_gradients(gradients)

def initialize_interdependent_variables(session, vars_list, feed_dict):
    """Initialize a list of variables one at a time, which is useful if
    initialization of some variables depends on initialization of the others.
    """
    remaining = vars_list
    while len(remaining) > 0:
        still_uninitialized = []
        for var in remaining:
            try:
                # On older TensorFlow versions use
                # `tf.initialize_variables([var])` instead.
                session.run(tf.variables_initializer([var]), feed_dict)
            except tf.errors.FailedPreconditionError:
                # This variable's initializer depends on another variable
                # that is not initialized yet; retry it on the next pass.
                still_uninitialized.append(var)
        if len(still_uninitialized) >= len(remaining):
            # This can happend if the variables all depend on each other, or more likely if there's
            # another variable outside of the list, that still needs to be initialized. This could be
            # detected here, but life's finite.
            raise Exception("Cycle in variable dependencies, or extenrnal precondition unsatisfied.")
        remaining = still_uninitialized

def get_wrapper_by_name(env, classname):
    """Walk the chain of gym wrappers around `env` and return the first
    layer whose class name contains `classname`.

    Raises ValueError if no matching wrapper is found.
    """
    currentenv = env
    while True:
        if classname in currentenv.__class__.__name__:
            return currentenv
        # Bug fix: the original tested `isinstance(env, gym.Wrapper)` — the
        # *outermost* object, which never changes — so unwrapping could run
        # past a non-wrapper layer and crash with AttributeError on `.env`
        # instead of raising the intended ValueError.
        elif isinstance(currentenv, gym.Wrapper):
            currentenv = currentenv.env
        else:
            raise ValueError("Couldn't find wrapper named %s"%classname)

class ReplayBuffer(object):
    def __init__(self, size, frame_history_len, lander=False):
        """This is a memory efficient implementation of the replay buffer.

        The specific memory optimizations used here are:
            - only store each frame once rather than k times
              even if every observation normally consists of k last frames
            - store frames as np.uint8 (actually it is most time-performance
              to cast them back to float32 on GPU to minimize memory transfer
              time)
            - store frame_t and frame_(t+1) in the same buffer.

        For the typical use case in Atari Deep RL buffer with 1M frames the total
        memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes

        Warning! Assumes that returning frame of zeros at the beginning
        of the episode, when there is less frames than `frame_history_len`,
        is acceptable.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows the old memories are dropped.
        frame_history_len: int
            Number of memories to be retried for each observation.
        lander: bool
            If True, store observations as float32 instead of uint8
            (for low-dimensional environments like LunarLander).
        """
        self.lander = lander

        self.size = size
        self.frame_history_len = frame_history_len

        self.next_idx      = 0
        self.num_in_buffer = 0

        # Storage arrays are allocated lazily on the first `store_frame`
        # call, once the frame shape is known.
        self.obs      = None
        self.action   = None
        self.reward   = None
        self.done     = None

    def can_sample(self, batch_size):
        """Returns true if `batch_size` different transitions can be sampled from the buffer."""
        # +1 because sampling a transition at idx also reads frame idx+1.
        return batch_size + 1 <= self.num_in_buffer

    def _encode_sample(self, idxes):
        obs_batch      = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0)
        act_batch      = self.action[idxes]
        rew_batch      = self.reward[idxes]
        next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0)
        done_mask      = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32)

        return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask

    def sample(self, batch_size):
        """Sample `batch_size` different transitions.

        i-th sample transition is the following:

        when observing `obs_batch[i]`, action `act_batch[i]` was taken,
        after which reward `rew_batch[i]` was received and subsequent
        observation  next_obs_batch[i] was observed, unless the episode
        was done which is represented by `done_mask[i]` which is equal
        to 1 if episode has ended as a result of that action.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.

        Returns
        -------
        obs_batch: np.array
            Array of shape
            (batch_size, img_h, img_w, img_c * frame_history_len)
            and dtype np.uint8
        act_batch: np.array
            Array of shape (batch_size,) and dtype np.int32
        rew_batch: np.array
            Array of shape (batch_size,) and dtype np.float32
        next_obs_batch: np.array
            Array of shape
            (batch_size, img_h, img_w, img_c * frame_history_len)
            and dtype np.uint8
        done_mask: np.array
            Array of shape (batch_size,) and dtype np.float32
        """
        assert self.can_sample(batch_size)
        # Exclude the newest frame (num_in_buffer - 1): its successor frame
        # has not been stored yet.
        idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size)
        return self._encode_sample(idxes)

    def encode_recent_observation(self):
        """Return the most recent `frame_history_len` frames.

        Returns
        -------
        observation: np.array
            Array of shape (img_h, img_w, img_c * frame_history_len)
            and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c]
            encodes frame at time `t - frame_history_len + i`
        """
        assert self.num_in_buffer > 0
        return self._encode_observation((self.next_idx - 1) % self.size)

    def _encode_observation(self, idx):
        end_idx   = idx + 1 # make noninclusive
        start_idx = end_idx - self.frame_history_len
        # this checks if we are using low-dimensional observations, such as RAM
        # state, in which case we just directly return the latest RAM.
        if len(self.obs.shape) == 2:
            return self.obs[end_idx-1]
        # if there weren't enough frames ever in the buffer for context
        if start_idx < 0 and self.num_in_buffer != self.size:
            start_idx = 0
        # never cross an episode boundary: restart the context after a `done`
        for idx in range(start_idx, end_idx - 1):
            if self.done[idx % self.size]:
                start_idx = idx + 1
        missing_context = self.frame_history_len - (end_idx - start_idx)
        # if zero padding is needed for missing context
        # or we are on the boundary of the buffer
        if start_idx < 0 or missing_context > 0:
            # Bug fix: pad with zeros shaped like a *single* frame
            # (self.obs[0]), not like the entire buffer — the original
            # np.zeros_like(self.obs) made np.concatenate fail.
            frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)]
            for idx in range(start_idx, end_idx):
                frames.append(self.obs[idx % self.size])
            return np.concatenate(frames, 2)
        else:
            # this optimization has potential to save about 30% compute time \o/
            # Bug fix: take per-frame height/width from shape[1] and shape[2]
            # (axis 0 is the buffer index); the original bound the whole shape
            # tuple twice, which broke the reshape below.
            img_h, img_w = self.obs.shape[1], self.obs.shape[2]
            return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1)

    def store_frame(self, frame):
        """Store a single frame in the buffer at the next available index, overwriting
        old frames if necessary.

        Parameters
        ----------
        frame: np.array
            Array of shape (img_h, img_w, img_c) and dtype np.uint8
            the frame to be stored

        Returns
        -------
        idx: int
            Index at which the frame is stored. To be used for `store_effect` later.
        """
        if self.obs is None:
            self.obs      = np.empty([self.size] + list(frame.shape), dtype=np.float32 if self.lander else np.uint8)
            self.action   = np.empty([self.size],                     dtype=np.int32)
            self.reward   = np.empty([self.size],                     dtype=np.float32)
            # Bug fix: `np.bool` was removed in NumPy 1.24; the builtin
            # `bool` is the supported spelling.
            self.done     = np.empty([self.size],                     dtype=bool)
        self.obs[self.next_idx] = frame

        ret = self.next_idx
        self.next_idx = (self.next_idx + 1) % self.size
        self.num_in_buffer = min(self.size, self.num_in_buffer + 1)

        return ret

    def store_effect(self, idx, action, reward, done):
        """Store effects of action taken after observing frame stored
        at index idx. The reason `store_frame` and `store_effect` is broken
        up into two functions is so that one can call `encode_recent_observation`
        in between.

        Parameters
        ----------
        idx: int
            Index in buffer of recently observed frame (returned by `store_frame`).
        action: int
            Action that was performed upon observing this frame.
        reward: float
            Reward that was received when the actions was performed.
        done: bool
            True if episode was finished after performing that action.
        """
        self.action[idx] = action
        self.reward[idx] = reward
        self.done[idx]   = done
