import collections
import copy
import h5py
import numpy as np
import os
import tensorflow as tf

from rllab.envs.normalized_env import NormalizedEnv

from hgail.envs.vectorized_normalized_env import VectorizedNormalizedEnv

'''
Reward utils
'''

def batch_to_path_rewards(rewards, path_lengths):
    '''
    Args:
        - rewards: numpy array of shape (batch size, reward_dim)
        - path_lengths: list of lengths to be selected in groups from the rows of rewards
    '''
    assert len(rewards) == sum(path_lengths)

    path_rewards = []
    s = 0
    for path_length in path_lengths:
        e = s + path_length
        path_rewards.append(rewards[s:e])
        s = e
    return path_rewards

def batch_timeseries_to_path_rewards(rewards, path_lengths):
    '''
    Converts a dense array to a list of arrays, each with the corresponding length in path_lengths

    Args:
        - rewards: numpy array of shape (batch size, max sequence length, reward dim)
        - path_lengths: list of lengths to be selected from the rows of rewards
    '''
    assert len(rewards) == len(path_lengths)

    path_rewards = []
    for (i, path_length) in enumerate(path_lengths):
        path_rewards.append(rewards[i, :path_length])
    return path_rewards
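# Illustrative usage (not executed on import): how a flat reward batch maps back to
# per-path reward arrays. The values below are made up for demonstration only.
#
#   rewards = np.arange(5).reshape(5, 1)                   # 5 timesteps stacked from 2 paths
#   path_rewards = batch_to_path_rewards(rewards, [2, 3])
#   # path_rewards[0].shape == (2, 1) and path_rewards[1].shape == (3, 1)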
class RewardHandler(object):

    def __init__(
            self,
            use_env_rewards=True,
            critic_clip_low=-np.inf,
            critic_clip_high=np.inf,
            critic_initial_scale=1.,
            critic_final_scale=1.,
            recognition_initial_scale=1,
            recognition_final_scale=1.,
            augmentation_scale=1.,
            normalize_rewards=False,
            alpha=.01,
            max_epochs=10000,
            summary_writer=None):
        self.use_env_rewards = use_env_rewards

        self.critic_clip_low = critic_clip_low
        self.critic_clip_high = critic_clip_high

        self.critic_initial_scale = critic_initial_scale
        self.critic_final_scale = critic_final_scale
        self.critic_scale = critic_initial_scale

        self.recognition_initial_scale = recognition_initial_scale
        self.recognition_final_scale = recognition_final_scale
        self.recognition_scale = recognition_initial_scale

        self.augmentation_scale = augmentation_scale

        self.normalize_rewards = normalize_rewards
        self.alpha = alpha
        self.critic_reward_mean = 0.
        self.critic_reward_var = 1.
        self.recog_reward_mean = 0.
        self.recog_reward_var = 1.

        self.step = 0
        self.max_epochs = max_epochs
        self.summary_writer = summary_writer

    def _update_reward_estimate(self, rewards, reward_type):
        # unpack
        a = self.alpha
        mean = self.critic_reward_mean if reward_type == 'critic' else self.recog_reward_mean
        var = self.critic_reward_var if reward_type == 'critic' else self.recog_reward_var

        # update the reward mean using the mean of the rewards
        new_mean = (1 - a) * mean + a * np.mean(rewards)
        # update the variance with the mean of the individual variances
        new_var = (1 - a) * var + a * np.mean((rewards - mean) ** 2)

        # update class members
        if reward_type == 'critic':
            self.critic_reward_mean = new_mean
            self.critic_reward_var = new_var
        else:
            self.recog_reward_mean = new_mean
            self.recog_reward_var = new_var

    def _normalize_rewards(self, rewards, reward_type):
        self._update_reward_estimate(rewards, reward_type)
        var = self.critic_reward_var if reward_type == 'critic' else self.recog_reward_var
        return rewards / (np.sqrt(var) + 1e-8)

    def _update_scales(self):
        self.step += 1
        frac = np.minimum(self.step / self.max_epochs, 1)
        self.critic_scale = self.critic_initial_scale \
            + frac * (self.critic_final_scale - self.critic_initial_scale)
        self.recognition_scale = self.recognition_initial_scale \
            + frac * (self.recognition_final_scale - self.recognition_initial_scale)

    def merge(
            self,
            paths,
            critic_rewards=None,
            recognition_rewards=None):
        """
        Add critic and recognition rewards to path rewards based on settings

        Args:
            paths: list of dictionaries as described in process_samples
            critic_rewards: list of numpy arrays of equal shape as corresponding path['rewards']
            recognition_rewards: same as critic_rewards
        """
        # update relative reward scales
        self._update_scales()

        # combine the different rewards
        for (i, path) in enumerate(paths):
            shape = np.shape(path['rewards'])

            # env rewards
            if self.use_env_rewards:
                path['rewards'] = np.float32(path['rewards'])
            else:
                path['rewards'] = np.zeros(shape, dtype=np.float32)

            # critic rewards
            if critic_rewards is not None:
                critic_rewards[i] = np.clip(critic_rewards[i], self.critic_clip_low, self.critic_clip_high)
                if self.normalize_rewards:
                    critic_rewards[i] = self._normalize_rewards(
                        critic_rewards[i], reward_type='critic')
                path['rewards'] += self.critic_scale * np.reshape(critic_rewards[i], shape)

            # recognition rewards
            if recognition_rewards is not None:
                if self.normalize_rewards:
                    recognition_rewards[i] = self._normalize_rewards(
                        recognition_rewards[i], reward_type='recognition')
                path['rewards'] += self.recognition_scale * np.reshape(recognition_rewards[i], shape)

        # optionally write a summary
        self._log_merge()

        return paths

    def _log_merge(self):
        if self.summary_writer is not None:
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="reward_handler/critic_reward_mean", simple_value=self.critic_reward_mean),
                tf.Summary.Value(tag="reward_handler/critic_reward_var", simple_value=self.critic_reward_var),
                tf.Summary.Value(tag="reward_handler/recognition_reward_mean", simple_value=self.recog_reward_mean),
                tf.Summary.Value(tag="reward_handler/recognition_reward_var", simple_value=self.recog_reward_var),
                tf.Summary.Value(tag="reward_handler/critic_scale", simple_value=self.critic_scale),
                tf.Summary.Value(tag="reward_handler/recognition_scale", simple_value=self.recognition_scale),
            ])
            self.summary_writer.add_summary(summary, self.step)
            self.summary_writer.flush()
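# Illustrative usage (not executed on import): merging per-path critic rewards into the
# environment rewards. The handler settings, paths, and reward arrays below are made up;
# in the actual training loop they come from the sampler and the critic.
#
#   handler = RewardHandler(use_env_rewards=True)
#   paths = [dict(rewards=np.zeros(3, dtype=np.float32))]
#   critic_rewards = [np.array([[.1], [.2], [.3]])]
#   paths = handler.merge(paths, critic_rewards=critic_rewards)
#   # paths[0]['rewards'] now contains the (optionally clipped / normalized) scaled critic rewards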
'''
Data utils
'''

class ActionNormalizer(object):

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def normalize(self, act):
        act = (act - self.mean) / self.std
        return act

    def unnormalize(self, act):
        act = act * self.std + self.mean
        return act

    def __call__(self, act):
        return self.normalize(act)

class ActionRangeNormalizer(object):
    '''
    Converts actions from the [low, high] range to the [-1, 1] range.

    This is the inverse of a normalizing wrapper around an environment, and should be
    applied to real data, where low and high are the action bounds of the environment.
    The reason is that agent actions are output in the range [-1, 1] and then mapped to
    the actual ranges by the environment wrapper `normalize`; the real data should
    therefore go through the inverse mapping, from the environment bounds to [-1, 1].
    '''
    def __init__(self, low, high):
        low = np.array(low)
        high = np.array(high)
        self.half_range = (high - low) / 2.
        self.mean = (high + low) / 2.

    def normalize(self, act):
        act = (act - self.mean) / self.half_range
        act = np.clip(act, -1, 1)
        return act

    def __call__(self, act):
        return self.normalize(act)

def load_dataset(filepath, maxsize=None):
    f = h5py.File(filepath, 'r')
    d = dict()
    for key in f.keys():
        if maxsize is None:
            d[key] = f[key].value
        else:
            d[key] = f[key].value[:maxsize]
    return d

def compute_n_batches(n_samples, batch_size):
    n_batches = int(n_samples / batch_size)
    if n_samples % batch_size != 0:
        n_batches += 1
    return n_batches

def select_batch_idxs(start_idx, batch_size, min_idx, max_idx):
    end_idx = start_idx + batch_size
    end_idx = min(end_idx, max_idx)
    idxs = np.arange(start_idx, end_idx, dtype=int)

    # if too few samples selected, then randomly select the rest from the full range
    if len(idxs) < batch_size:
        n_additional = batch_size - len(idxs)
        additional_idxs = np.random.randint(low=min_idx, high=max_idx, size=n_additional)
        idxs = np.hstack((idxs, additional_idxs))

    return idxs, end_idx

def save_params(output_dir, params, epoch, max_to_keep=None):
    # make sure output_dir exists
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # save
    output_filepath = os.path.join(output_dir, 'itr_{}'.format(epoch))
    np.savez(output_filepath, params=params)

    # delete files if in excess of max_to_keep
    if max_to_keep is not None:
        files = [os.path.join(output_dir, f) for f in os.listdir(output_dir)
                 if os.path.isfile(os.path.join(output_dir, f)) and 'itr_' in f]
        sorted_files = sorted(files, key=os.path.getmtime, reverse=True)
        if len(sorted_files) > max_to_keep:
            for filepath in sorted_files[max_to_keep:]:
                os.remove(filepath)

def load_params(filepath):
    # params are stored as a pickled dict inside an object array
    return np.load(filepath, allow_pickle=True)['params'].item()

'''
numpy utils
'''

def to_onehot(values, dim=None):
    assert len(values.shape) == 2
    if dim is None:
        dim = np.max(values) + 1
    onehot = np.zeros((len(values), dim))
    onehot[np.arange(len(values)), values.reshape(-1)] = 1
    return onehot

def pad_tensor(x, max_len, axis):
    pad_widths = [(0, 0) for _ in range(len(x.shape))]
    pad_widths[axis] = (0, max_len - x.shape[axis])
    return np.pad(x, pad_widths, mode='constant')

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softmax(logits, axis=-1):
    shape = logits.shape
    logits = logits.astype(np.float128).reshape(-1, shape[-1])
    x = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
    probs = x / np.sum(x, axis=-1, keepdims=True)
    # correct any rows where numerical error pushed the sum above 1
    invalid_idxs = np.where(np.sum(probs, axis=-1, keepdims=True) > 1.)[0]
    probs[invalid_idxs] -= (np.sum(probs[invalid_idxs], axis=-1, keepdims=True) - 1 + 1e-8) / probs.shape[-1]
    probs = probs.astype(np.float64)
    return probs.reshape(shape)

def multiple_pval_multinomial(probs):
    ps = []
    for prob in probs:
        p = np.random.multinomial(1, prob)
        ps.append(p)
    return np.array(ps)
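# Illustrative usage (not executed on import): converting integer labels to one-hot
# vectors and normalizing logits. The arrays are made up for demonstration only.
#
#   labels = np.array([[0], [2], [1]])        # to_onehot expects a 2D array of integer values
#   onehot = to_onehot(labels, dim=3)         # shape (3, 3)
#   probs = softmax(np.random.randn(3, 3))    # each row sums to (at most) 1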
def tile_concatenate(tiling_value, sequence):
    tiling = np.tile(np.expand_dims(tiling_value, 1), (1, sequence.shape[1], 1))
    sequence = np.concatenate((sequence, tiling), axis=-1)
    return sequence

def closest_factors(n):
    s = np.floor(np.sqrt(n))
    while n % s != 0:
        s -= 1
    return int(s), int(n // s)

def pad_stride_concat(a, window_left, stride=1):
    n_samples, input_dim = np.shape(a)
    padding = np.zeros((window_left, input_dim))
    a = np.concatenate((padding, a), axis=0)
    out = np.zeros((n_samples, (window_left + 1) * input_dim))
    for i in range(window_left, len(a)):
        out[i - window_left] = a[i - window_left:i + 1].flatten()
    return out

def probabilistic_round(a):
    p = np.random.uniform(size=a.shape)
    frac = a % 1
    up = np.where(frac > p)
    down = np.where(frac <= p)
    a[up] = np.ceil(a[up])
    a[down] = np.floor(a[down])
    return a

def subselect_dict_list_idxs(d_l, key, idxs_l):
    for d, idxs in zip(d_l, idxs_l):
        sub_d = dict()
        for (k, v) in d[key].items():
            sub_d[k] = v[idxs]
        d[key] = sub_d

def flatten(arr):
    '''reshape to (-1, lastdim)'''
    return np.reshape(arr, (-1, np.shape(arr)[-1]))

'''
Replay Memory
'''

class ReplayMemory(object):

    def __init__(self, maxsize=None):
        self.maxsize = maxsize
        self.mem = []

    def add(self, paths):
        self.mem.extend(paths)
        if self.maxsize:
            self.mem = self.mem[-self.maxsize:]

    def sample(self, size):
        return np.random.choice(self.mem, size)

class KeyValueReplayMemory(object):

    def __init__(self, maxsize=None):
        self.maxsize = maxsize
        self.mem = collections.defaultdict(list)

    def add(self, keys, values):
        '''
        Adds keys from values to memory

        Args:
            - keys: the keys to add, list of hashable
            - values: dict containing each key in keys
        '''
        n_samples = len(values[keys[0]])
        for key in keys:
            assert len(values[key]) == n_samples, 'n_samples from each key must match'
            self.mem[key].extend(values[key])
            if self.maxsize:
                self.mem[key] = self.mem[key][-self.maxsize:]

    def sample(self, keys, size):
        '''
        Sample a batch of size for each key and return as a dict

        Args:
            - keys: list of keys
            - size: number of samples to select
        '''
        sample = dict()
        n_samples = len(self.mem[keys[0]])
        idxs = np.random.randint(0, n_samples, size)
        for key in keys:
            sample[key] = np.take(self.mem[key], idxs, axis=0)
        return sample

'''
rllab utils
'''

def extract_wrapped_env(env, typ):
    while not isinstance(env, typ):
        # descend to wrapped env
        if hasattr(env, 'wrapped_env'):
            env = env.wrapped_env
        # not the desired type, and has no wrapped env, return None
        else:
            return None
    # reaches this point, then the env is of the desired type, return it
    return env

def extract_normalizing_env(env):
    normalizing_env = extract_wrapped_env(env, NormalizedEnv)
    if normalizing_env is None:
        normalizing_env = extract_wrapped_env(env, VectorizedNormalizedEnv)
    return normalizing_env
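# Illustrative usage (not executed on import): storing and sampling transitions with
# KeyValueReplayMemory. The keys and array shapes are made up for demonstration only.
#
#   memory = KeyValueReplayMemory(maxsize=10000)
#   memory.add(['observations', 'actions'],
#              dict(observations=np.zeros((5, 4)), actions=np.zeros((5, 2))))
#   batch = memory.sample(['observations', 'actions'], size=32)
#   # batch['observations'].shape == (32, 4) and batch['actions'].shape == (32, 2)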