import numpy as np
import torch
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, writers

np.random.seed(0)


def cost_dz(R_z, z, z_goal):
    # compute the first-order derivative of the latent cost w.r.t. z
    z_diff = np.expand_dims(z - z_goal, axis=-1)
    return np.squeeze(2 * np.matmul(R_z, z_diff))


def cost_du(R_u, u):
    # compute the first-order derivative of the latent cost w.r.t. u
    return np.atleast_1d(np.squeeze(2 * np.matmul(R_u, np.expand_dims(u, axis=-1))))


def cost_dzz(R_z):
    # compute the second-order derivative of the latent cost w.r.t. z
    return 2 * R_z


def cost_duu(R_u):
    # compute the second-order derivative of the latent cost w.r.t. u
    return 2 * R_u


def cost_duz(z, u):
    # compute the second-order derivative of the latent cost w.r.t. u and z
    return np.zeros((u.shape[-1], z.shape[-1]))


def latent_cost(R_z, R_u, z_seq, z_goal, u_seq):
    z_diff = np.expand_dims(z_seq - z_goal, axis=-1)
    cost_z = np.squeeze(np.matmul(np.matmul(z_diff.transpose((0, 2, 1)), R_z), z_diff))
    u_seq_reshaped = np.expand_dims(u_seq, axis=-1)
    cost_u = np.squeeze(np.matmul(np.matmul(u_seq_reshaped.transpose((0, 2, 1)), R_u), u_seq_reshaped))
    return np.sum(cost_z) + np.sum(cost_u)


def one_step_back(R_z, R_u, z, u, z_goal, A, B, V_prime_next_z, V_prime_next_zz, mu_inv_regulator):
    """
    V_prime_next_z: first-order derivative of the value function at time step t+1
    V_prime_next_zz: second-order derivative of the value function at time step t+1
    A: derivative of F(z, u) w.r.t. z at z_bar_t, u_bar_t
    B: derivative of F(z, u) w.r.t. u at z_bar_t, u_bar_t
    """
    # compute Q_z, Q_u, Q_zz, Q_uu, Q_uz using the cost function, A, B and V
    Q_z = cost_dz(R_z, z, z_goal) + np.matmul(A.transpose(), V_prime_next_z)
    Q_u = cost_du(R_u, u) + np.matmul(B.transpose(), V_prime_next_z)
    Q_zz = cost_dzz(R_z) + np.matmul(np.matmul(A.transpose(), V_prime_next_zz), A)
    Q_uz = cost_duz(z, u) + np.matmul(np.matmul(B.transpose(), V_prime_next_zz), A)
    Q_uu = cost_duu(R_u) + np.matmul(np.matmul(B.transpose(), V_prime_next_zz), B)

    # compute the k and K matrices; add regularization to Q_uu
    Q_uu_regularized = Q_uu + mu_inv_regulator * np.eye(Q_uu.shape[0])
    Q_uu_in = np.linalg.inv(Q_uu_regularized)
    k = -np.matmul(Q_uu_in, Q_u)
    K = -np.matmul(Q_uu_in, Q_uz)

    # compute V_z and V_zz using k and K
    V_prime_z = Q_z + np.matmul(Q_uz.transpose(), k)
    V_prime_zz = Q_zz + np.matmul(Q_uz.transpose(), K)
    return k, K, V_prime_z, V_prime_zz


def backward(R_z, R_u, z_seq, u_seq, z_goal, A_seq, B_seq, inv_regulator):
    """
    do the backward pass
    return a sequence of k and K matrices
    """
    # first- and second-order derivatives of the value function at the last time step
    V_prime_next_z = cost_dz(R_z, z_seq[-1], z_goal)
    V_prime_next_zz = cost_dzz(R_z)
    k, K = [], []
    act_seq_len = len(u_seq)
    for t in reversed(range(act_seq_len)):
        k_t, K_t, V_prime_z, V_prime_zz = one_step_back(
            R_z, R_u, z_seq[t], u_seq[t], z_goal, A_seq[t], B_seq[t],
            V_prime_next_z, V_prime_next_zz, inv_regulator
        )
        k.insert(0, k_t)
        K.insert(0, K_t)
        V_prime_next_z, V_prime_next_zz = V_prime_z, V_prime_zz
    return k, K
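

# ---------------------------------------------------------------------------
# Minimal sanity check for backward() on a toy linear latent system.
# Not part of the original planner: the system matrices, cost weights, horizon
# and the helper name are illustrative assumptions. Because the toy dynamics
# are exactly linear, a single backward pass recovers the LQR gains, so rolling
# out u_t = u_bar_t + k_t + K_t (z_t - z_bar_t) should give a cost well below
# that of the zero-control nominal trajectory.
def _backward_pass_sanity_check(horizon=10):
    z_dim, u_dim = 2, 1
    A = np.array([[1.0, 0.1], [0.0, 1.0]])
    B = np.array([[0.0], [0.1]])
    R_z, R_u = np.eye(z_dim), 0.1 * np.eye(u_dim)
    z_goal = np.zeros(z_dim)

    # nominal trajectory: zero controls rolled through the linear dynamics
    u_seq = np.zeros((horizon, u_dim))
    z_seq = [np.array([1.0, 0.0])]
    for t in range(horizon):
        z_seq.append(np.matmul(A, z_seq[t]) + np.matmul(B, u_seq[t]))
    z_seq = np.array(z_seq)

    # the linearization of a linear system is the system itself
    A_seq, B_seq = [A] * horizon, [B] * horizon
    k, K = backward(R_z, R_u, z_seq, u_seq, z_goal, A_seq, B_seq, inv_regulator=1e-5)

    # roll out the updated controls and accumulate the quadratic cost
    z, cost = z_seq[0], 0.0
    for t in range(horizon):
        u = u_seq[t] + k[t] + np.matmul(K[t], z - z_seq[t])
        cost += np.dot(z - z_goal, np.matmul(R_z, z - z_goal)) + np.dot(u, np.matmul(R_u, u))
        z = np.matmul(A, z) + np.matmul(B, u)
    cost += np.dot(z - z_goal, np.matmul(R_z, z - z_goal))  # terminal state cost
    return cost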


def forward(z_seq, u_seq, k, K, dynamics, alpha):
    """
    update the trajectory, given k and K
    !!!! update using the linearization matrices (A and B), not the learned dynamics
    """
    z_seq_new = []
    z_seq_new.append(z_seq[0])
    u_seq_new = []
    for i in range(0, len(u_seq)):
        u_new = u_seq[i] + alpha * k[i] + np.matmul(K[i], z_seq_new[i] - z_seq[i])
        u_seq_new.append(u_new)
        with torch.no_grad():
            z_new = dynamics(
                torch.from_numpy(z_seq_new[i]).unsqueeze(0),
                torch.from_numpy(u_new).unsqueeze(0)
            )[0].mean
        z_seq_new.append(z_new.squeeze().numpy())
    return np.array(z_seq_new), np.array(u_seq_new)


# def forward(u_seq, k_seq, K_seq, A_seq, B_seq, alpha):
#     """
#     update the trajectory, given k and K
#     !!!! update using the linearization matrices (A and B), not the learned dynamics
#     """
#     u_new_seq = []
#     plan_len = len(u_seq)
#     z_dim = K_seq[0].shape[1]
#     for i in range(0, plan_len):
#         if i == 0:
#             z_delta = np.zeros(z_dim)
#         else:
#             z_delta = np.matmul(A_seq[i - 1], z_delta) + np.matmul(B_seq[i - 1], u_delta)
#         u_delta = alpha * (k_seq[i] + np.matmul(K_seq[i], z_delta))
#         u_new_seq.append(u_seq[i] + u_delta)
#     return np.array(u_new_seq)


def get_x_data(mdp, state, config):
    image_data = mdp.render(state).squeeze()
    x_dim = config['obs_shape']
    if config['task'] == 'plane':
        x_dim = np.prod(x_dim)
        x_data = torch.from_numpy(image_data).double().view(x_dim).unsqueeze(0)
    elif config['task'] in ['swing', 'balance']:
        x_dim = np.prod(x_dim)
        x_data = np.vstack((image_data, image_data))
        x_data = torch.from_numpy(x_data).double().view(x_dim).unsqueeze(0)
    elif config['task'] in ['cartpole', 'threepole']:
        x_data = torch.zeros(size=(2, 80, 80))
        x_data[0, :, :] = torch.from_numpy(image_data)
        x_data[1, :, :] = torch.from_numpy(image_data)
        x_data = x_data.unsqueeze(0)
    return x_data


def update_horizon_start(mdp, s, u, encoder, config):
    s_next = mdp.transition_function(s, u)
    if config['task'] == 'plane':
        x_next = get_x_data(mdp, s_next, config)
    elif config['task'] in ['swing', 'balance']:
        obs = mdp.render(s).squeeze()
        obs_next = mdp.render(s_next).squeeze()
        obs_stacked = np.vstack((obs, obs_next))
        x_dim = np.prod(config['obs_shape'])
        x_next = torch.from_numpy(obs_stacked).view(x_dim).unsqueeze(0).double()
    elif config['task'] in ['cartpole', 'threepole']:
        obs = mdp.render(s).squeeze()
        obs_next = mdp.render(s_next).squeeze()
        x_next = torch.zeros(size=config['obs_shape'])
        x_next[0, :, :] = torch.from_numpy(obs)
        x_next[1, :, :] = torch.from_numpy(obs_next)
        x_next = x_next.unsqueeze(0)
    with torch.no_grad():
        z_next = encoder(x_next).mean
    return s_next, z_next.squeeze().numpy()


def random_uniform_actions(mdp, plan_len):
    # create a trajectory of random actions
    random_actions = []
    for i in range(plan_len):
        action = mdp.sample_random_action()
        random_actions.append(action)
    return np.array(random_actions)


def random_extreme_actions(mdp, plan_len):
    # create a trajectory of extreme actions
    extreme_actions = []
    for i in range(plan_len):
        action = mdp.sample_extreme_action()
        extreme_actions.append(action)
    return np.array(extreme_actions)


def random_actions_trajs(mdp, num_uniform, num_extreme, plan_len):
    actions_trajs = []
    for i in range(num_uniform):
        actions_trajs.append(random_uniform_actions(mdp, plan_len))
    for j in range(num_extreme):
        actions_trajs.append(random_extreme_actions(mdp, plan_len))
    return actions_trajs
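

# ---------------------------------------------------------------------------
# Illustrative helper, not part of the original module: shows how the candidate
# action trajectories from random_actions_trajs() might be ranked. Each candidate
# is rolled out in latent space with compute_latent_traj() (defined below) and
# scored with latent_cost(); the index of the cheapest rollout plays the role of
# traj_opt_id in refresh_actions_trajs() below. Assumes the same dynamics
# interface used throughout this file, i.e. dynamics(z, u)[0].mean.
def _score_actions_trajs(z_start, actions_trajs, dynamics, R_z, R_u, z_goal):
    costs = []
    for u_seq in actions_trajs:
        z_seq = compute_latent_traj(z_start, u_seq, dynamics)
        # score the visited latent states (including the terminal one) and controls
        costs.append(latent_cost(R_z, R_u, np.array(z_seq), z_goal, u_seq))
    traj_opt_id = int(np.argmin(costs))
    return traj_opt_id, costs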


def refresh_actions_trajs(actions_trajs, traj_opt_id, mdp, length, num_uniform, num_extreme):
    for traj_id in range(len(actions_trajs)):
        if traj_id == traj_opt_id:
            actions_trajs[traj_id] = actions_trajs[traj_id][1:]
            if len(actions_trajs[traj_id]) < length:
                # Duplicate last action.
                actions_trajs[traj_id] = np.append(
                    actions_trajs[traj_id], actions_trajs[traj_id][-1].reshape(1, -1), axis=0
                )
            continue
        if traj_id < num_uniform:
            actions_trajs[traj_id] = random_uniform_actions(mdp, length)
        else:
            actions_trajs[traj_id] = random_extreme_actions(mdp, length)
    return actions_trajs


def update_seq_act(z_seq, z_start, u_seq, k, K, dynamics):
    """
    update the trajectory, given k and K
    """
    z_new = z_start
    u_seq_new = []
    for i in range(0, len(u_seq)):
        u_new = u_seq[i] + k[i] + np.matmul(K[i], (z_new - z_seq[i]))
        with torch.no_grad():
            z_new = dynamics(torch.from_numpy(z_new).view(1, -1), torch.from_numpy(u_new).view(1, -1))[0].mean
        z_new = z_new.squeeze().numpy()
        u_seq_new.append(u_new)
    return np.array(u_seq_new)


def compute_latent_traj(z_start, u_seq, dynamics):
    plan_len = len(u_seq)
    z_seq = [z_start]
    for i in range(plan_len):
        z = torch.from_numpy(z_seq[i]).view(1, -1).double()
        u = torch.from_numpy(u_seq[i]).view(1, -1).double()
        with torch.no_grad():
            z_next = dynamics(z, u)[0].mean
        z_seq.append(z_next.squeeze().numpy())
    return z_seq


def jacobian(dynamics, z, u):
    """
    compute the jacobian of F(z, u) w.r.t. z, u
    """
    z_dim = z.shape[0]
    u_dim = u.shape[0]
    z_tensor = torch.from_numpy(z).view(1, -1).double()
    u_tensor = torch.from_numpy(u).view(1, -1).double()
    if dynamics.armotized:
        _, A, B = dynamics(z_tensor, u_tensor)
        return A.squeeze().view(z_dim, z_dim).numpy(), B.squeeze().view(z_dim, u_dim).numpy()
    z_tensor, u_tensor = z_tensor.squeeze().repeat(z_dim, 1), u_tensor.squeeze().repeat(z_dim, 1)
    z_tensor = z_tensor.detach().requires_grad_(True)
    u_tensor = u_tensor.detach().requires_grad_(True)
    z_next = dynamics(z_tensor, u_tensor)[0].mean
    grad_inp = torch.eye(z_dim)
    A, B = torch.autograd.grad(z_next, [z_tensor, u_tensor], [grad_inp, grad_inp])
    return A.numpy(), B.numpy()


def seq_jacobian(dynamics, z_seq, u_seq):
    """
    compute the jacobian w.r.t. each pair in the trajectory
    """
    A_seq, B_seq = [], []
    horizon = len(u_seq)
    for i in range(horizon):
        z, u = z_seq[i], u_seq[i]
        A, B = jacobian(dynamics, z, u)
        A_seq.append(A)
        B_seq.append(B)
    return A_seq, B_seq


def save_traj(images, image_goal, gif_path, task):
    # save the trajectory as a gif file
    fig, aa = plt.subplots(1, 2)
    m1 = aa[0].matshow(images[0], cmap=plt.cm.gray, vmin=0., vmax=1.)
    aa[0].set_title('Time step 0')
    aa[0].set_yticklabels([])
    aa[0].set_xticklabels([])
    m2 = aa[1].matshow(image_goal, cmap=plt.cm.gray, vmin=0., vmax=1.)
    aa[1].set_title('goal')
    aa[1].set_yticklabels([])
    aa[1].set_xticklabels([])
    fig.tight_layout()

    def updatemat2(t):
        m1.set_data(images[t])
        aa[0].set_title('Time step ' + str(t))
        m2.set_data(image_goal)
        return m1, m2

    frames = len(images)
    if task == 'plane':
        fps = 2
    else:
        fps = 20

    anim = FuncAnimation(fig, updatemat2, frames=frames, interval=200, blit=True, repeat=True)
    Writer = writers['imagemagick']  # animation.writers.avail
    writer = Writer(fps=fps, metadata=dict(artist='Me'), bitrate=1800)
    anim.save(gif_path, writer=writer)
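

# ---------------------------------------------------------------------------
# Condensed sketch, for reference only, of how the utilities above can compose
# into iLQR iterations on a learned latent dynamics model: roll out the nominal
# trajectory, linearize, run the backward pass, then apply forward() with a
# backtracking line search over alpha. The helper name, iteration count,
# regularizer and step sizes are illustrative assumptions, not a verbatim copy
# of the planning script that uses this module.
def _ilqr_sketch(z_start, z_goal, u_seq, dynamics, R_z, R_u,
                 num_iters=10, inv_regulator=1e-5, alphas=(1.0, 0.5, 0.25)):
    z_seq = compute_latent_traj(z_start, u_seq, dynamics)
    best_cost = latent_cost(R_z, R_u, np.array(z_seq), z_goal, u_seq)
    for _ in range(num_iters):
        # linearize the learned dynamics around the current nominal trajectory
        A_seq, B_seq = seq_jacobian(dynamics, z_seq, u_seq)
        k, K = backward(R_z, R_u, np.array(z_seq), u_seq, z_goal, A_seq, B_seq, inv_regulator)
        # backtracking line search: accept the first step size that lowers the cost
        for alpha in alphas:
            z_cand, u_cand = forward(np.array(z_seq), u_seq, k, K, dynamics, alpha)
            cand_cost = latent_cost(R_z, R_u, z_cand, z_goal, u_cand)
            if cand_cost < best_cost:
                z_seq, u_seq, best_cost = list(z_cand), u_cand, cand_cost
                break
        else:
            # no step size improved the cost; stop early
            break
    return np.array(z_seq), u_seq, best_cost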