python source code of refiner

"""
Similar to RunModel, but fine-tunes over time on openpose output.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from .ops import keypoint_l1_loss
from .models import Encoder_resnet, Encoder_fc3_dropout
from .tf_smpl.batch_lbs import batch_rodrigues
from .tf_smpl.batch_smpl import SMPL
from .tf_smpl.projection import batch_orth_proj_idrot
from .util.renderer import SMPLRenderer, draw_skeleton
from .util.image import unprocess_image
import time
from os.path import exists

import tensorflow as tf
import numpy as np


class Refiner(object):
    def __init__(self, config, num_frames, sess=None):
        """
        Args:
          config,,
        """
        # Config + path
        if not config.load_path:
            raise Exception(
                "[!] You should specify `load_path` to load a pretrained model")
        if not exists(config.load_path + '.index'):
            print('%s doesnt exist..' % config.load_path)
            import ipdb
            ipdb.set_trace()
        self.config = config
        self.load_path = config.load_path
        self.num_frames = num_frames

        self.data_format = config.data_format
        self.smpl_model_path = config.smpl_model_path

        # Visualization for fitting
        self.viz = config.viz
        self.viz_sub = 10

        # Loss & Loss weights:
        self.e_lr = config.e_lr

        self.e_loss_weight = config.e_loss_weight
        self.shape_loss_weight = config.shape_loss_weight
        self.joint_smooth_weight = config.joint_smooth_weight
        self.camera_smooth_weight = config.camera_smooth_weight
        self.keypoint_loss = keypoint_l1_loss
        self.init_pose_loss_weight = config.init_pose_loss_weight

        # Data
        self.batch_size = num_frames
        self.img_size = config.img_size

        input_size = (self.batch_size, self.img_size, self.img_size, 3)
        self.images_pl = tf.placeholder(tf.float32, shape=input_size)
        self.img_feat_pl = tf.placeholder(tf.float32, shape=(self.batch_size, 2048))
        self.img_feat_var = tf.get_variable("img_feat_var", dtype=tf.float32, shape=(self.batch_size, 2048))        
        kp_size = (self.batch_size, 19, 3)
        self.kps_pl = tf.placeholder(tf.float32, shape=kp_size)

        # Camera type!
        self.num_cam = 3
        self.proj_fn = batch_orth_proj_idrot
        self.num_theta = 72  # 24 * 3
        self.total_params = self.num_theta + self.num_cam + 10

        # Model spec
        # For visualization
        if self.viz:
            self.renderer = SMPLRenderer(img_size=self.img_size, face_path=config.smpl_face_path)

        # Instantiate SMPL
        self.smpl = SMPL(self.smpl_model_path)

        self.theta0_pl_shape = [self.batch_size, self.total_params]
        self.theta0_pl = tf.placeholder_with_default(
            self.load_mean_param(), shape=self.theta0_pl_shape, name='theta0')

        # Optimization space.
        self.refine_inpose = config.refine_inpose
        if self.refine_inpose:
            self.theta_pl = tf.placeholder(tf.float32, shape=self.theta0_pl_shape, name='theta_pl')
            self.theta_var = tf.get_variable("theta_var", dtype=tf.float32, shape=self.theta0_pl_shape)

        # For ft-loss
        self.shape_pl = tf.placeholder_with_default(tf.zeros(10), shape=(10,), name='beta0')
        # For stick-to-init-pose loss:
        self.init_pose_pl = tf.placeholder_with_default(tf.zeros([num_frames, 72]), shape=(num_frames, 72), name='pose0')
        self.init_pose_weight_pl = tf.placeholder_with_default(tf.ones([num_frames, 1]), shape=(num_frames, 1), name='pose0_weights')
        # For camera loss
        self.scale_factors_pl = tf.placeholder_with_default(tf.ones([num_frames]), shape=(num_frames), name='scale_factors')
        self.offsets_pl = tf.placeholder_with_default(tf.zeros([num_frames, 2]), shape=(num_frames, 2), name='offsets')

        # Build model!
        self.ief = config.ief
        if self.ief:
            self.num_stage = config.num_stage
            self.build_refine_model()
        else:
            print('never here')
            import ipdb
            ipdb.set_trace()
            
        all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        # Exclude the new variable.
        all_vars_filtered = [v for v in all_vars if ('img_feat_var' not in v.name) and ('theta_var' not in v.name)]
        self.saver = tf.train.Saver(all_vars_filtered)

        if sess is None:
            self.sess = tf.Session()            
        else:
            self.sess = sess

        new_vars = [v for v in all_vars if ('img_feat_var' in v.name) or ('theta_var' in v.name)]
        self.sess.run(tf.variables_initializer(new_vars))

        # coord = tf.train.Coordinator()
        # threads = tf.train.start_queue_runners(sess=self.sess, coord=coord)
        self.prepare()

    def load_mean_param(self):
        mean = np.zeros((1, self.total_params))
        mean[0, 0] = 0.9  # This is scale.

        mean = tf.constant(mean, tf.float32)

        self.mean_var = tf.Variable(
            mean, name="mean_param", dtype=tf.float32, trainable=True)
        # self.E_var.append(self.mean_var)
        init_mean = tf.tile(self.mean_var, [self.batch_size, 1])
        # 85D consists of [cam (3), pose (72), shapes (10)]
        # cam is [scale, tx, ty]
        return init_mean

    def prepare(self):
        print('Restoring checkpoint %s..' % self.load_path)
        self.saver.restore(self.sess, self.load_path)
        self.mean_value = self.sess.run(self.mean_var)

    def build_refine_model(self):
        img_enc_fn = Encoder_resnet
        threed_enc_fn = Encoder_fc3_dropout
        
        self.img_feat, self.E_var = img_enc_fn(
            self.images_pl,
            is_training=False,
            reuse=False)

        self.set_img_feat_var = self.img_feat_var.assign(self.img_feat_pl)

        # Start loop
        self.all_verts = []
        self.all_kps = []
        self.all_cams = []
        self.all_Js = []
        self.all_Jsmpl = []
        self.final_thetas = []
        theta_prev = self.theta0_pl
        for i in np.arange(self.num_stage):
            print('Iteration %d' % i)
            # ---- Compute outputs
            state = tf.concat([self.img_feat_var, theta_prev], 1)

            if i == 0:
                delta_theta, threeD_var = threed_enc_fn(
                    state,
                    num_output=self.total_params,
                    is_training=False,
                    reuse=False)
                self.E_var.append(threeD_var)
            else:
                delta_theta, _ = threed_enc_fn(
                    state,
                    num_output=self.total_params,
                    is_training=False,
                    reuse=True)
            # Compute new theta
            theta_here = theta_prev + delta_theta
            # cam = N x 3, pose N x self.num_theta, shape: N x 10
            cams = theta_here[:, :self.num_cam]
            poses = theta_here[:, self.num_cam:(self.num_cam + self.num_theta)]
            shapes = theta_here[:, (self.num_cam + self.num_theta):]
            # Rs_wglobal is Nx24x3x3 rotation matrices of poses
            verts, Js, pred_Rs = self.smpl(shapes, poses, get_skin=True)
            Jsmpl = self.smpl.J_transformed

            # Project to 2D!
            pred_kp = self.proj_fn(Js, cams, name='proj_2d_stage%d' % i)
            self.all_verts.append(verts)
            self.all_kps.append(pred_kp)
            self.all_cams.append(cams)
            self.all_Js.append(Js)
            self.all_Jsmpl.append(Jsmpl)
            # save each theta.
            self.final_thetas.append(theta_here)
            # Finally)update to end iteration.
            theta_prev = theta_here

        # Compute everything with the final theta.
        if self.refine_inpose:
            self.set_theta_var = self.theta_var.assign(self.theta_pl)
            theta_final = self.theta_var
        else:
            theta_final = theta_here

        cams = theta_final[:, :self.num_cam]
        poses = theta_final[:, self.num_cam:(self.num_cam + self.num_theta)]
        shapes = theta_final[:, (self.num_cam + self.num_theta):]
        # Rs_wglobal is Nx24x3x3 rotation matrices of poses
        verts, Js, pred_Rs = self.smpl(shapes, poses, get_skin=True)
        Jsmpl = self.smpl.J_transformed
        # Project to 2D!
        pred_kp = self.proj_fn(Js, cams, name='proj_2d_stage%d' % i)

        self.all_verts.append(verts)
        self.all_kps.append(pred_kp)
        self.all_cams.append(cams)
        self.all_Js.append(Js)
        self.all_Jsmpl.append(Jsmpl)
        # save each theta.
        self.final_thetas.append(theta_final)

        # Compute new losses!!
        self.e_loss_kp = self.e_loss_weight * self.keypoint_loss(self.kps_pl,
                                                                 pred_kp)
        # Beta variance should be low!
        self.loss_shape = self.shape_loss_weight * shape_variance(shapes, self.shape_pl)
        self.loss_init_pose = self.init_pose_loss_weight * init_pose(pred_Rs, self.init_pose_pl, weights=self.init_pose_weight_pl)
        # Endpoints should be smooth!!
        self.loss_joints = self.joint_smooth_weight * joint_smoothness(Js)
        # Camera should be smooth
        self.loss_camera = self.camera_smooth_weight * camera_smoothness(cams, self.scale_factors_pl, self.offsets_pl, img_size=self.config.img_size)

        self.total_loss = self.e_loss_kp + self.loss_shape + self.loss_joints + self.loss_init_pose + self.loss_camera

        # Setup optimizer
        print('Setting up optimizer..')
        self.optimizer = tf.train.AdamOptimizer
        e_optimizer = self.optimizer(self.e_lr)

        if self.refine_inpose:
            self.e_opt = e_optimizer.minimize(self.total_loss, var_list=[self.theta_var])
        else:
            self.e_opt = e_optimizer.minimize(self.total_loss, var_list=[self.img_feat_var])

        print('Done initializing the model!')


    def predict(self, images, kps, scale_factors, offsets):
        """
        images: num_batch, img_size, img_size, 3
        kps: num_batch x 19 x 3
        Preprocessed to range [-1, 1]

        scale_factors, offsets: used to preprocess the bbox

        Runs the model with images.
        """
        ## Initially, get the encoding of images:
        feed_dict = { self.images_pl: images }
        fetch_dict = {'img_feats': self.img_feat}

        img_feats = self.sess.run(self.img_feat, feed_dict)

        feed_dict = {
            self.img_feat_pl: img_feats,
            self.kps_pl: kps,
        }

        self.sess.run(self.set_img_feat_var, feed_dict)

        if self.refine_inpose:
            # Take -2 bc that's the actual theta (-1 is still placeholder)
            use_res = -2
        else:
            use_res = -1
        fetch_dict = {
            'theta': self.final_thetas[use_res],
            'joints': self.all_kps[use_res],
            'verts': self.all_verts[use_res],
        }

        init_result = self.sess.run(fetch_dict, feed_dict)

        shapes = init_result['theta'][:, -10:]
        # Save mean shape of this trajectory:
        mean_shape = np.mean(shapes, axis=0)
        feed_dict[self.shape_pl] = mean_shape

        # Save initial pose output:
        init_pose = init_result['theta'][:, 3:3+72]
        feed_dict[self.init_pose_pl] = init_pose
        # import ipdb; ipdb.set_trace()

        if self.refine_inpose:
            print('Doing optimization in pose space!!')
            feed_dict[self.theta_pl] = init_result['theta']
            self.sess.run(self.set_theta_var, feed_dict)

        if self.config.use_weighted_init_pose:
            print('Using weighted init pose!')
            # Compute weights according to op match.
            gt_kps = np.stack(kps)
            vis = gt_kps[:, :, 2, None]
            diff = vis * (gt_kps[:, :, :2] - init_result['joints'])
            # L2:
            error = np.linalg.norm(diff.reshape(diff.shape[0], -1), axis=1, ord=1)
            # N x 1
            weights = np.expand_dims(np.exp(-(error / error.max())**2), 1)

            feed_dict[self.init_pose_weight_pl] = weights

        # Add scale factor & offsets for camera loss.
        feed_dict[self.scale_factors_pl] = scale_factors
        feed_dict[self.offsets_pl] = offsets

        # All fetches:
        fetch_dict = {
            'joints': self.all_kps[-1],
            'verts': self.all_verts[-1],
            'cams': self.all_cams[-1],
            'joints3d': self.all_Js[-1],
            'theta': self.final_thetas[-1],
            'total_loss': self.total_loss,
            'loss_kp': self.e_loss_kp,
            'loss_shape': self.loss_shape,
            'loss_joints': self.loss_joints,
            'loss_camera': self.loss_camera,
            'loss_init_pose': self.loss_init_pose,
            'optim': self.e_opt,
        }

        if self.viz:
            # Precompute op imgs bc they are constant
            crops = [
                unprocess_image(image) for image in images[::self.viz_sub]
            ]
            op_kps = [
                np.hstack((self.img_size * ((kp[:, :2] + 1) * 0.5), kp[:, 2, None]))
                for kp in kps[::self.viz_sub]
            ]
            op_imgs = [
                draw_skeleton(crop, kp, vis=kp[:, 2] > 0.1)
                for crop, kp in zip(crops, op_kps)
            ]
            op_img = np.hstack(op_imgs)

            proj_img, rend_img = self.visualize_seq(crops, init_result)
            import matplotlib.pyplot as plt
            plt.ion()
            plt.figure(1)
            plt.clf()
            plt.suptitle('init')
            plt.subplot(311)
            plt.imshow(op_img)
            plt.axis('off')
            plt.title('openpose joints')
            plt.subplot(312)
            plt.imshow(proj_img)
            plt.axis('off')
            plt.title('proj joints')
            plt.subplot(313)
            plt.imshow(rend_img)
            plt.axis('off')
            plt.title('rend verts')
            plt.pause(1e-3)
            import ipdb; ipdb.set_trace()

        all_loss_keys = ['loss_kp', 'loss_shape', 'loss_joints', 'loss_init_pose', 'loss_camera']
        tbegin = time.time()
        num_iter = self.config.num_refine
        loss_records = {}
        for step in xrange(num_iter):
            result = self.sess.run(fetch_dict, feed_dict)
            loss_keys = [key for key in all_loss_keys if key in result.keys()]
            total_loss = result['total_loss']
            msg_prefix = 'iter %d/%d, total_loss %.2g' % (step, num_iter, total_loss)
            msg_raw = ['%s: %.2g' % (key, result[key]) for key in loss_keys]
            print(msg_prefix + ' ' + ' ,'.join(msg_raw))

            if step == 0:
                for key in loss_keys:
                    loss_records[key] = [result[key]]
            else:
                for key in loss_keys:
                    loss_records[key].append(result[key])

            # Do visualization
            if self.viz and step > 0 and step % 50 == 0:
                proj_img, rend_img = self.visualize_seq(crops, result)
                import matplotlib.pyplot as plt
                plt.ion()
                plt.figure(2)
                plt.clf()                
                plt.suptitle('iter %d' % step) 
                plt.subplot(311)
                plt.imshow(op_img)
                plt.axis('off')
                plt.title('openpose joints')
                plt.subplot(312)
                plt.imshow(proj_img)
                plt.axis('off')                
                plt.title('proj joints')
                plt.subplot(313)
                plt.imshow(rend_img)
                plt.axis('off')                                
                plt.title('rend verts')
                plt.pause(1e-3)
                import ipdb; ipdb.set_trace()                

        total_time = time.time() - tbegin
        print('Total time %g' % total_time)

        del result['optim']
        del result['verts']

        result['loss_records'] = loss_records

        return result

    def visualize_seq(self, crops, result):
        """
        For weight tuning,, see:
        first row, the original renders (or original kp)
        second row, open pose
        third row, proj kp
        For every 10th frame or something like this.
        """
        pred_kps = (result['joints'][::self.viz_sub] + 1) * 0.5 * self.img_size

        proj_imgs = [
            draw_skeleton(crop, kp) for crop, kp in zip(crops, pred_kps)
        ]
        proj_img = np.hstack(proj_imgs)

        t0 = time.time()
        # Use camera to figure out the z.
        cam_scales = result['theta'][:, :0]
        tzs = [500. / (0.5 * self.config.img_size * cam_s) for cam_s in cam_scales]
        # rend_imgs = [
        #     self.renderer(vert + np.array([0, 0, tz])) for (vert, tz) in zip(result['verts'][::self.viz_sub], tzs[::self.viz_sub])
        # ]
        rend_imgs = [
            self.renderer(vert + np.array([0, 0, 6])) for vert in result['verts'][::self.viz_sub]
        ]        
        rend_img = np.hstack(rend_imgs)
        t1 = time.time()
        print('Took %f sec to render %d imgs' % (t1 - t0, len(rend_imgs)))

        return proj_img, rend_img

# All the  loss functions.

def shape_variance(shapes, target_shape=None):
    # Shapes is F x 10
    # Compute variance.
    if target_shape is not None:
        N = tf.shape(shapes)[0]
        target_shapes = tf.tile(tf.expand_dims(target_shape, 0), [N, 1])
        return tf.losses.mean_squared_error(target_shapes, shapes)
    else:
        _, var = tf.nn.moments(shapes, axes=0)
        return tf.reduce_mean(var)


def pose_smoothness(poses, global_only=False):
    """
    # Poses is F x 24 x 3 x 3
    Computes \sum ||p_i - p_{i+1}||
    On the pose in Rotation matrices space.
    It compues the angle between the two rotations:
    (tr(R) - 1) / 2 = cos(theta)
    So penalize acos((tr(R) - 1) / 2) --> this nans
    So:
    minimize: (1 - tr(R_1*R_2')) / 2 = -cos(theta) of R_1*R_2'
    min at -1.
    """
    # These are F-1 x 24 x 3 x 3 (Ok this is exactly the same..)
    curr_pose = poses[:-1]
    next_pose = poses[1:]
    RRt = tf.matmul(curr_pose, next_pose, transpose_b=True)

    # For min (1-tr(RR_T)) / 2
    costheta = (tf.trace(RRt) - 1) / 2.
    target = tf.ones_like(costheta)
    if global_only:
        print('Pose smoothness increased on global!')
        weights_global = 10 * tf.expand_dims(tf.ones_like(costheta[:, 0]), 1)
        weights_joints = tf.ones_like(costheta[:, 1:])
        weights = tf.concat([weights_global, weights_joints], 1)
    else:
        weights = tf.ones_like(costheta)
    return tf.losses.mean_squared_error(target, costheta, weights=weights)


def joint_smoothness(joints):
    """
    joints: N x 19 x 3
    Computes smoothness of joints relative to root.
    """
    if joints.shape[1] == 19:
        left_hip, right_hip = 3, 2
        root = (joints[:, left_hip] + joints[:, right_hip]) / 2.
        root = tf.expand_dims(root, 1)

        joints = joints - root
    else:
        print('Unknown skeleton type')
        import ipdb; ipdb.set_trace()

    curr_joints = joints[:-1]
    next_joints = joints[1:]
    return tf.losses.mean_squared_error(curr_joints, next_joints)


def init_pose(pred_Rs, init_pose, weights=None):
    """
    Should stay close to initial weights
    pred_Rs is N x 24 x 3 x 3
    init_pose is 72D, need to conver to Rodrigues
    """
    init_Rs = batch_rodrigues(tf.reshape(init_pose, [-1, 3]))
    init_Rs = tf.reshape(init_Rs, [-1, 24, 3, 3])
    RRt = tf.matmul(init_Rs, pred_Rs, transpose_b=True)
    costheta = (tf.trace(RRt) - 1) / 2.
    target = tf.ones_like(costheta)
    if weights is None:
        weights = tf.ones_like(costheta)
    return tf.losses.mean_squared_error(target, costheta, weights=weights)


def camera_smoothness(cams, scale_factors, offsets, img_size=224):
    # cams: [s, tx, ty]

    scales = cams[:, 0]
    actual_scales = scales * (1./scale_factors)
    trans = cams[:, 1:]
    # pred trans + bbox top left corner / img_size
    actual_trans = ((trans + 1) * img_size * 0.5 + offsets) / img_size

    curr_scales = actual_scales[:-1]
    next_scales = actual_scales[1:]

    curr_trans = actual_trans[:-1]
    next_trans = actual_trans[1:]

    scale_diff = tf.losses.mean_squared_error(curr_scales, next_scales)
    trans_diff = tf.losses.mean_squared_error(curr_trans, next_trans)
    return scale_diff + trans_diff