from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
from tensorflow.contrib.layers.python.layers.initializers import (
    variance_scaling_initializer,
)
import tensorflow.contrib.slim as slim


def _lookup_model(models, model_type, kind):
    """Looks up model_type in the models dict.

    Shared helper for the get_*_model dispatchers below.

    Args:
        models (dict): Mapping of model_type name -> model fn.
        model_type (str): Requested key.
        kind (str): Human-readable description used in the error message.

    Returns:
        The model function registered under model_type.

    Raises:
        ValueError: If model_type is not a known key. (Raising is preferred
            over the previous print-and-exit(1), so callers can handle or
            report the error themselves.)
    """
    try:
        return models[model_type]
    except KeyError:
        raise ValueError('Unknown {}: {}. Valid options: {}'.format(
            kind, model_type, sorted(models)))


def get_image_encoder(model_type='resnet'):
    """Retrieves encoder fn for image and 3D.

    Args:
        model_type (str): Currently only 'resnet' is registered.

    Returns:
        Callable: The image-encoder function.
    """
    models = {
        'resnet': encoder_resnet,
    }
    return _lookup_model(models, model_type, 'image encoder')


def get_hallucinator_model(model_type='fc2_res'):
    """Retrieves the hallucinator fn (image features -> movie strip).

    Args:
        model_type (str): Currently only 'fc2_res' is registered.

    Returns:
        Callable: The hallucinator function.
    """
    models = {
        'fc2_res': fc2_res,
    }
    return _lookup_model(models, model_type, 'predict hal model')


def get_temporal_encoder(model_type='AZ_FC2GN'):
    """Retrieves the temporal encoder fn.

    Args:
        model_type (str): Currently only 'AZ_FC2GN' is registered.

    Returns:
        Callable: The temporal-encoder function.
    """
    models = {
        'AZ_FC2GN': az_fc2_groupnorm,
    }
    return _lookup_model(models, model_type, 'temporal encoder')


# Functions for image encoder.
def encoder_resnet(x, is_training=True, weight_decay=0.001, reuse=False):
    """Resnet v2-50 image encoder.

    Assumes input is [batch, height_in, width_in, channels]!!

    Input:
    - x: N x H x W x 3
    - weight_decay: float
    - reuse: bool->True if test

    Outputs:
    - net: N x 2048 image features (spatial dims squeezed out)
    - variables_scope: name of the variable scope holding the resnet vars
    """
    # Imported lazily so merely importing this module does not pull in the
    # slim nets package.
    from tensorflow.contrib.slim.python.slim.nets import resnet_v2
    with tf.name_scope('Encoder_resnet', [x]):
        with slim.arg_scope(
                resnet_v2.resnet_arg_scope(weight_decay=weight_decay)):
            net, end_points = resnet_v2.resnet_v2_50(
                x,
                num_classes=None,
                is_training=is_training,
                reuse=reuse,
                scope='resnet_v2_50')
            # resnet_v2_50 with num_classes=None emits N x 1 x 1 x 2048;
            # squeeze the singleton spatial axes to get N x 2048.
            net = tf.squeeze(net, axis=[1, 2])
    variables_scope = 'resnet_v2_50'
    return net, variables_scope


def encoder_fc3_dropout(x,
                        num_output=85,
                        is_training=True,
                        reuse=False,
                        name='3D_module'):
    """3D inference module. 3 MLP layers (last is the output).

    With dropout on first 2.

    Input:
    - x: N x [|img_feat|, |3D_param|]
    - reuse: bool

    Outputs:
    - 3D params: N x num_output
      if orthogonal:
           either 85: (3 + 24*3 + 10) or 109 (3 + 24*4 + 10) for factored
           axis-angle representation
      if perspective:
          86: (f, tx, ty, tz) + 24*3 + 10, or 110 for factored axis-angle.
    - variables: tf variables
    """
    with tf.variable_scope(name, reuse=reuse) as scope:
        net = slim.fully_connected(x, 1024, scope='fc1')
        net = slim.dropout(net, 0.5, is_training=is_training, scope='dropout1')
        net = slim.fully_connected(net, 1024, scope='fc2')
        net = slim.dropout(net, 0.5, is_training=is_training, scope='dropout2')
        # Small init keeps the initial IEF delta near zero.
        small_xavier = variance_scaling_initializer(
            factor=.01, mode='FAN_AVG', uniform=True)
        net = slim.fully_connected(
            net,
            num_output,
            activation_fn=None,
            weights_initializer=small_xavier,
            scope='fc3')
    variables = tf.contrib.framework.get_variables(scope)
    return net, variables


# Functions for f_{movie strip}.
def az_fc2_groupnorm(is_training, net, num_conv_layers):
    """Stacks residual conv blocks to build the temporal encoder.

    Each block has 2 convs, pre-activation style:
    norm --> relu --> conv --> norm --> relu --> conv --> add.
    Uses full convolution.

    Args:
        is_training (bool): Training-mode flag (passed to the norm layers).
        net (B x T x 2048): Input features.
        num_conv_layers (int): Number of residual blocks to stack.

    Returns:
        B x T x 2048 encoded features.
    """
    for i in range(num_conv_layers):
        net = az_fc_block2(
            net_input=net,
            num_filter=2048,
            kernel_width=3,
            is_training=is_training,
            use_groupnorm=True,
            name='block_{}'.format(i),
        )
    return net


def az_fc_block2(net_input,
                 num_filter,
                 kernel_width,
                 is_training,
                 use_groupnorm=False,
                 name=''):
    """Pre-activation residual block over the temporal axis.

    Full convolutions not separable!!
    Same as az_fc_block, but residual connection is proper Kaiming style of
    BN -> Relu -> Weight -> BN -> Relu -> Weight -> Add.

    Args:
        net_input (N x T x C): Input features.
        num_filter (int): Number of conv output channels (must equal C for
            the residual add).
        kernel_width (int): Temporal kernel size.
        is_training (bool): Passed to batch_norm (ignored by group_norm).
        use_groupnorm (bool): Group norm instead of batch norm.
        name (str): Suffix appended to every layer scope. Defaults to ''
            rather than None: the previous None default crashed on string
            concatenation when no name was given.

    Returns:
        N x T x C residual output.
    """
    # NTC -> NT1C so we can use 2D conv/norm ops with a width-1 axis.
    net_input_expand = tf.expand_dims(net_input, axis=2)
    # norm
    if use_groupnorm:
        net_norm = tf.contrib.layers.group_norm(
            net_input_expand,
            channels_axis=-1,
            reduction_axes=(-3, -2),
            scope='AZ_FC_block_preact_gn1' + name,
            reuse=None,
        )
    else:
        net_norm = tf.contrib.layers.batch_norm(
            net_input_expand,
            scope='AZ_FC_block_preact_bn1' + name,
            reuse=None,
            is_training=is_training,
        )
    # relu
    net_relu = tf.nn.relu(net_norm)
    # weight
    net_conv1 = tf.contrib.layers.conv2d(
        inputs=net_relu,
        num_outputs=num_filter,
        kernel_size=[kernel_width, 1],
        stride=1,
        padding='SAME',
        data_format='NHWC',  # was previously 'NCHW',
        rate=1,
        activation_fn=None,
        scope='AZ_FC_block2_conv1' + name,
        reuse=None,
    )
    # norm
    if use_groupnorm:
        net_norm2 = tf.contrib.layers.group_norm(
            net_conv1,
            channels_axis=-1,
            reduction_axes=(-3, -2),
            scope='AZ_FC_block_preact_gn2' + name,
            reuse=None,
        )
    else:
        net_norm2 = tf.contrib.layers.batch_norm(
            net_conv1,
            scope='AZ_FC_block_preact_bn2' + name,
            reuse=None,
            is_training=is_training,
        )
    # relu
    net_relu2 = tf.nn.relu(net_norm2)
    # weight; tiny init so the block starts close to identity.
    small_xavier = variance_scaling_initializer(
        factor=.001, mode='FAN_AVG', uniform=True)
    net_final = tf.contrib.layers.conv2d(
        inputs=net_relu2,
        num_outputs=num_filter,
        kernel_size=[kernel_width, 1],
        stride=1,
        padding='SAME',
        data_format='NHWC',
        rate=1,
        activation_fn=None,
        weights_initializer=small_xavier,
        scope='AZ_FC_block2_conv2' + name,
        reuse=None,
    )
    # NT1C -> NTC
    net_final = tf.squeeze(net_final, axis=2)
    # skip connection
    residual = tf.add(net_final, net_input)
    return residual


# Functions for f_3D.
def batch_pred_omega(input_features,
                     batch_size,
                     is_training,
                     num_output,
                     omega_mean,
                     sequence_length,
                     scope,
                     predict_delta_keys=(),
                     use_delta_from_pred=False,
                     use_optcam=False):
    """Given B x T x * inputs, computes IEF on them by batching them as BT x *.

    if use_optcam is True, only outputs 72 or 82 dims.
    and appends fixed camera [1,0,0]

    Args:
        input_features (B x T x *): Per-frame features.
        batch_size (int): B.
        is_training (bool): Forwarded to the IEF regressor.
        num_output (int): Size of each per-frame prediction.
        omega_mean (BT x 85): Starting omega for IEF.
        sequence_length (int): T.
        scope (str): Variable scope for the IEF regressor.
        predict_delta_keys (iterable): delta_t keys to also predict.
        use_delta_from_pred (bool): Init delta preds from current-frame pred.
        use_optcam (bool): Use fixed [1, 0, 0] camera for delta preds.

    Returns:
        omega_pred (B x T x num_output), and a dict of delta_t ->
        (B x T x num_output) delta predictions.
    """
    # Flatten time into batch so the regressor sees BT x *.
    input_features_reshape = tf.reshape(input_features,
                                        (batch_size * sequence_length, -1))
    omega_pred, delta_predictions = call_hmr_ief(
        phi=input_features_reshape,
        omega_start=omega_mean,
        scope=scope,
        num_output=num_output,
        is_training=is_training,
        predict_delta_keys=predict_delta_keys,
        use_delta_from_pred=use_delta_from_pred,
        use_optcam=use_optcam,
    )
    # Restore the B x T leading shape.
    omega_pred = tf.reshape(
        omega_pred,
        (batch_size, sequence_length, num_output)
    )
    new_delta_predictions = {}
    for delta_t, prediction in delta_predictions.items():
        new_delta_predictions[delta_t] = tf.reshape(
            prediction,
            (batch_size, sequence_length, num_output)
        )
    return omega_pred, new_delta_predictions


def fc2_res(phi, name='fc2_res'):
    """Converts pretrained (fixed) resnet features phi into movie strip.

    This applies 2 fc then add it to the orig as residuals.

    Args:
        phi (B x T x 2048): Image feature.
        name (str): Scope.

    Returns:
        Phi (B x T x 2048): Hallucinated movie strip.
    """
    with tf.variable_scope(name, reuse=False):
        net = slim.fully_connected(phi, 2048, scope='fc1')
        net = slim.fully_connected(net, 2048, scope='fc2')
        # Tiny init so the residual branch starts near zero.
        small_xavier = variance_scaling_initializer(
            factor=.001, mode='FAN_AVG', uniform=True)
        net_final = slim.fully_connected(
            net,
            2048,
            activation_fn=None,
            weights_initializer=small_xavier,
            scope='fc3'
        )
        new_phi = net_final + phi
    return new_phi


def call_hmr_ief(phi,
                 omega_start,
                 scope,
                 num_output=85,
                 num_stage=3,
                 is_training=True,
                 predict_delta_keys=(),
                 use_delta_from_pred=False,
                 use_optcam=True):
    """Wrapper for doing HMR-style IEF.

    If predict_delta, then also makes num_delta_t predictions forward and
    backward in time, with each step of delta_t.

    Args:
        phi (Bx2048): Image features.
        omega_start (Bx85): Starting Omega as input to first IEF.
        scope (str): Name of scope for reuse.
        num_output (int): Size of output.
        num_stage (int): Number of iterations for IEF.
        is_training (bool): If False, don't apply dropout.
        predict_delta_keys (iterable): List of keys for delta_t.
        use_delta_from_pred (bool): If True, initializes delta prediction
            from current frame prediction.
        use_optcam (bool): If True, uses [1, 0, 0] for cam.

    Returns:
        Final theta (Bx{num_output})
        Deltas predictions (dict of delta_t -> Bx{num_output})
    """
    theta_here = hmr_ief(
        phi=phi,
        omega_start=omega_start,
        scope=scope,
        num_output=num_output,
        num_stage=num_stage,
        is_training=is_training
    )

    # Delta only needs to do cam/pose, no shape!
    if use_optcam:
        num_output_delta = 72
    else:
        num_output_delta = 3 + 72

    deltas_predictions = {}
    for delta_t in predict_delta_keys:
        if delta_t == 0:
            # This should just be the normal IEF.
            continue
        elif delta_t > 0:
            scope_delta = scope + '_future{}'.format(delta_t)
        elif delta_t < 0:
            scope_delta = scope + '_past{}'.format(abs(delta_t))

        omega_start_delta = (theta_here if use_delta_from_pred
                             else omega_start)
        # append this later.
        beta = omega_start_delta[:, -10:]
        if use_optcam:
            # trim the first 3D camera + last shpae
            omega_start_delta = omega_start_delta[:, 3:3 + num_output_delta]
        else:
            omega_start_delta = omega_start_delta[:, :num_output_delta]

        delta_pred = hmr_ief(
            phi=phi,
            omega_start=omega_start_delta,
            scope=scope_delta,
            num_output=num_output_delta,
            num_stage=num_stage,
            is_training=is_training
        )
        if use_optcam:
            # Add camera + shape: fixed cam [1, 0, 0] plus carried-over beta.
            scale = tf.ones([delta_pred.shape[0], 1])
            trans = tf.zeros([delta_pred.shape[0], 2])
            delta_pred = tf.concat([scale, trans, delta_pred, beta], 1)
        else:
            delta_pred = tf.concat([delta_pred[:, :75], beta], 1)
        deltas_predictions[delta_t] = delta_pred

    return theta_here, deltas_predictions


def hmr_ief(phi,
            omega_start,
            scope,
            num_output=85,
            num_stage=3,
            is_training=True):
    """Runs HMR-style IEF (Iterative Error Feedback).

    Args:
        phi (Bx2048): Image features.
        omega_start (Bx85): Starting Omega as input to first IEF.
        scope (str): Name of scope for reuse.
        num_output (int): Size of output.
        num_stage (int): Number of iterations for IEF.
        is_training (bool): If False, don't apply dropout.

    Returns:
        Final theta (Bx{num_output})
    """
    with tf.variable_scope(scope):
        theta_prev = omega_start
        theta_here = None

        for _ in range(num_stage):
            # ---- Compute outputs: regress a correction from [phi, theta].
            state = tf.concat([phi, theta_prev], 1)
            delta_theta, _ = encoder_fc3_dropout(
                state,
                is_training=is_training,
                num_output=num_output,
                reuse=tf.AUTO_REUSE  # stages share one set of weights
            )
            # Compute new theta
            theta_here = theta_prev + delta_theta
            # Finally update to end iteration.
            theta_prev = theta_here

    return theta_here