python source code of inception_resnet

"""Xception V1 model for Keras.
On ImageNet, this model gets to a top-1 validation accuracy of 0.790
and a top-5 validation accuracy of 0.945.
Do note that the input image format for this model is different than for
the VGG16 and ResNet models (299x299 instead of 224x224),
and that the input preprocessing function
is also different (same as Inception V3).
# Reference
- [Xception: Deep Learning with Depthwise Separable Convolutions](
    https://arxiv.org/abs/1610.02357)
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from keras.layers import Input, Dense, Activation, Flatten, Conv2D, MaxPooling2D, BatchNormalization, GlobalAveragePooling2D, AveragePooling2D, TimeDistributed, Concatenate, Lambda

from keras import backend as K

from keras_frcnn.RoiPoolingConv import RoiPoolingConv
from keras_frcnn.FixedBatchNormalization import FixedBatchNormalization


def get_weight_path():
    return os.path.join('keras_frcnn', 'weights','inception_resnet_v2.h5')


def get_img_output_length(width, height):
    def get_output_length(input_length):
        # filter_sizes = [3, 3, 3, 3, 3, 3, 3]
        # strides = [2, 1, 2, 1, 2, 2, 2]
        filter_sizes = [3, 3, 3, 3, 3, 3]
        strides = [2, 1, 2, 1, 2, 2]

        assert len(filter_sizes) == len(strides)

        for i in range(len(filter_sizes)):
            input_length = (input_length - filter_sizes[i]) // strides[i] + 1

        return input_length

    return get_output_length(width), get_output_length(height)


def conv2d_bn(x,
              filters,
              kernel_size,
              strides=1,
              padding='same',
              activation='relu',
              use_bias=False,
              name=None):
    """Utility function to apply conv + BN.
    # Arguments
        x: input tensor.
        filters: filters in `Conv2D`.
        kernel_size: kernel size as in `Conv2D`.
        strides: strides in `Conv2D`.
        padding: padding mode in `Conv2D`.
        activation: activation in `Conv2D`.
        use_bias: whether to use a bias in `Conv2D`.
        name: name of the ops; will become `name + '_ac'` for the activation
            and `name + '_bn'` for the batch norm layer.
    # Returns
        Output tensor after applying `Conv2D` and `BatchNormalization`.
    """
    x = Conv2D(filters,
               kernel_size,
               strides=strides,
               padding=padding,
               use_bias=use_bias,
               name=name)(x)
    if not use_bias:
        bn_axis = 1 if K.image_data_format() == 'channels_first' else 3
        bn_name = None if name is None else name + '_bn'
        x = BatchNormalization(axis=bn_axis,
                               scale=False,
                               name=bn_name)(x)
    if activation is not None:
        ac_name = None if name is None else name + '_ac'
        x = Activation(activation, name=ac_name)(x)
    return x


def conv2d_bn_td(x,
                 filters,
                 kernel_size,
                 strides=1,
                 padding='same',
                 activation='relu',
                 use_bias=False,
                 name=None):
    """Utility function to apply conv + BN.
    # Arguments
        x: input tensor.
        filters: filters in `Conv2D`.
        kernel_size: kernel size as in `Conv2D`.
        strides: strides in `Conv2D`.
        padding: padding mode in `Conv2D`.
        activation: activation in `Conv2D`.
        use_bias: whether to use a bias in `Conv2D`.
        name: name of the ops; will become `name + '_ac'` for the activation
            and `name + '_bn'` for the batch norm layer.
    # Returns
        Output tensor after applying `Conv2D` and `BatchNormalization`.
    """
    x = TimeDistributed(Conv2D(filters,
                               kernel_size,
                               strides=strides,
                               padding=padding,
                               use_bias=use_bias),
                        name=name)(x)
    if not use_bias:
        bn_axis = 1 if K.image_data_format() == 'channels_first' else 3
        bn_name = None if name is None else name + '_bn'
        x = TimeDistributed(BatchNormalization(axis=bn_axis,
                                               scale=False),
                            name=bn_name)(x)
    if activation is not None:
        ac_name = None if name is None else name + '_ac'
        x = Activation(activation, name=ac_name)(x)
    return x


def inception_resnet_block(x, scale, block_type, block_idx, activation='relu'):
    """Adds a Inception-ResNet block.
    This function builds 3 types of Inception-ResNet blocks mentioned
    in the paper, controlled by the `block_type` argument (which is the
    block name used in the official TF-slim implementation):
        - Inception-ResNet-A: `block_type='block35'`
        - Inception-ResNet-B: `block_type='block17'`
        - Inception-ResNet-C: `block_type='block8'`
    # Arguments
        x: input tensor.
        scale: scaling factor to scale the residuals (i.e., the output of
            passing `x` through an inception module) before adding them
            to the shortcut branch.
            Let `r` be the output from the residual branch,
            the output of this block will be `x + scale * r`.
        block_type: `'block35'`, `'block17'` or `'block8'`, determines
            the network structure in the residual branch.
        block_idx: an `int` used for generating layer names.
            The Inception-ResNet blocks
            are repeated many times in this network.
            We use `block_idx` to identify
            each of the repetitions. For example,
            the first Inception-ResNet-A block
            will have `block_type='block35', block_idx=0`,
            and the layer names will have
            a common prefix `'block35_0'`.
        activation: activation function to use at the end of the block
            (see [activations](../activations.md)).
            When `activation=None`, no activation is applied
            (i.e., "linear" activation: `a(x) = x`).
    # Returns
        Output tensor for the block.
    # Raises
        ValueError: if `block_type` is not one of `'block35'`,
            `'block17'` or `'block8'`.
    """
    block_name = block_type + '_' + str(block_idx)

    if block_type == 'block35':
        branch_0 = conv2d_bn(x, 32, 1, name=block_name + '_conv1')
        branch_1 = conv2d_bn(x, 32, 1, name=block_name + '_conv2')
        branch_1 = conv2d_bn(branch_1, 32, 3, name=block_name + '_conv3')
        branch_2 = conv2d_bn(x, 32, 1, name=block_name + '_conv4')
        branch_2 = conv2d_bn(branch_2, 48, 3, name=block_name + '_conv5')
        branch_2 = conv2d_bn(branch_2, 64, 3, name=block_name + '_conv6')
        branches = [branch_0, branch_1, branch_2]
    elif block_type == 'block17':
        branch_0 = conv2d_bn(x, 192, 1, name=block_name + '_conv1')
        branch_1 = conv2d_bn(x, 128, 1, name=block_name + '_conv2')
        branch_1 = conv2d_bn(branch_1, 160, [1, 7], name=block_name + '_conv3')
        branch_1 = conv2d_bn(branch_1, 192, [7, 1], name=block_name + '_conv4')
        branches = [branch_0, branch_1]
    elif block_type == 'block8':
        branch_0 = conv2d_bn(x, 192, 1, name=block_name + '_conv1')
        branch_1 = conv2d_bn(x, 192, 1, name=block_name + '_conv2')
        branch_1 = conv2d_bn(branch_1, 224, [1, 3], name=block_name + '_conv3')
        branch_1 = conv2d_bn(branch_1, 256, [3, 1], name=block_name + '_conv4')
        branches = [branch_0, branch_1]
    else:
        raise ValueError('Unknown Inception-ResNet block type. '
                         'Expects "block35", "block17" or "block8", '
                         'but got: ' + str(block_type))

    channel_axis = 1 if K.image_data_format() == 'channels_first' else 3
    mixed = Concatenate(
        axis=channel_axis, name=block_name + '_mixed')(branches)
    up = conv2d_bn(mixed,
                   K.int_shape(x)[channel_axis],
                   1,
                   activation=None,
                   use_bias=True,
                   name=block_name + '_conv')

    x = Lambda(lambda inputs, scale: inputs[0] + inputs[1] * scale,
                 output_shape=K.int_shape(x)[1:],
                 arguments={'scale': scale},
                 name=block_name)([x, up])
    if activation is not None:
        x = Activation(activation, name=block_name + '_ac')(x)
    return x


def inception_resnet_block_td(x, scale, block_type, block_idx, activation='relu'):
    """Adds a Inception-ResNet block.
    This function builds 3 types of Inception-ResNet blocks mentioned
    in the paper, controlled by the `block_type` argument (which is the
    block name used in the official TF-slim implementation):
        - Inception-ResNet-A: `block_type='block35'`
        - Inception-ResNet-B: `block_type='block17'`
        - Inception-ResNet-C: `block_type='block8'`
    # Arguments
        x: input tensor.
        scale: scaling factor to scale the residuals (i.e., the output of
            passing `x` through an inception module) before adding them
            to the shortcut branch.
            Let `r` be the output from the residual branch,
            the output of this block will be `x + scale * r`.
        block_type: `'block35'`, `'block17'` or `'block8'`, determines
            the network structure in the residual branch.
        block_idx: an `int` used for generating layer names.
            The Inception-ResNet blocks
            are repeated many times in this network.
            We use `block_idx` to identify
            each of the repetitions. For example,
            the first Inception-ResNet-A block
            will have `block_type='block35', block_idx=0`,
            and the layer names will have
            a common prefix `'block35_0'`.
        activation: activation function to use at the end of the block
            (see [activations](../activations.md)).
            When `activation=None`, no activation is applied
            (i.e., "linear" activation: `a(x) = x`).
    # Returns
        Output tensor for the block.
    # Raises
        ValueError: if `block_type` is not one of `'block35'`,
            `'block17'` or `'block8'`.
    """
    block_name = block_type + '_' + str(block_idx)

    if block_type == 'block35':
        branch_0 = conv2d_bn_td(x, 32, 1, name=block_name + '_conv1')
        branch_1 = conv2d_bn_td(x, 32, 1, name=block_name + '_conv2')
        branch_1 = conv2d_bn_td(branch_1, 32, 3, name=block_name + '_conv3')
        branch_2 = conv2d_bn_td(x, 32, 1, name=block_name + '_conv4')
        branch_2 = conv2d_bn_td(branch_2, 48, 3, name=block_name + '_conv5')
        branch_2 = conv2d_bn_td(branch_2, 64, 3, name=block_name + '_conv6')
        branches = [branch_0, branch_1, branch_2]
    elif block_type == 'block17':
        branch_0 = conv2d_bn_td(x, 192, 1, name=block_name + '_conv1')
        branch_1 = conv2d_bn_td(x, 128, 1, name=block_name + '_conv2')
        branch_1 = conv2d_bn_td(branch_1, 160, [1, 7], name=block_name + '_conv3')
        branch_1 = conv2d_bn_td(branch_1, 192, [7, 1], name=block_name + '_conv4')
        branches = [branch_0, branch_1]
    elif block_type == 'block8':
        branch_0 = conv2d_bn_td(x, 192, 1, name=block_name + '_conv1')
        branch_1 = conv2d_bn_td(x, 192, 1, name=block_name + '_conv2')
        branch_1 = conv2d_bn_td(branch_1, 224, [1, 3], name=block_name + '_conv3')
        branch_1 = conv2d_bn_td(branch_1, 256, [3, 1], name=block_name + '_conv4')
        branches = [branch_0, branch_1]
    else:
        raise ValueError('Unknown Inception-ResNet block type. '
                         'Expects "block35", "block17" or "block8", '
                         'but got: ' + str(block_type))

    channel_axis = 1 if K.image_data_format() == 'channels_first' else 4
    mixed = Concatenate(
        axis=channel_axis, name=block_name + '_mixed')(branches)
    up = conv2d_bn_td(mixed,
                      K.int_shape(x)[channel_axis],
                      1,
                      activation=None,
                      use_bias=True,
                      name=block_name + '_conv')

    x = Lambda(lambda inputs, scale: inputs[0] + inputs[1] * scale,
               output_shape=K.int_shape(x)[1:],
               arguments={'scale': scale},
               name=block_name)([x, up])
    if activation is not None:
        x = Activation(activation, name=block_name + '_ac')(x)
    return x


def nn_base(input_tensor=None, trainable=False):

    # Determine proper input shape
    if K.image_dim_ordering() == 'th':
        input_shape = (3, None, None)
    else:
        input_shape = (None, None, 3)

    if input_tensor is None:
        img_input = Input(shape=input_shape)
    else:
        if not K.is_keras_tensor(input_tensor):
            img_input = Input(tensor=input_tensor, shape=input_shape)
        else:
            img_input = input_tensor

    if K.image_dim_ordering() == 'tf':
        bn_axis = 3
    else:
        bn_axis = 1

    # Stem block: 35 x 35 x 192
    x = conv2d_bn(img_input, 32, 3, strides=2, padding='valid', name='Stem_block' + '_conv1')
    x = conv2d_bn(x, 32, 3, padding='valid', name='Stem_block' + '_conv2')
    x = conv2d_bn(x, 64, 3, name='Stem_block' + '_conv3')
    x = MaxPooling2D(3, strides=2)(x)
    x = conv2d_bn(x, 80, 1, padding='valid', name='Stem_block' + '_conv4')
    x = conv2d_bn(x, 192, 3, padding='valid', name='Stem_block' + '_conv5')
    x = MaxPooling2D(3, strides=2)(x)

    # Mixed 5b (Inception-A block): 35 x 35 x 320
    branch_0 = conv2d_bn(x, 96, 1, name='Inception_A_block' + '_conv1')
    branch_1 = conv2d_bn(x, 48, 1, name='Inception_A_block' + '_conv2')
    branch_1 = conv2d_bn(branch_1, 64, 5, name='Inception_A_block' + '_conv3')
    branch_2 = conv2d_bn(x, 64, 1, name='Inception_A_block' + '_conv4')
    branch_2 = conv2d_bn(branch_2, 96, 3, name='Inception_A_block' + '_conv5')
    branch_2 = conv2d_bn(branch_2, 96, 3, name='Inception_A_block' + '_conv6')
    branch_pool = AveragePooling2D(3, strides=1, padding='same')(x)
    branch_pool = conv2d_bn(branch_pool, 64, 1, name='Inception_A_block' + '_conv7')
    branches = [branch_0, branch_1, branch_2, branch_pool]
    channel_axis = 1 if K.image_data_format() == 'channels_first' else 3
    x = Concatenate(axis=channel_axis, name='mixed_5b')(branches)

    # 10x block35 (Inception-ResNet-A block): 35 x 35 x 320
    for block_idx in range(1, 11):
        x = inception_resnet_block(x,
                                   scale=0.17,
                                   block_type='block35',
                                   block_idx=block_idx)

    # Mixed 6a (Reduction-A block): 17 x 17 x 1088
    branch_0 = conv2d_bn(x, 384, 3, strides=2, padding='valid', name='Reduction_A_block' + '_conv1')
    branch_1 = conv2d_bn(x, 256, 1, name='Reduction_A_block' + '_conv2')
    branch_1 = conv2d_bn(branch_1, 256, 3, name='Reduction_A_block' + '_conv3')
    branch_1 = conv2d_bn(branch_1, 384, 3, strides=2, padding='valid', name='Reduction_A_block' + '_conv4')
    branch_pool = MaxPooling2D(3, strides=2, padding='valid')(x)
    branches = [branch_0, branch_1, branch_pool]
    x = Concatenate(axis=channel_axis, name='mixed_6a')(branches)

    # 20x block17 (Inception-ResNet-B block): 17 x 17 x 1088
    for block_idx in range(1, 21):
        x = inception_resnet_block(x,
                                   scale=0.1,
                                   block_type='block17',
                                   block_idx=block_idx)

    return x


def classifier_layers(x, input_shape, trainable=False):

    # compile times on theano tend to be very high, so we use smaller ROI pooling regions to workaround
    # (hence a smaller stride in the region that follows the ROI pool)

    channel_axis = 1 if K.image_data_format() == 'channels_first' else 4

    # Mixed 7a (Reduction-B block): 8 x 8 x 2080
    branch_0 = conv2d_bn_td(x, 256, 1, name='Reduction_B_block' + '_conv1')
    branch_0 = conv2d_bn_td(branch_0, 384, 3, strides=2, padding='valid', name='Reduction_B_block' + '_conv2')
    branch_1 = conv2d_bn_td(x, 256, 1, name='Reduction_B_block' + '_conv3')
    branch_1 = conv2d_bn_td(branch_1, 288, 3, strides=2, padding='valid', name='Reduction_B_block' + '_conv4')
    branch_2 = conv2d_bn_td(x, 256, 1, name='Reduction_B_block' + '_conv5')
    branch_2 = conv2d_bn_td(branch_2, 288, 3, name='Reduction_B_block' + '_conv6')
    branch_2 = conv2d_bn_td(branch_2, 320, 3, strides=2, padding='valid', name='Reduction_B_block' + '_conv7')
    branch_pool = TimeDistributed(MaxPooling2D(3, strides=2, padding='valid'))(x)
    branches = [branch_0, branch_1, branch_2, branch_pool]
    x = Concatenate(axis=channel_axis, name='mixed_7a')(branches)

    # 10x block8 (Inception-ResNet-C block): 8 x 8 x 2080
    for block_idx in range(1, 10):
        x = inception_resnet_block_td(x,
                                      scale=0.2,
                                      block_type='block8',
                                      block_idx=block_idx)
    x = inception_resnet_block_td(x,
                                  scale=1.,
                                  activation=None,
                                  block_type='block8',
                                  block_idx=10)

    # Final convolution block: 8 x 8 x 1536
    x = conv2d_bn_td(x, 1536, 1, name='conv_7b')

    TimeDistributed(GlobalAveragePooling2D(), name='avg_pool')(x)

    return x


def rpn(base_layers, num_anchors):

    x = Conv2D(512, (3, 3), padding='same', activation='relu', kernel_initializer='normal', name='rpn_conv1')(base_layers)

    x_class = Conv2D(num_anchors, (1, 1), activation='sigmoid', kernel_initializer='uniform', name='rpn_out_class')(x)
    x_regr = Conv2D(num_anchors * 4, (1, 1), activation='linear', kernel_initializer='zero', name='rpn_out_regress')(x)

    return [x_class, x_regr, base_layers]


def classifier(base_layers, input_rois, num_rois, nb_classes=21, trainable=False):

    # compile times on theano tend to be very high, so we use smaller ROI pooling regions to workaround

    if K.backend() == 'tensorflow':
        pooling_regions = 14
        # Changed the input shape to 1088 from 1024 because of nn_base's output being 1088. Not sure if this is correct
        input_shape = (num_rois, 14, 14, 1088)
    elif K.backend() == 'theano':
        pooling_regions = 7
        input_shape = (num_rois, 1024, 7, 7)

    out_roi_pool = RoiPoolingConv(pooling_regions, num_rois)([base_layers, input_rois])
    out = classifier_layers(out_roi_pool, input_shape=input_shape, trainable=True)

    out = TimeDistributed(Flatten())(out)

    out_class = TimeDistributed(Dense(nb_classes, activation='softmax', kernel_initializer='zero'), name='dense_class_{}'.format(nb_classes))(out)
    # note: no regression target for bg class
    out_regr = TimeDistributed(Dense(4 * (nb_classes-1), activation='linear', kernel_initializer='zero'), name='dense_regress_{}'.format(nb_classes))(out)
    return [out_class, out_regr]