python source code of resnet

tensor2tensor-master
- AUTHORS
- floyd.yml
- tensor2tensor
  - serving
    - export.py
    - query.py
    - __init__.py
    - README.md
    - serving_utils.py
  - utils
    - video2gif.py
    - mtf_model.py
    - multistep_with_adamoptimizer_test.py
    - avg_checkpoints.py
    - expert_utils_test.py
    - data_reader_test.py
    - test_utils.py
    - quantization.py
    - trainer_lib_test.py
    - mlperf_log.py
    - hparams_lib_test.py
    - hparam_test.py
    - usr_dir.py
    - get_rouge.py
    - sari_hook.py
    - metrics_test.py
    - learning_rate.py
    - devices.py
    - yellowfin_test.py
    - misc_utils_test.py
    - metrics.py
    - scheduled_sampling.py
    - cloud_mlengine.py
    - decoding.py
    - adv_attack_utils.py
    - sari_hook_test.py
    - test_utils_test.py
    - rouge.py
    - video_metrics.py
    - diet_test.py
    - flags.py
    - adafactor.py
    - get_ende_bleu.sh
    - diet.py
    - data_reader.py
    - rouge_test.py
    - registry.py
    - restore_hook.py
    - checkpoint_compatibility_test.py
    - hparams_lib.py
    - multistep_with_adamoptimizer.py
    - t2t_model.py
    - yellowfin.py
    - __init__.py
    - multistep_optimizer_test.py
    - registry_test.py
    - misc_utils.py
    - video_metrics_test.py
    - bleu_hook_test.py
    - mlperf_tags.py
    - optimize.py
    - partial_checkpoint_load_hook.py
    - hparam.py
    - expert_utils.py
    - pruning_utils.py
    - compute_video_metrics.py
    - optimize_test.py
    - t2t_model_test.py
    - metrics_hook_test.py
    - beam_search.py
    - contrib.py
    - beam_search_test.py
    - trainer_lib.py
    - update_ops_hook.py
    - get_cnndm_rouge.sh
    - metrics_hook.py
    - multistep_optimizer.py
    - video
      - reward_confusion.py
      - prediction2gif.py
    - bleu_hook.py
  - layers
    - common_video.py
    - discretization.py
    - latent_layers.py
    - transformer_glow_layers_ops_test.py
    - transformer_glow_layers.py
    - transformer_memory.py
    - discretization_test.py
    - transformer_glow_layers_test.py
    - transformer_memory_test.py
    - common_hparams.py
    - modalities_test.py
    - area_attention_test.py
    - message_passing_attention.py
    - latent_layers_test.py
    - transformer_glow_layers_ops.py
    - common_image_attention.py
    - common_layers.py
    - ngram.py
    - common_attention_test.py
    - common_image_attention_test.py
    - ngram_test.py
    - vqa_layers.py
    - common_video_test.py
    - __init__.py
    - vq_discrete.py
    - modalities.py
    - common_layers_test.py
    - area_attention.py
    - transformer_layers.py
    - common_audio.py
  - problems_colab.py
  - models
    - revnet_test.py
    - slicenet.py
    - mtf_image_transformer.py
    - distillation.py
    - neural_assistant.py
    - transformer.py
    - basic_test.py
    - slicenet_test.py
    - neural_gpu.py
    - basic.py
    - bytenet_test.py
    - image_transformer_2d.py
    - mtf_resnet.py
    - text_cnn.py
    - evolved_transformer.py
    - xception.py
    - neural_architecture_search
      - nas_model_test.py
      - nas_layers.py
      - __init__.py
      - README.md
      - nas_model.py
      - nas_layers_test.py
    - evolved_transformer_test.py
    - mtf_transformer_test.py
    - bytenet.py
    - image_transformer_2d_test.py
    - resnet.py
    - lstm_test.py
    - image_transformer.py
    - lstm.py
    - mtf_transformer2.py
    - mtf_image_transformer_test.py
    - xception_test.py
    - __init__.py
    - README.md
    - shake_shake.py
    - transformer_test.py
    - resnet_test.py
    - research
      - universal_transformer_util.py
      - rl.py
      - transformer_vae_flow_prior.py
      - moe_experiments.py
      - autoencoders.py
      - similarity_transformer.py
      - glow_test.py
      - transformer_moe.py
      - neural_stack_test.py
      - vqa_attention.py
      - transformer_aux.py
      - cycle_gan.py
      - multiquery_paper.py
      - autoencoders_test.py
      - gene_expression.py
      - glow_ops_test.py
      - attention_lm_moe.py
      - shuffle_network.py
      - transformer_vae_test.py
      - gene_expression_test.py
      - transformer_revnet.py
      - adafactor_experiments.py
      - transformer_nat.py
      - vqa_attention_test.py
      - glow_ops.py
      - transformer_revnet_test.py
      - transformer_vae.py
      - attention_lm.py
      - universal_transformer_test.py
      - moe.py
      - transformer_vae_flow_prior_ops.py
      - lm_experiments.py
      - transformer_symshard.py
      - super_lm.py
      - glow.py
      - glow_init_hook.py
      - transformer_sketch.py
      - __init__.py
      - residual_shuffle_exchange.py
      - universal_transformer.py
      - vqa_recurrent_self_attention.py
      - neural_stack.py
      - vqa_self_attention.py
      - aligned.py
      - transformer_parallel.py
      - transformer_aux_test.py
    - neural_gpu_test.py
    - revnet.py
    - image_transformer_test.py
    - vanilla_gan.py
    - mtf_transformer.py
    - video
      - nfg_conv3d_test.py
      - sv2p_test.py
      - savp.py
      - epva_params.py
      - emily.py
      - base_vae.py
      - nfg_conv_test.py
      - nfg_uncond_test.py
      - next_frame_glow.py
      - sv2p_params.py
      - basic_deterministic_test.py
      - emily_test.py
      - nfg_test_utils.py
      - basic_stochastic.py
      - basic_deterministic_params.py
      - basic_recurrent.py
      - sv2p.py
      - savp_params.py
      - nfg_conv_lstm_test.py
      - basic_recurrent_test.py
      - epva.py
      - savp_test.py
      - basic_stochastic_test.py
      - __init__.py
      - nfg_interpolate.py
      - basic_deterministic.py
      - base.py
      - tests_utils.py
  - rl
    - gym_utils_test.py
    - gym_utils.py
    - evaluator.py
    - trainer_model_free_tictactoe_test.py
    - evaluator_test.py
    - batch_runner_test.py
    - dopamine_connector.py
    - ppo.py
    - trainer_model_free.py
    - restarter.py
    - restarter_test.py
    - rl_utils.py
    - trainer_model_based.py
    - trainer_model_based_agent_only.py
    - batch_dqn_agent_test.py
    - trainer_model_based_params.py
    - trainer_model_free_test.py
    - player.py
    - trainer_model_based_recurrent_test.py
    - __init__.py
    - README.md
    - datagen_with_agent.py
    - player_utils.py
    - ppo_learner.py
    - trainer_model_based_sv2p_test.py
    - trainer_model_based_stochastic_test.py
    - policy_learner.py
    - trainer_model_based_test.py
    - envs
      - py_func_batch_env.py
      - simulated_batch_gym_env.py
      - tf_atari_wrappers.py
      - in_graph_batch_env.py
      - __init__.py
      - simulated_batch_env.py
  - test_data
    - example_usr_dir
      - my_submodule.py
      - __init__.py
      - requirements.txt
    - vocab.translate_ende_wmt8k.8192.subwords
    - transformer_test_ckpt
      - flags.txt
      - model.ckpt-1.index
      - hparams.json
      - checkpoint
      - model.ckpt-1.data-00000-of-00002
  - notebooks
    - asr_transformer.ipynb
    - t2t_problem.ipynb
    - Transformer_translate.ipynb
  - data_generators
    - wiki_lm.py
    - translate_test.py
    - translate_enmk.py
    - audio_encoder.py
    - snli.py
    - translate_enzh.py
    - mscoco_test.py
    - speech_recognition.py
    - conll_ner.py
    - audio_test.py
    - imdb.py
    - gym_env_test.py
    - algorithmic_math_deepmind.py
    - moving_mnist.py
    - timeseries_data_generator.py
    - desc2code.py
    - dialog_opensubtitles.py
    - function_docstring.py
    - wsj_parsing.py
    - tokenizer.py
    - cipher.py
    - gene_expression.py
    - allen_brain.py
    - wiki_multi_problems.py
    - wikitext103.py
    - mnist.py
    - celeba.py
    - librispeech.py
    - translate_ende.py
    - timeseries.py
    - translate_entn.py
    - gene_expression_test.py
    - ptb.py
    - problem_hparams.py
    - paraphrase_ms_coco_test.py
    - dna_encoder_test.py
    - dialog_abstract.py
    - translate_enid.py
    - text_encoder_test.py
    - translate_enes.py
    - lm1b.py
    - generator_utils.py
    - dna_encoder.py
    - cnn_dailymail.py
    - wnli.py
    - transduction_problems.py
    - style_transfer_test.py
    - fsns.py
    - mscoco.py
    - celeba_test.py
    - imagenet_test.py
    - babi_qa.py
    - desc2code_test.py
    - video_utils_test.py
    - vqa.py
    - text_encoder_build_subword.py
    - wiki_revision_utils.py
    - algorithmic_test.py
    - multi_problem_v2.py
    - tokenizer_test.py
    - video_generated.py
    - program_search.py
    - image_utils.py
    - mrpc.py
    - transduction_problems_test.py
    - multi_problem_v2_test.py
    - translate_enro.py
    - audio.py
    - ice_parsing.py
    - image_lsun.py
    - cleaner_en_xx.py
    - algorithmic_math.py
    - scitail.py
    - style_transfer.py
    - test_data
      - vocab-1.txt
      - vocab-2.txt
      - 1.csv
      - corpus-1.txt
      - corpus-2.txt
    - multi_problem.py
    - algorithmic.py
    - rte.py
    - allen_brain_test.py
    - lm1b_mnli.py
    - video_utils.py
    - dialog_personachat.py
    - wikifact
      - README.md
    - paraphrase_ms_coco.py
    - ocr.py
    - problem.py
    - dialog_cornell.py
    - wiki_revision.py
    - stanford_nli.py
    - subject_verb_agreement.py
    - gym_env.py
    - timeseries_data_generator_test.py
    - bair_robot_pushing.py
    - sst_binary.py
    - image_utils_test.py
    - text_problems.py
    - pointer_generator_word.py
    - translate_enet.py
    - generator_utils_test.py
    - __init__.py
    - text_problems_test.py
    - common_voice.py
    - README.md
    - google_robot_pushing.py
    - translate_ende_test.py
    - yelp_polarity.py
    - timeseries_test.py
    - text_encoder.py
    - program_search_test.py
    - ops
      - subword_text_encoder.cc
      - subword_text_encoder_ops.cc
      - testdata
        subwords
      - subword_text_encoder_test.cc
      - subword_text_encoder.h
      - pack_sequences_ops.cc
      - pack_sequences_ops_test.py
      - subword_text_encoder_ops_test.py
    - wikisum
      - delete_instances.sh
      - html.py
      - validate_data.py
      - wikisum.py
      - utils_test.py
      - parallel_launch.py
      - get_references_web_single_group.py
      - test_data
        para_good1.txt
        para_bad1.txt
      - get_references_commoncrawl.py
      - get_references_web.py
      - produce_examples.py
      - __init__.py
      - README.md
      - utils.py
      - generate_vocab.py
    - translate_envi.py
    - multinli.py
    - common_voice_test.py
    - problem_test.py
    - all_problems.py
    - algorithmic_math_test.py
    - celebahq.py
    - vqa_utils.py
    - translate.py
    - lm1b_imdb.py
    - dialog_dailydialog.py
    - inspect_tfrecord.py
    - cola.py
    - enwik8.py
    - imagenet.py
    - cifar.py
    - yelp_full.py
    - algorithmic_math_two_variables.py
    - lambada.py
    - wiki.py
    - qnli.py
    - translate_enfr.py
    - quora_qpairs.py
    - translate_encs.py
    - squad.py
  - __init__.py
  - visualization
    - visualization_test.py
    - attention.py
    - TransformerVisualization.ipynb
    - attention.js
    - __init__.py
    - visualization.py
  - bin
    - t2t-eval
    - t2t-trainer
    - t2t_trainer.py
    - t2t_translate_all.py
    - t2t-translate-all
    - t2t_prune.py
    - t2t_datagen.py
    - t2t-decoder
    - t2t-make-tf-configs
    - t2t-avg-all
    - build_vocab.py
    - make_tf_configs.py
    - t2t_avg_all.py
    - t2t_trainer_test.py
    - t2t_distill.py
    - t2t_attack.py
    - t2t-exporter
    - t2t-bleu
    - __init__.py
    - t2t_eval.py
    - t2t-insights-server
    - t2t_bleu.py
    - t2t_decoder.py
    - t2t-datagen
    - t2t-query-server
  - metrics
    - video_conditional_fvd_test.py
    - video_conditional_fvd.py
    - __init__.py
  - problems_test.py
  - problems.py
  - insights
    - query_processor.py
    - graph.py
    - insight_configuration.proto
    - server.py
    - polymer
      - explore_view
        explore-view.js
        explore-view.html
      - query_card
        query-card.js
        query-card.html
      - insights_app
        insights-app.html
        insights-app.js
      - language_selector
        language-selector-content.html
        language-selector-content.js
        language-selector.html
        language-selector.js
      - attention_visualization
        attention-visualization.js
        attention-visualization.html
      - processing_visualization
        processing-visualization.js
        processing-visualization.html
      - bower.json
      - translation_result
        translation-result.html
        translation-result.js
      - common-types.js
      - graph_visualization
        graph-visualization.html
        graph-visualization.js
      - .bowerrc
      - tensor2tensor.html
      - index.html
    - __init__.py
    - README.md
    - transformer_model.py
  - envs
    - time_step_test.py
    - mujoco_problems.py
    - env_problem_utils.py
    - trajectory.py
    - gym_env_problem.py
    - gym_env_problem_test.py
    - tic_tac_toe_env_test.py
    - tic_tac_toe_env_problem.py
    - time_step.py
    - trajectory_test.py
    - env_problem.py
    - tic_tac_toe_env.py
    - gym_spaces_utils_test.py
    - env_problem_utils_test.py
    - tic_tac_toe_env_problem_test.py
    - __init__.py
    - mujoco_problems_test.py
    - rendered_env_problem_test.py
    - rendered_env_problem.py
    - gym_spaces_utils.py
- LICENSE
- CONTRIBUTING.md
- ISSUE_TEMPLATE.md
- setup.py
- .travis.yml
- README.md
- floyd_requirements.txt
- pylintrc
- .gitignore
- docs
  - multi_problem.md
  - new_model.md
  - walkthrough.md
  - cloud_mlengine.md
  - index.md
  - overview.md
  - tutorials
    - asr_with_transformer.md
  - distributed_training.md
  - new_problem.md
  - cloud_tpu.md
- oss_scripts
  - oss_integration_test.sh
  - oss_pip_install.sh
  - oss_tests.sh
  - oss_release.sh

# coding=utf-8
# Copyright 2020 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Resnets."""
# Copied from cloud_tpu/models/resnet/resnet_model.py and modified

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensor2tensor.layers import common_hparams
from tensor2tensor.layers import common_layers
from tensor2tensor.utils import hparam
from tensor2tensor.utils import registry
from tensor2tensor.utils import t2t_model

import tensorflow.compat.v1 as tf


BATCH_NORM_DECAY = 0.9
BATCH_NORM_EPSILON = 1e-5


# TODO(lukaszkaiser): remove or simplify after V2 work is done.
def layers():
  return common_layers.layers()


def batch_norm_relu(inputs,
                    is_training,
                    relu=True,
                    init_zero=False,
                    data_format="channels_first"):
  """Performs a batch normalization followed by a ReLU.

  Args:
    inputs: `Tensor` of shape `[batch, channels, ...]`.
    is_training: `bool` for whether the model is training.
    relu: `bool` if False, omits the ReLU operation.
    init_zero: `bool` if True, initializes scale parameter of batch
        normalization with 0 instead of 1 (default).
    data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last for `[batch, height, width, channels]`.

  Returns:
    A normalized `Tensor` with the same `data_format`.
  """
  if init_zero:
    gamma_initializer = tf.zeros_initializer()
  else:
    gamma_initializer = tf.ones_initializer()

  if data_format == "channels_first":
    axis = 1
  else:
    axis = 3

  inputs = layers().BatchNormalization(
      axis=axis,
      momentum=BATCH_NORM_DECAY,
      epsilon=BATCH_NORM_EPSILON,
      center=True,
      scale=True,
      fused=True,
      gamma_initializer=gamma_initializer)(inputs, training=is_training)

  if relu:
    inputs = tf.nn.relu(inputs)
  return inputs


def fixed_padding(inputs, kernel_size, data_format="channels_first"):
  """Pads the input along the spatial dimensions independently of input size.

  Args:
    inputs: `Tensor` of size `[batch, channels, height, width]` or
        `[batch, height, width, channels]` depending on `data_format`.
    kernel_size: `int` kernel size to be used for `conv2d` or max_pool2d`
        operations. Should be a positive integer.
    data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last for `[batch, height, width, channels]`.

  Returns:
    A padded `Tensor` of the same `data_format` with size either intact
    (if `kernel_size == 1`) or padded (if `kernel_size > 1`).
  """
  pad_total = kernel_size - 1
  pad_beg = pad_total // 2
  pad_end = pad_total - pad_beg
  if data_format == "channels_first":
    padded_inputs = tf.pad(
        inputs, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]])
  else:
    padded_inputs = tf.pad(
        inputs, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])

  return padded_inputs


def conv2d_fixed_padding(inputs,
                         filters,
                         kernel_size,
                         strides,
                         data_format="channels_first",
                         use_td=False,
                         targeting_rate=None,
                         keep_prob=None,
                         is_training=None):
  """Strided 2-D convolution with explicit padding.

  The padding is consistent and is based only on `kernel_size`, not on the
  dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).

  Args:
    inputs: `Tensor` of size `[batch, channels, height_in, width_in]`.
    filters: `int` number of filters in the convolution.
    kernel_size: `int` size of the kernel to be used in the convolution.
    strides: `int` strides of the convolution.
    data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last for `[batch, height, width, channels]`.
    use_td: `str` one of "weight" or "unit". Set to False or "" to disable
      targeted dropout.
    targeting_rate: `float` proportion of weights to target with targeted
      dropout.
    keep_prob: `float` keep probability for targeted dropout.
    is_training: `bool` for whether the model is in training.

  Returns:
    A `Tensor` of shape `[batch, filters, height_out, width_out]`.

  Raises:
    Exception: if use_td is not valid.
  """
  if strides > 1:
    inputs = fixed_padding(inputs, kernel_size, data_format=data_format)

  if use_td:
    inputs_shape = common_layers.shape_list(inputs)
    if use_td == "weight":
      if data_format == "channels_last":
        size = kernel_size * kernel_size * inputs_shape[-1]
      else:
        size = kernel_size * kernel_size * inputs_shape[1]
      targeting_count = targeting_rate * tf.to_float(size)
      targeting_fn = common_layers.weight_targeting
    elif use_td == "unit":
      targeting_count = targeting_rate * filters
      targeting_fn = common_layers.unit_targeting
    else:
      raise Exception("Unrecognized targeted dropout type: %s" % use_td)

    y = common_layers.td_conv(
        inputs,
        filters,
        kernel_size,
        targeting_count,
        targeting_fn,
        keep_prob,
        is_training,
        do_prune=True,
        strides=strides,
        padding=("SAME" if strides == 1 else "VALID"),
        data_format=data_format,
        use_bias=False,
        kernel_initializer=tf.variance_scaling_initializer())
  else:
    y = layers().Conv2D(
        filters=filters,
        kernel_size=kernel_size,
        strides=strides,
        padding=("SAME" if strides == 1 else "VALID"),
        use_bias=False,
        kernel_initializer=tf.variance_scaling_initializer(),
        data_format=data_format)(inputs)

  return y


def residual_block(inputs,
                   filters,
                   is_training,
                   projection_shortcut,
                   strides,
                   final_block,
                   data_format="channels_first",
                   use_td=False,
                   targeting_rate=None,
                   keep_prob=None,
                   bottleneck_ratio=None):
  """Standard building block for residual networks with BN before convolutions.

  Args:
    inputs: `Tensor` of size `[batch, channels, height, width]`.
    filters: `int` number of filters for the first two convolutions. Note that
        the third and final convolution will use 4 times as many filters.
    is_training: `bool` for whether the model is in training.
    projection_shortcut: `function` to use for projection shortcuts (typically
        a 1x1 convolution to match the filter dimensions). If None, no
        projection is used and the input is passed as unchanged through the
        shortcut connection.
    strides: `int` block stride. If greater than 1, this block will ultimately
        downsample the input.
    final_block: unused parameter to keep the same function signature as
        `bottleneck_block`.
    data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last for `[batch, height, width, channels]`.
    use_td: `str` one of "weight" or "unit". Set to False or "" to disable
      targeted dropout.
    targeting_rate: `float` proportion of weights to target with targeted
      dropout.
    keep_prob: `float` keep probability for targeted dropout.
    bottleneck_ratio: unused parameter to keep the same function signature as
        `bottleneck_block`.

  Returns:
    The output `Tensor` of the block.
  """
  del final_block
  del bottleneck_ratio
  shortcut = inputs
  inputs = batch_norm_relu(inputs, is_training, data_format=data_format)

  if projection_shortcut is not None:
    shortcut = projection_shortcut(inputs)

  inputs = conv2d_fixed_padding(
      inputs=inputs,
      filters=filters,
      kernel_size=3,
      strides=strides,
      data_format=data_format,
      use_td=use_td,
      targeting_rate=targeting_rate,
      keep_prob=keep_prob,
      is_training=is_training)

  inputs = batch_norm_relu(inputs, is_training, data_format=data_format)
  inputs = conv2d_fixed_padding(
      inputs=inputs,
      filters=filters,
      kernel_size=3,
      strides=1,
      data_format=data_format,
      use_td=use_td,
      targeting_rate=targeting_rate,
      keep_prob=keep_prob,
      is_training=is_training)

  return inputs + shortcut


def bottleneck_block(inputs,
                     filters,
                     is_training,
                     projection_shortcut,
                     strides,
                     final_block,
                     data_format="channels_first",
                     use_td=False,
                     targeting_rate=None,
                     keep_prob=None,
                     bottleneck_ratio=4):
  """Bottleneck block variant for residual networks with BN after convolutions.

  Args:
    inputs: `Tensor` of size `[batch, channels, height, width]`.
    filters: `int` number of filters for the first two convolutions. Note that
        the third and final convolution will use 4 times as many filters.
    is_training: `bool` for whether the model is in training.
    projection_shortcut: `function` to use for projection shortcuts (typically
        a 1x1 convolution to match the filter dimensions). If None, no
        projection is used and the input is passed as unchanged through the
        shortcut connection.
    strides: `int` block stride. If greater than 1, this block will ultimately
        downsample the input.
    final_block: `bool` set to True if it is this the final block in the group.
        This is changes the behavior of batch normalization initialization for
        the final batch norm in a block.
    data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last for `[batch, height, width, channels]`.
    use_td: `str` one of "weight" or "unit". Set to False or "" to disable
      targeted dropout.
    targeting_rate: `float` proportion of weights to target with targeted
      dropout.
    keep_prob: `float` keep probability for targeted dropout.
    bottleneck_ratio: `int`, how much we scale up filters.


  Returns:
    The output `Tensor` of the block.
  """
  # TODO(chrisying): this block is technically the post-activation resnet-v1
  # bottleneck unit. Test with v2 (pre-activation) and replace if there is no
  # difference for consistency.
  shortcut = inputs
  if projection_shortcut is not None:
    shortcut = projection_shortcut(inputs)

  inputs = conv2d_fixed_padding(
      inputs=inputs,
      filters=filters,
      kernel_size=1,
      strides=1,
      data_format=data_format,
      use_td=use_td,
      targeting_rate=targeting_rate,
      keep_prob=keep_prob,
      is_training=is_training)

  inputs = batch_norm_relu(inputs, is_training, data_format=data_format)
  inputs = conv2d_fixed_padding(
      inputs=inputs,
      filters=filters,
      kernel_size=3,
      strides=strides,
      data_format=data_format,
      use_td=use_td,
      targeting_rate=targeting_rate,
      keep_prob=keep_prob,
      is_training=is_training)

  inputs = batch_norm_relu(inputs, is_training, data_format=data_format)
  inputs = conv2d_fixed_padding(
      inputs=inputs,
      filters=bottleneck_ratio * filters,
      kernel_size=1,
      strides=1,
      data_format=data_format,
      use_td=use_td,
      targeting_rate=targeting_rate,
      keep_prob=keep_prob,
      is_training=is_training)
  inputs = batch_norm_relu(
      inputs,
      is_training,
      relu=False,
      init_zero=final_block,
      data_format=data_format)

  return tf.nn.relu(inputs + shortcut)


def block_layer(inputs,
                filters,
                block_fn,
                blocks,
                strides,
                is_training,
                name,
                data_format="channels_first",
                use_td=False,
                targeting_rate=None,
                keep_prob=None,
                bottleneck_ratio=4):
  """Creates one layer of blocks for the ResNet model.

  Args:
    inputs: `Tensor` of size `[batch, channels, height, width]`.
    filters: `int` number of filters for the first convolution of the layer.
    block_fn: `function` for the block to use within the model
    blocks: `int` number of blocks contained in the layer.
    strides: `int` stride to use for the first convolution of the layer. If
        greater than 1, this layer will downsample the input.
    is_training: `bool` for whether the model is training.
    name: `str`name for the Tensor output of the block layer.
    data_format: `str` either "channels_first" for `[batch, channels, height,
        width]` or "channels_last for `[batch, height, width, channels]`.
    use_td: `str` one of "weight" or "unit". Set to False or "" to disable
      targeted dropout.
    targeting_rate: `float` proportion of weights to target with targeted
      dropout.
    keep_prob: `float` keep probability for targeted dropout.
    bottleneck_ratio: `int`, how much we scale up filters in bottleneck block.

  Returns:
    The output `Tensor` of the block layer.
  """
  # Bottleneck blocks end with bottleneck_ratio x the number of filters
  filters_out = filters
  if block_fn is bottleneck_block:
    filters_out = bottleneck_ratio * filters

  def projection_shortcut(inputs):
    """Project identity branch."""
    inputs = conv2d_fixed_padding(
        inputs=inputs,
        filters=filters_out,
        kernel_size=1,
        strides=strides,
        data_format=data_format,
        use_td=use_td,
        targeting_rate=targeting_rate,
        keep_prob=keep_prob,
        is_training=is_training)
    return batch_norm_relu(
        inputs, is_training, relu=False, data_format=data_format)

  # Only the first block per block_layer uses projection_shortcut and strides
  inputs = block_fn(
      inputs,
      filters,
      is_training,
      projection_shortcut,
      strides,
      False,
      data_format,
      use_td=use_td,
      targeting_rate=targeting_rate,
      keep_prob=keep_prob,
      bottleneck_ratio=bottleneck_ratio)

  for i in range(1, blocks):
    inputs = block_fn(
        inputs,
        filters,
        is_training,
        None,
        1, (i + 1 == blocks),
        data_format,
        use_td=use_td,
        targeting_rate=targeting_rate,
        keep_prob=keep_prob,
        bottleneck_ratio=bottleneck_ratio)

  return tf.identity(inputs, name)


def resnet_v2(inputs,
              block_fn,
              layer_blocks,
              filters,
              data_format="channels_first",
              is_training=False,
              is_cifar=False,
              use_td=False,
              targeting_rate=None,
              keep_prob=None,
              bottleneck_ratios=None):
  """Resnet model.

  Args:
    inputs: `Tensor` images.
    block_fn: `function` for the block to use within the model. Either
        `residual_block` or `bottleneck_block`.
    layer_blocks: list of 3 or 4 `int`s denoting the number of blocks to include
      in each of the 3 or 4 block groups. Each group consists of blocks that
      take inputs of the same resolution.
    filters: list of 4 or 5 `int`s denoting the number of filter to include in
      block.
    data_format: `str`, "channels_first" `[batch, channels, height,
        width]` or "channels_last" `[batch, height, width, channels]`.
    is_training: bool, build in training mode or not.
    is_cifar: bool, whether the data is CIFAR or not.
    use_td: `str` one of "weight" or "unit". Set to False or "" to disable
      targeted dropout.
    targeting_rate: `float` proportion of weights to target with targeted
      dropout.
    keep_prob: `float` keep probability for targeted dropout.
    bottleneck_ratios: list of `int`s, how much we scale up filters in
      bottleneck blocks.

  Returns:
    Pre-logit activations.
  """
  inputs = block_layer(
      inputs=inputs,
      filters=filters[1],
      block_fn=block_fn,
      blocks=layer_blocks[0],
      strides=1,
      is_training=is_training,
      name="block_layer1",
      data_format=data_format,
      use_td=use_td,
      targeting_rate=targeting_rate,
      keep_prob=keep_prob,
      bottleneck_ratio=bottleneck_ratios[0])
  inputs = block_layer(
      inputs=inputs,
      filters=filters[2],
      block_fn=block_fn,
      blocks=layer_blocks[1],
      strides=2,
      is_training=is_training,
      name="block_layer2",
      data_format=data_format,
      use_td=use_td,
      targeting_rate=targeting_rate,
      keep_prob=keep_prob,
      bottleneck_ratio=bottleneck_ratios[1])
  inputs = block_layer(
      inputs=inputs,
      filters=filters[3],
      block_fn=block_fn,
      blocks=layer_blocks[2],
      strides=2,
      is_training=is_training,
      name="block_layer3",
      data_format=data_format,
      use_td=use_td,
      targeting_rate=targeting_rate,
      keep_prob=keep_prob,
      bottleneck_ratio=bottleneck_ratios[2])
  if not is_cifar:
    inputs = block_layer(
        inputs=inputs,
        filters=filters[4],
        block_fn=block_fn,
        blocks=layer_blocks[3],
        strides=2,
        is_training=is_training,
        name="block_layer4",
        data_format=data_format,
        use_td=use_td,
        targeting_rate=targeting_rate,
        keep_prob=keep_prob,
        bottleneck_ratio=bottleneck_ratios[3])

  return inputs


@registry.register_model
class Resnet(t2t_model.T2TModel):
  """Residual Network."""

  def body(self, features):
    hp = self.hparams
    block_fns = {
        "residual": residual_block,
        "bottleneck": bottleneck_block,
    }
    assert hp.block_fn in block_fns
    is_training = hp.mode == tf.estimator.ModeKeys.TRAIN
    if is_training:
      targets = features["targets_raw"]

    inputs = features["inputs"]

    data_format = "channels_last"
    if hp.use_nchw:
      # Convert from channels_last (NHWC) to channels_first (NCHW). This
      # provides a large performance boost on GPU.
      inputs = tf.transpose(inputs, [0, 3, 1, 2])
      data_format = "channels_first"

    inputs = conv2d_fixed_padding(
        inputs=inputs,
        filters=hp.filter_sizes[0],
        kernel_size=7,
        strides=1 if hp.is_cifar else 2,
        data_format=data_format)
    inputs = tf.identity(inputs, "initial_conv")
    inputs = batch_norm_relu(inputs, is_training, data_format=data_format)

    if not hp.is_cifar:
      inputs = layers().MaxPooling2D(
          pool_size=3,
          strides=2,
          padding="SAME",
          data_format=data_format)(inputs)
      inputs = tf.identity(inputs, "initial_max_pool")

    out = resnet_v2(
        inputs,
        block_fns[hp.block_fn],
        hp.layer_sizes,
        hp.filter_sizes,
        data_format,
        is_training=is_training,
        is_cifar=hp.is_cifar,
        use_td=hp.use_td,
        targeting_rate=hp.targeting_rate,
        keep_prob=hp.keep_prob,
        bottleneck_ratios=hp.bottleneck_ratios)

    if hp.use_nchw:
      out = tf.transpose(out, [0, 2, 3, 1])

    if not hp.is_cifar:
      return out

    out = tf.reduce_mean(out, [1, 2])
    num_classes = self._problem_hparams.vocab_size["targets"]
    if hasattr(self._hparams, "vocab_divisor"):
      num_classes += (-num_classes) % self._hparams.vocab_divisor
    logits = layers().Dense(num_classes, name="logits")(out)

    losses = {"training": 0.0}
    if is_training:
      loss = tf.losses.sparse_softmax_cross_entropy(
          labels=tf.squeeze(targets), logits=logits)
      loss = tf.reduce_mean(loss)

      losses = {"training": loss}

    logits = tf.reshape(logits, [-1, 1, 1, 1, logits.shape[1]])

    return logits, losses

  def infer(self,
            features=None,
            decode_length=50,
            beam_size=1,
            top_beams=1,
            alpha=0.0,
            use_tpu=False):
    """Predict."""
    del decode_length, beam_size, top_beams, alpha, use_tpu
    assert features is not None
    logits, _ = self(features)  # pylint: disable=not-callable
    assert len(logits.get_shape()) == 5
    logits = tf.squeeze(logits, [1, 2, 3])
    log_probs = common_layers.log_prob_from_logits(logits)
    predictions, scores = common_layers.argmax_with_score(log_probs)
    return {
        "outputs": predictions,
        "scores": scores,
    }


def resnet_base():
  """Set of hyperparameters."""
  # For imagenet on TPU:
  # Set train_steps=120000
  # Set eval_steps=48

  # Base
  hparams = common_hparams.basic_params1()

  # Model-specific parameters
  hparams.add_hparam("layer_sizes", [3, 4, 6, 3])
  hparams.add_hparam("bottleneck_ratios", [4, 4, 4, 4])
  hparams.add_hparam("filter_sizes", [64, 64, 128, 256, 512])
  hparams.add_hparam("block_fn", "bottleneck")
  hparams.add_hparam("use_nchw", True)
  hparams.add_hparam("is_cifar", False)

  # Targeted dropout
  hparams.add_hparam("use_td", False)
  hparams.add_hparam("targeting_rate", None)
  hparams.add_hparam("keep_prob", None)

  # Variable init
  hparams.initializer = "normal_unit_scaling"
  hparams.initializer_gain = 2.

  # Optimization
  hparams.optimizer = "Momentum"
  hparams.optimizer_momentum_momentum = 0.9
  hparams.optimizer_momentum_nesterov = True
  hparams.weight_decay = 1e-4
  hparams.clip_grad_norm = 0.0
  # (base_lr=0.1) * (batch_size=128*8 (on TPU, or 8 GPUs)=1024) / (256.)
  hparams.learning_rate = 0.4
  hparams.learning_rate_decay_scheme = "cosine"
  # For image_imagenet224, 120k training steps, which effectively makes this a
  # cosine decay (i.e. no cycles).
  hparams.learning_rate_cosine_cycle_steps = 120000

  hparams.batch_size = 128
  return hparams


@registry.register_hparams
def resnet_50():
  hp = resnet_base()
  return hp


@registry.register_hparams
def resnet_18():
  hp = resnet_base()
  hp.block_fn = "residual"
  hp.layer_sizes = [2, 2, 2, 2]
  return hp


@registry.register_hparams
def resnet_imagenet_34():
  """Set of hyperparameters."""
  hp = resnet_base()
  hp.block_fn = "residual"
  hp.layer_sizes = [2, 4, 8, 2]

  return hp


@registry.register_hparams
def resnet_imagenet_34_td_weight_05_05():
  """Set of hyperparameters."""
  hp = resnet_imagenet_34()
  hp.use_td = "weight"
  hp.targeting_rate = 0.5
  hp.keep_prob = 0.5

  return hp


@registry.register_hparams
def resnet_imagenet_34_td_unit_05_05():
  """Set of hyperparameters."""
  hp = resnet_imagenet_34()
  hp.use_td = "unit"
  hp.targeting_rate = 0.5
  hp.keep_prob = 0.5

  return hp


@registry.register_hparams
def resnet_imagenet_34_td_unit_no_drop():
  """Set of hyperparameters."""
  hp = resnet_imagenet_34()
  hp.use_td = "unit"
  hp.targeting_rate = 0.0
  hp.keep_prob = 1.0

  return hp


@registry.register_hparams
def resnet_imagenet_102():
  hp = resnet_imagenet_34()
  hp.layer_sizes = [3, 8, 36, 3]
  return hp


@registry.register_hparams
def resnet_cifar_15():
  """Set of hyperparameters."""
  hp = resnet_base()
  hp.block_fn = "residual"
  hp.is_cifar = True
  hp.layer_sizes = [2, 2, 2]
  hp.filter_sizes = [16, 32, 64, 128]

  return hp


@registry.register_hparams
def resnet_cifar_32():
  hp = resnet_cifar_15()
  hp.layer_sizes = [5, 5, 5]
  return hp


@registry.register_hparams
def resnet_cifar_32_td_weight_05_05():
  hp = resnet_cifar_32()
  hp.use_td = "weight"
  hp.targeting_rate = 0.5
  hp.keep_prob = 0.5
  return hp


@registry.register_hparams
def resnet_cifar_32_td_unit_05_05():
  hp = resnet_cifar_32()
  hp.use_td = "unit"
  hp.targeting_rate = 0.5
  hp.keep_prob = 0.5
  return hp


@registry.register_hparams
def resnet_cifar_32_td_unit_no_drop():
  hp = resnet_cifar_32()
  hp.use_td = "unit"
  hp.targeting_rate = 0.0
  hp.keep_prob = 1.0
  return hp


@registry.register_hparams
def resnet_34():
  hp = resnet_base()
  hp.block_fn = "residual"
  return hp


@registry.register_hparams
def resnet_101():
  hp = resnet_base()
  hp.layer_sizes = [3, 4, 23, 3]
  return hp


@registry.register_hparams
def resnet_152():
  hp = resnet_base()
  hp.layer_sizes = [3, 8, 36, 3]
  return hp


@registry.register_hparams
def resnet_200():
  hp = resnet_base()
  hp.layer_sizes = [3, 24, 36, 3]
  return hp


# Pruning parameters
@registry.register_pruning_params
def resnet_weight():
  hp = hparam.HParams()
  hp.add_hparam("strategy", "weight")
  hp.add_hparam("black_list", ["logits", "bias"])
  hp.add_hparam("white_list", ["td_conv"])
  hp.add_hparam("sparsities", [0.1 * i for i in range(10)])
  return hp


@registry.register_pruning_params
def resnet_unit():
  hp = resnet_weight()
  hp.strategy = "unit"
  return hp


# Adversarial attack parameters
@registry.register_attack_params
def resnet_fgsm():
  aparams = hparam.HParams()
  aparams.attack = "fgsm"
  aparams.epsilon_name = "eps"
  aparams.attack_epsilons = [i * 0.8 for i in range(20)]
  aparams.add_hparam("clip_min", 0.0)
  aparams.add_hparam("clip_max", 255.0)
  return aparams


@registry.register_attack_params
def resnet_madry():
  aparams = resnet_fgsm()
  aparams.attack = "madry"
  aparams.add_hparam("nb_iter", 40)
  aparams.add_hparam("eps_iter", 1.0)
  return aparams


@registry.register_attack_params
def resnet_random():
  aparams = resnet_fgsm()
  aparams.attack = "random"
  aparams.epsilon_name = "eps"
  aparams.add_hparam("num_samples", 10)
  aparams.add_hparam("num_batches", 100)
  return aparams