# coding=utf-8
# Copyright 2020 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for area attention."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl.testing import parameterized
import numpy as np
from tensor2tensor.layers import area_attention
import tensorflow.compat.v1 as tf


class AreaAttentionTest(parameterized.TestCase, tf.test.TestCase):
  """Tests for area feature computation and dot-product area attention.

  Covers:
    * `compute_area_features` for 1D (width-only) and 2D (height x width)
      area pooling: mean, std, sum, and per-area height/width metadata.
    * `dot_product_area_attention` gradient finiteness for the "mean" and
      "max" area key/value modes, with and without an attention bias mask.
  """

  def testComputeAreaFeatures1D(self):
    """Verifies all five area features for 1D areas of width 1..3."""
    # Two batch entries of 5 positions x 2 channels; the second is the
    # first shifted by 0.1 so per-area std is identical across the batch.
    features = tf.constant([[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]],
                            [[1.1, 2.1], [3.1, 4.1], [5.1, 6.1], [7.1, 8.1],
                             [9.1, 10.1]]],
                           dtype=tf.float32)
    area_mean, area_std, area_sum, area_height, area_widths = (
        area_attention.compute_area_features(features, max_area_width=3,
                                             epsilon=0.))
    with self.test_session() as session:
      session.run(tf.global_variables_initializer())
      res1, res2, res3, res4, res5 = session.run([area_mean, area_std, area_sum,
                                                  area_height, area_widths])
    # Areas are enumerated width-1 first (5 areas), then width-2 (4),
    # then width-3 (3), for 12 areas total per batch entry.
    self.assertAllClose(((((1, 2), (3, 4), (5, 6), (7, 8), (9, 10),
                           (2, 3), (4, 5), (6, 7), (8, 9),
                           (3, 4), (5, 6), (7, 8)),
                          ((1.1, 2.1), (3.1, 4.1), (5.1, 6.1), (7.1, 8.1),
                           (9.1, 10.1),
                           (2.1, 3.1), (4.1, 5.1), (6.1, 7.1), (8.1, 9.1),
                           (3.1, 4.1), (5.1, 6.1), (7.1, 8.1)))),
                        res1,
                        msg="mean_1d")
    # Width-1 areas have zero std; width-2 std is 1; width-3 std is
    # sqrt(8/3) ~= 1.63299 given the stride-2 inputs.
    expected_std = np.array([[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0],
                              [1, 1], [1, 1], [1, 1], [1, 1],
                              [1.63299, 1.63299], [1.63299, 1.63299],
                              [1.63299, 1.63299]],
                             [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0],
                              [1, 1], [1, 1], [1, 1], [1, 1],
                              [1.63299, 1.63299], [1.63299, 1.63299],
                              [1.63299, 1.63299]]])
    self.assertAllClose(expected_std, res2, atol=1e-2, msg="std_1d")
    self.assertAllClose([[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10],
                          [4, 6], [8, 10], [12, 14], [16, 18],
                          [9, 12], [15, 18], [21, 24]],
                         [[1.1, 2.1], [3.1, 4.1], [5.1, 6.1], [7.1, 8.1],
                          [9.1, 10.1],
                          [4.2, 6.2], [8.2, 10.2], [12.2, 14.2], [16.2, 18.2],
                          [9.3, 12.3], [15.3, 18.3], [21.3, 24.3]]],
                        res3,
                        msg="sum_1d")
    # In the 1D case every area has height 1.
    self.assertAllEqual([[[1], [1], [1], [1], [1],
                          [1], [1], [1], [1],
                          [1], [1], [1]],
                         [[1], [1], [1], [1], [1],
                          [1], [1], [1], [1],
                          [1], [1], [1]]],
                        res4,
                        msg="height_1d")
    self.assertAllEqual([[[1], [1], [1], [1], [1],
                          [2], [2], [2], [2],
                          [3], [3], [3]],
                         [[1], [1], [1], [1], [1],
                          [2], [2], [2], [2],
                          [3], [3], [3]]],
                        res5,
                        msg="width_1d")

  def testComputeAreaFeatures2D(self):
    """Verifies area features for 2D areas on a 2x3 grid (height=2)."""
    # 6 positions per batch entry, interpreted as a 2x3 grid via height=2.
    features = tf.constant([[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]],
                            [[1.1, 2.1], [3.1, 4.1], [5.1, 6.1], [7.1, 8.1],
                             [9.1, 10.1], [11.1, 12.1]]],
                           dtype=tf.float32)
    area_mean, area_std, area_sum, area_height, area_widths = (
        area_attention.compute_area_features(features, max_area_width=3,
                                             max_area_height=2,
                                             height=2, epsilon=0.))
    with self.test_session() as session:
      session.run(tf.global_variables_initializer())
      res1, _, res3, res4, res5 = session.run([area_mean, area_std, area_sum,
                                               area_height, area_widths])
    # Areas are enumerated by shape: 1x1 (6), 1x2 (4), 1x3 (2), 2x1 (3),
    # 2x2 (2), 2x3 (1), for 18 areas total per batch entry.
    expected_means = [[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                       [2, 3], [4, 5], [8, 9], [10, 11],
                       [3, 4], [9, 10],
                       [4, 5], [6, 7], [8, 9],
                       [5, 6], [7, 8],
                       [6, 7]],
                      [[1.1, 2.1], [3.1, 4.1], [5.1, 6.1], [7.1, 8.1],
                       [9.1, 10.1], [11.1, 12.1],
                       [2.1, 3.1], [4.1, 5.1], [8.1, 9.1], [10.1, 11.1],
                       [3.1, 4.1], [9.1, 10.1],
                       [4.1, 5.1], [6.1, 7.1], [8.1, 9.1],
                       [5.1, 6.1], [7.1, 8.1],
                       [6.1, 7.1]]]
    self.assertAllClose(expected_means, res1, msg="mean_2d")
    expected_heights = [[[1], [1], [1], [1], [1], [1],
                         # 1x2
                         [1], [1], [1], [1],
                         # 1x3
                         [1], [1],
                         # 2x1
                         [2], [2], [2],
                         # 2x2
                         [2], [2],
                         # 2x3
                         [2]],
                        [[1], [1], [1], [1], [1], [1],
                         # 1x2
                         [1], [1], [1], [1],
                         # 1x3
                         [1], [1],
                         # 2x1
                         [2], [2], [2],
                         # 2x2
                         [2], [2],
                         # 2x3
                         [2]]]
    self.assertAllEqual(expected_heights, res4, msg="height_2d")
    expected_widths = [[[1], [1], [1], [1], [1], [1],
                        # 1x2
                        [2], [2], [2], [2],
                        # 1x3
                        [3], [3],
                        # 2x1
                        [1], [1], [1],
                        # 2x2
                        [2], [2],
                        # 2x3
                        [3]],
                       [[1], [1], [1], [1], [1], [1],
                        # 1x2
                        [2], [2], [2], [2],
                        # 1x3
                        [3], [3],
                        # 2x1
                        [1], [1], [1],
                        # 2x2
                        [2], [2],
                        # 2x3
                        [3]]]
    self.assertAllEqual(expected_widths, res5, msg="width_2d")
    # Derive expected sums from the verified means and area sizes rather
    # than hand-writing another literal table.
    sizes = np.multiply(np.array(expected_heights), np.array(expected_widths))
    expected_sums = np.multiply(np.array(expected_means), sizes)
    self.assertAllClose(expected_sums, res3, msg="sum_2d")

  def testAreaMean(self):
    """Checks "mean" area-key attention produces finite gradients."""
    batch_size = 256
    feature_len = 100
    memory_height = 10
    heads = 2
    key_len = 2
    depth = 128
    max_area_height = 3
    max_area_width = 3
    queries = tf.random_uniform([batch_size, heads, key_len, depth],
                                minval=-10.0, maxval=10.0)
    features = tf.random_uniform([batch_size, heads, feature_len, depth],
                                 minval=-10.0, maxval=10.0)
    target_values = tf.random_uniform([batch_size, heads, key_len, depth],
                                      minval=-0.2, maxval=0.2)
    keys = tf.layers.dense(features, units=depth)
    values = tf.layers.dense(features, units=depth)
    mean_attention = area_attention.dot_product_area_attention(
        queries, keys, values,
        bias=None,
        area_key_mode="mean",
        name="mean_key",
        max_area_width=max_area_width,
        max_area_height=max_area_height,
        memory_height=memory_height)
    mean_gradients = tf.gradients(
        tf.reduce_mean(
            tf.pow(target_values - mean_attention, 2)), features)
    with self.test_session() as session:
      session.run(tf.global_variables_initializer())
      result = session.run([mean_gradients])
    self.assertTrue(np.all(np.isfinite(result)))

  def test2DAreaMax(self):
    """Checks 2D "max" area attention yields finite outputs and gradients."""
    batch_size = 256
    feature_len = 100
    memory_height = 10
    heads = 2
    key_len = 6
    depth = 128
    max_area_height = 3
    max_area_width = 3
    queries = tf.random_uniform([batch_size, heads, key_len, depth],
                                minval=-10.0, maxval=10.0)
    features = tf.random_uniform([batch_size, heads, feature_len, depth],
                                 minval=-10.0, maxval=10.0)
    target_values = tf.random_uniform([batch_size, heads, key_len, depth],
                                      minval=-0.2, maxval=0.2)
    keys = tf.layers.dense(features, units=depth)
    values = tf.layers.dense(features, units=depth)
    max_attention = area_attention.dot_product_area_attention(
        queries, keys, values,
        bias=None,
        area_key_mode="max",
        area_value_mode="max",
        name="max_key",
        max_area_width=max_area_width,
        max_area_height=max_area_height,
        memory_height=memory_height)
    max_gradients = tf.gradients(tf.reduce_mean(
        tf.pow(target_values - max_attention, 2)), features)
    with self.test_session() as session:
      session.run(tf.global_variables_initializer())
      result1, result2 = session.run([max_gradients, max_attention])
    self.assertTrue(np.all(np.isfinite(result1)))
    self.assertTrue(np.all(np.isfinite(result2)))

  def test1DAreaMax(self):
    """Checks 1D "max" area attention with variable-length masked inputs."""
    batch_size = 256
    feature_len = 100
    heads = 2
    key_len = 15
    depth = 128
    max_area_width = 3
    queries = tf.random_uniform([batch_size, heads, key_len, depth],
                                minval=-10.0, maxval=10.0)
    features = tf.random_uniform([batch_size, heads, feature_len, depth],
                                 minval=-10.0, maxval=10.0)
    # Random valid lengths in [max_area_width, feature_len); the last entry
    # is pinned to the full length so the unmasked path is always exercised.
    feature_length = tf.constant(
        np.concatenate(
            (np.random.randint(max_area_width, feature_len, [batch_size - 1]),
             np.array([feature_len])), axis=0), tf.int32)
    base_mask = tf.expand_dims(tf.sequence_mask(feature_length), 1)
    mask = tf.expand_dims(base_mask, 3)
    mask = tf.tile(mask, [1, heads, 1, depth])
    # Zero out features beyond each sequence's valid length.
    features = tf.where(mask, features, tf.zeros_like(features))
    # [batch, 1, 1, memory_length]
    bias_mask = tf.expand_dims(base_mask, 1)
    # Additive attention bias: 0 for valid positions, -1e9 to mask the rest.
    bias = tf.where(
        bias_mask,
        tf.zeros_like(bias_mask, tf.float32),
        tf.ones_like(bias_mask, tf.float32) * -1e9)
    target_values = tf.random_uniform([batch_size, heads, key_len, depth],
                                      minval=-0.2, maxval=0.2)
    keys = tf.layers.dense(features, units=depth)
    values = tf.layers.dense(features, units=depth)
    max_attention = area_attention.dot_product_area_attention(
        queries, keys, values,
        bias=bias,
        area_key_mode="max",
        area_value_mode="max",
        name="max_key",
        max_area_width=max_area_width)
    max_gradients = tf.gradients(
        tf.reduce_mean(
            tf.pow(target_values - max_attention, 2)), features)
    with self.test_session() as session:
      session.run(tf.global_variables_initializer())
      result1, result2 = session.run([max_gradients, max_attention])
    self.assertTrue(np.all(np.isfinite(result1)))
    self.assertTrue(np.all(np.isfinite(result2)))

if __name__ == "__main__":
  # Entry point: run the test cases via TensorFlow's test runner.
  tf.test.main()