# coding=utf-8
# Copyright 2018 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Common classes for automatic speech recognition (ASR) datasets.

The audio import uses sox to generate normalized waveforms; please install
it as appropriate (e.g. using apt-get or yum).
"""

import functools
import os
from subprocess import call
import tempfile

import numpy as np
from scipy.io import wavfile
import scipy.signal

from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_encoder
from tensor2tensor.layers import common_attention
from tensor2tensor.layers import common_layers
from tensor2tensor.utils import metrics
from tensor2tensor.utils import modality
from tensor2tensor.utils import registry

import tensorflow as tf


#
# ASR Feature pipeline in TF.
#
def add_delta_deltas(filterbanks, name=None):
  """Compute first- and second-order time-derivative channels.

  Args:
    filterbanks: float32 tensor with shape [batch_size, len, num_bins, 1]
    name: scope name

  Returns:
    float32 tensor with shape [batch_size, len, num_bins, 3]
  """
  delta_filter = np.array([2, 1, 0, -1, -2])
  delta_delta_filter = scipy.signal.convolve(delta_filter, delta_filter, "full")

  # Stack an identity impulse, the delta filter and the delta-delta filter
  # into a single depthwise conv kernel of shape [9, 1, 1, 3].
  delta_filter_stack = np.array(
      [[0] * 4 + [1] + [0] * 4, [0] * 2 + list(delta_filter) + [0] * 2,
       list(delta_delta_filter)],
      dtype=np.float32).T[:, None, None, :]

  # Normalize each filter to unit L2 norm.
  delta_filter_stack /= np.sqrt(
      np.sum(delta_filter_stack**2, axis=0, keepdims=True))

  filterbanks = tf.nn.conv2d(
      filterbanks, delta_filter_stack, [1, 1, 1, 1], "SAME",
      data_format="NHWC", name=name)
  return filterbanks


def compute_mel_filterbank_features(
    waveforms,
    sample_rate=16000, dither=1.0 / np.iinfo(np.int16).max, preemphasis=0.97,
    frame_length=25, frame_step=10, fft_length=None,
    window_fn=functools.partial(tf.contrib.signal.hann_window, periodic=True),
    lower_edge_hertz=80.0, upper_edge_hertz=7600.0, num_mel_bins=80,
    log_noise_floor=1e-3, apply_mask=True):
  """Implement mel-filterbank extraction using tf ops.

  Args:
    waveforms: float32 tensor with shape [batch_size, max_len]
    sample_rate: sampling rate of the waveform
    dither: stddev of Gaussian noise added to the waveform to prevent
      quantization artefacts
    preemphasis: waveform high-pass filtering constant
    frame_length: frame length in ms
    frame_step: frame step in ms
    fft_length: number of fft bins
    window_fn: windowing function
    lower_edge_hertz: lowest frequency of the filterbank
    upper_edge_hertz: highest frequency of the filterbank
    num_mel_bins: filterbank size
    log_noise_floor: clip small values to prevent numeric overflow in log
    apply_mask: when working on a batch of samples, set padding frames to zero

  Returns:
    filterbanks: a float32 tensor with shape [batch_size, len, num_bins, 1]
  """
  # `stfts` is a complex64 Tensor representing the short-time Fourier
  # Transform of each signal in `signals`. Its shape is
  # [batch_size, ?, fft_unique_bins]
  # where fft_unique_bins = fft_length // 2 + 1.
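  # For the defaults (16 kHz audio, 25 ms frames), for example, this gives
  # frame_length = 400 samples, fft_length = 2**ceil(log2(400)) = 512, and
  # hence fft_unique_bins = 257.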

  # Find the wave length: the largest index for which the value is != 0
  # (note that waveform samples that are exactly 0.0 are quite common, so
  # simply doing sum(waveforms != 0, axis=-1) will not work correctly).
  wav_lens = tf.reduce_max(
      tf.expand_dims(tf.range(tf.shape(waveforms)[1]), 0) *
      tf.to_int32(tf.not_equal(waveforms, 0.0)),
      axis=-1) + 1
  if dither > 0:
    waveforms += tf.random_normal(tf.shape(waveforms), stddev=dither)
  if preemphasis > 0:
    waveforms = waveforms[:, 1:] - preemphasis * waveforms[:, :-1]
    wav_lens -= 1
  # Convert frame sizes from milliseconds to samples.
  frame_length = int(frame_length * sample_rate / 1e3)
  frame_step = int(frame_step * sample_rate / 1e3)
  if fft_length is None:
    fft_length = int(2**(np.ceil(np.log2(frame_length))))

  stfts = tf.contrib.signal.stft(
      waveforms,
      frame_length=frame_length,
      frame_step=frame_step,
      fft_length=fft_length,
      window_fn=window_fn,
      pad_end=True)

  stft_lens = (wav_lens + (frame_step - 1)) // frame_step
  masks = tf.to_float(tf.less_equal(
      tf.expand_dims(tf.range(tf.shape(stfts)[1]), 0),
      tf.expand_dims(stft_lens, 1)))

  # An energy spectrogram is the magnitude of the complex-valued STFT.
  # A float32 Tensor of shape [batch_size, ?, fft_unique_bins] (257 for the
  # default fft_length of 512).
  magnitude_spectrograms = tf.abs(stfts)

  # Warp the linear-scale magnitude spectrograms into the mel scale.
  num_spectrogram_bins = magnitude_spectrograms.shape[-1].value
  linear_to_mel_weight_matrix = (
      tf.contrib.signal.linear_to_mel_weight_matrix(
          num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,
          upper_edge_hertz))
  mel_spectrograms = tf.tensordot(
      magnitude_spectrograms, linear_to_mel_weight_matrix, 1)
  # Note: Shape inference for tensordot does not currently handle this case.
  mel_spectrograms.set_shape(magnitude_spectrograms.shape[:-1].concatenate(
      linear_to_mel_weight_matrix.shape[-1:]))

  log_mel_sgram = tf.log(tf.maximum(log_noise_floor, mel_spectrograms))

  if apply_mask:
    # `masks` is already float32, so no further cast is needed.
    log_mel_sgram *= tf.expand_dims(masks, -1)

  return tf.expand_dims(log_mel_sgram, -1, name="mel_sgrams")


#
# Audio problem definition
#
class AudioEncoder(object):
  """Encoder class for saving and loading waveforms."""

  def __init__(self, num_reserved_ids=0, sample_rate=16000):
    assert num_reserved_ids == 0
    self._sample_rate = sample_rate

  @property
  def num_reserved_ids(self):
    return 0

  def encode(self, s):
    """Transform a string with a filename into a list of float32.

    Args:
      s: path to the file with a waveform.

    Returns:
      samples: list of float32s
    """
    # Make sure that the data is a single-channel, 16-bit, 16 kHz wave.
    # TODO(chorowski): the directory may not be writable, this should fallback
    # to a temp path, and provide instructions for installing sox.
    if s.endswith(".mp3"):
      # TODO(dliebling) On Linux, check if libsox-fmt-mp3 is installed.
      out_filepath = s[:-4] + ".wav"
      call([
          "sox", "--guard", s, "-r", "16k", "-b", "16", "-c", "1", out_filepath
      ])
      s = out_filepath
    elif not s.endswith(".wav"):
      out_filepath = s + ".wav"
      if not os.path.exists(out_filepath):
        call(["sox", "-r", "16k", "-b", "16", "-c", "1", s, out_filepath])
      s = out_filepath
    rate, data = wavfile.read(s)
    assert rate == self._sample_rate
    assert len(data.shape) == 1
    # Rescale integer PCM samples to floats in [-1, 1].
    if data.dtype not in [np.float32, np.float64]:
      data = data.astype(np.float32) / np.iinfo(data.dtype).max
    return data.tolist()
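
  # Usage sketch (the path below is purely illustrative):
  #   encoder = AudioEncoder()
  #   samples = encoder.encode("/tmp/clip.mp3")  # sox converts to 16 kHz wav
  #   wav_path = encoder.decode(samples)         # writes a temporary wav file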

  def decode(self, ids):
    """Transform a sequence of float32 samples into a waveform.

    Args:
      ids: list of float32 samples to be converted.

    Returns:
      Path to the temporary file where the waveform was saved.
    """
    _, tmp_file_path = tempfile.mkstemp()
    wavfile.write(tmp_file_path, self._sample_rate, np.asarray(ids))
    return tmp_file_path

  def decode_list(self, ids):
    """Transform a sequence of float32 samples into a wav file.

    Args:
      ids: list of float32 samples to be converted.

    Returns:
      Singleton list: path to the temporary file where the wavfile was saved.
    """
    return [self.decode(ids)]

  @property
  def vocab_size(self):
    return 256


class ByteTextEncoderWithEos(text_encoder.ByteTextEncoder):
  """Encodes each byte to an id and appends the EOS token."""

  def encode(self, s):
    return super(ByteTextEncoderWithEos, self).encode(s) + [text_encoder.EOS_ID]


class SpeechRecognitionProblem(problem.Problem):
  """Base class for speech recognition problems."""

  def hparams(self, defaults, model_hparams):
    p = model_hparams
    # Filterbank extraction in bottom instead of preprocess_example is faster.
    p.add_hparam("audio_preproc_in_bottom", False)
    # The trainer seems to reserve memory for all members of the input dict.
    p.add_hparam("audio_keep_example_waveforms", False)
    p.add_hparam("audio_sample_rate", 16000)
    p.add_hparam("audio_preemphasis", 0.97)
    p.add_hparam("audio_dither", 1.0 / np.iinfo(np.int16).max)
    p.add_hparam("audio_frame_length", 25.0)
    p.add_hparam("audio_frame_step", 10.0)
    p.add_hparam("audio_lower_edge_hertz", 20.0)
    p.add_hparam("audio_upper_edge_hertz", 8000.0)
    p.add_hparam("audio_num_mel_bins", 80)
    p.add_hparam("audio_add_delta_deltas", True)
    p.add_hparam("num_zeropad_frames", 250)

    p = defaults
    # p.stop_at_eos = int(False)
    p.input_modality = {"inputs": ("audio:speech_recognition_modality", None)}
    p.target_modality = (registry.Modalities.SYMBOL, 256)

  @property
  def is_character_level(self):
    return True

  @property
  def input_space_id(self):
    return problem.SpaceID.AUDIO_SPECTRAL

  @property
  def target_space_id(self):
    return problem.SpaceID.EN_CHR

  def feature_encoders(self, _):
    return {
        "inputs": None,  # Put None to make sure that the logic in
                         # decoding.py doesn't try to convert the floats
                         # into text.
        "waveforms": AudioEncoder(),
        "targets": ByteTextEncoderWithEos(),
    }

  def example_reading_spec(self):
    data_fields = {
        "waveforms": tf.VarLenFeature(tf.float32),
        "targets": tf.VarLenFeature(tf.int64),
    }

    data_items_to_decoders = None

    return data_fields, data_items_to_decoders

  def preprocess_example(self, example, mode, hparams):
    p = hparams
    if p.audio_preproc_in_bottom:
      example["inputs"] = tf.expand_dims(
          tf.expand_dims(example["waveforms"], -1), -1)
    else:
      waveforms = tf.expand_dims(example["waveforms"], 0)
      mel_fbanks = compute_mel_filterbank_features(
          waveforms,
          sample_rate=p.audio_sample_rate,
          dither=p.audio_dither,
          preemphasis=p.audio_preemphasis,
          frame_length=p.audio_frame_length,
          frame_step=p.audio_frame_step,
          lower_edge_hertz=p.audio_lower_edge_hertz,
          upper_edge_hertz=p.audio_upper_edge_hertz,
          num_mel_bins=p.audio_num_mel_bins,
          apply_mask=False)
      if p.audio_add_delta_deltas:
        mel_fbanks = add_delta_deltas(mel_fbanks)
      fbank_size = common_layers.shape_list(mel_fbanks)
      assert fbank_size[0] == 1

      # Per-utterance mean-variance normalization; this replaces CMVN
      # estimation on data.
      var_epsilon = 1e-09
      mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1)
      variance = tf.reduce_mean(tf.square(mel_fbanks - mean),
                                keepdims=True, axis=1)
      mel_fbanks = (mel_fbanks - mean) * tf.rsqrt(variance + var_epsilon)

      # Later models like to flatten the two spatial dims. Instead, we add a
      # unit spatial dim and flatten the frequencies and channels.
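      # The resulting "inputs" tensor has shape
      # [num_frames + num_zeropad_frames, num_mel_bins, num_channels], where
      # num_channels is 3 with delta-deltas and 1 without; batching happens
      # later in the input pipeline.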
example["inputs"] = tf.concat([ tf.reshape(mel_fbanks, [fbank_size[1], fbank_size[2], fbank_size[3]]), tf.zeros((p.num_zeropad_frames, fbank_size[2], fbank_size[3]))], 0) if not p.audio_keep_example_waveforms: del example["waveforms"] return super(SpeechRecognitionProblem, self ).preprocess_example(example, mode, hparams) def eval_metrics(self): defaults = super(SpeechRecognitionProblem, self).eval_metrics() return defaults + [metrics.Metrics.EDIT_DISTANCE] @registry.register_audio_modality class SpeechRecognitionModality(modality.Modality): """Common ASR filterbank processing.""" def bottom(self, x): """Use batchnorm instead of CMVN and shorten the stft with strided convs. Args: x: float32 tensor with shape [batch_size, len, 1, freqs * channels] Returns: float32 tensor with shape [batch_size, shorter_len, 1, hidden_size] """ inputs = x p = self._model_hparams num_mel_bins = p.audio_num_mel_bins num_channels = 3 if p.audio_add_delta_deltas else 1 with tf.variable_scope(self.name): if p.audio_preproc_in_bottom: # Compute filterbanks with tf.variable_scope("fbanks"): waveforms = tf.squeeze(inputs, [2, 3]) mel_fbanks = compute_mel_filterbank_features( waveforms, sample_rate=p.audio_sample_rate, dither=p.audio_dither, preemphasis=p.audio_preemphasis, frame_length=p.audio_frame_length, frame_step=p.audio_frame_step, lower_edge_hertz=p.audio_lower_edge_hertz, upper_edge_hertz=p.audio_upper_edge_hertz, num_mel_bins=p.audio_num_mel_bins, apply_mask=True) if p.audio_add_delta_deltas: mel_fbanks = add_delta_deltas(mel_fbanks) x = tf.reshape(mel_fbanks, common_layers.shape_list(mel_fbanks)[:2] + [num_mel_bins, num_channels]) nonpadding_mask = 1. - common_attention.embedding_to_padding(x) num_of_nonpadding_elements = tf.reduce_sum( nonpadding_mask) * num_mel_bins * num_channels # This replaces CMVN estimation on data var_epsilon = 1e-09 mean = tf.reduce_sum( x, axis=[1], keepdims=True) / num_of_nonpadding_elements variance = (num_of_nonpadding_elements * mean**2. - 2. * mean * tf.reduce_sum(x, axis=[1], keepdims=True) + tf.reduce_sum(x**2, axis=[1], keepdims=True) ) / num_of_nonpadding_elements x = (x - mean) * tf.rsqrt(variance + var_epsilon) * tf.expand_dims( nonpadding_mask, -1) else: x = inputs # The convention is that the models are flattened along the spatial, # dimensions, thus the speech preprocessor treats frequencies and # channels as image colors (last axis) x.set_shape([None, None, num_mel_bins, num_channels]) # TODO(chorowski): how to specify bottom's hparams and avoid hardcoding? x = tf.pad(x, [[0, 0], [0, 8], [0, 0], [0, 0]]) for _ in range(2): x = tf.layers.conv2d( x, 128, (3, 3), (2, 2), use_bias=False) x = common_layers.layer_norm(x) x = tf.nn.relu(x) xshape = common_layers.shape_list(x) # apply a conv that will remove all frequencies and at the same time # project the output into desired hidden_size x = tf.pad(x, [[0, 0], [0, 2], [0, 0], [0, 0]]) x = tf.layers.conv2d(x, p.hidden_size, (3, xshape[2]), use_bias=False) assert common_layers.shape_list(x)[2] == 1 x = common_layers.layer_norm(x) x = tf.nn.relu(x) return x