"""Modules used by the Fast SQuAD model.

import tensorflow as tf
import tensorflow.contrib.layers as tf_layers
from math import log as math_log

class BiDAFAttn(object):
    """Module for Bi-Directional Attention Flow attention.

    Based on the paper "Bi-Directional Attention Flow for Machine Comprehension" by Seo et al., 2017.

    Computes a similarity matrix with tri-linear function of context, question, and context * question.
    This similarity matrix is then used to compute context-to-question (C2Q) attention and
    (optionally) question-to-context (Q2C) attention.
    """

    def __init__(self, keep_prob, l2_lambda=3e-7):
        """Initialize a BiDAFAttn module.

        Args:
          keep_prob: float. Probability of keeping a node, passed to tf.nn.dropout.
          l2_lambda: float. L2 regularization factor.
        """
        self.keep_prob = keep_prob
        self.l2_lambda = l2_lambda

    def _similarity_matrix(self, c_vecs, num_c_vecs, q_vecs, num_q_vecs, vec_size):
        """Build similarity matrix for Bidirectional attention.

        Use tri-linear function of context, question, and context * question.

        Args:
          c_vecs: tensor. Context vectors. Shape (batch_size, num_c_vecs, vec_size).
          num_c_vecs: tensor. Number of context vectors.
          q_vecs: tensor. Question vectors. Shape (batch_size, num_q_vecs, vec_size).
          num_q_vecs: tensor. Number of question vectors.
          vec_size: int. Size of each context and question vector.

        Returns:
          sim_mat: tensor. Similarity matrix of shape (batch_size, num_c_vecs, num_q_vecs).
        """
        # Prepare each of the inputs for the linearity: flatten everything to rows of size vec_size.
        x_c = tf.reshape(c_vecs, [-1, vec_size])
        x_q = tf.reshape(q_vecs, [-1, vec_size])
        # Broadcasted element-wise product of every (context, question) pair.
        x_cq = tf.reshape(tf.expand_dims(c_vecs, 2) * tf.expand_dims(q_vecs, 1), [-1, vec_size])

        # Perform dropout on each input.
        x_c = tf.nn.dropout(x_c, self.keep_prob)
        x_q = tf.nn.dropout(x_q, self.keep_prob)
        x_cq = tf.nn.dropout(x_cq, self.keep_prob)

        # For memory efficiency, we compute the linearity piecewise over c, q, and c*q.
        # NOTE(review): the trailing arguments of these three calls were cut off; the L2 regularizer
        # is reconstructed from the documented purpose of l2_lambda -- confirm against the trained model.
        regularizer = tf_layers.l2_regularizer(self.l2_lambda)
        y_c = tf_layers.fully_connected(x_c, 1, activation_fn=None,
                                        weights_regularizer=regularizer)
        y_q = tf_layers.fully_connected(x_q, 1, activation_fn=None,
                                        weights_regularizer=regularizer)
        y_cq = tf_layers.fully_connected(x_cq, 1, activation_fn=None,
                                         weights_regularizer=regularizer)

        # Prepare to add each component together (y_c and y_q broadcast over the missing axis).
        y_c = tf.reshape(y_c, [-1, num_c_vecs, 1])
        y_q = tf.reshape(y_q, [-1, 1, num_q_vecs])
        y_cq = tf.reshape(y_cq, [-1, num_c_vecs, num_q_vecs])

        # Combine the piecewise linearities to get the full product W_sim^T [c; q; c*q].
        sim_mat = y_c + y_q + y_cq

        return sim_mat

    @staticmethod
    def _beta_fn(c_vecs, c2q_attn, q2c_attn):
        """Apply the beta function to combine the context and attention matrices.

        Based on beta from the paper "Bidirectional Attention Flow for Machine Comprehension" by Seo et al., 2017.
        They simply concatenate [c, c2q_attn, c * c2q_attn, c * q2c_attn].

        Args:
          c_vecs: tensor. Context vectors.
          c2q_attn: tensor. Context-to-query attention.
          q2c_attn: tensor or None. Query-to-context attention.

        Returns:
          beta: tensor. Result of applying the chosen beta function to the inputs.
          The c * q2c_attn term is omitted when q2c_attn is None.
        """
        parts = [c_vecs, c2q_attn, c_vecs * c2q_attn]
        if q2c_attn is not None:
            parts.append(c_vecs * q2c_attn)
        return tf.concat(parts, axis=2)

    def build_graph(self, c_vecs, c_mask, num_c_vecs, q_vecs, q_mask, num_q_vecs, use_q2c=True, scope="BiDAFAttn"):
        """Build the BiDAF attention layer component for the compute graph.

        Args:
          c_vecs: tensor. Context vectors. Shape (batch_size, num_c_vecs, vec_size).
          c_mask: tensor. Mask for the context vectors. Shape (batch_size, num_c_vecs).
          num_c_vecs: tensor. Number of context vectors. Shape ().
          q_vecs: tensor. Question vectors. Shape (batch_size, num_q_vecs, vec_size).
          q_mask: tensor. Mask for the question vectors. Shape (batch_size, num_q_vecs).
          num_q_vecs: tensor. Number of question vectors. Shape ().
          use_q2c: bool. If true, include both C2Q and Q2C attention. If false, only use C2Q attention.
          scope: string. Name of scope to use for TensorFlow variables.

        Returns:
          attn_outputs: Tensor. Shape (batch_size, num_c_vecs, n*vec_size). If use_q2c, then n=4. Else n=3.
        """
        with tf.variable_scope(scope):
            vec_size = c_vecs.get_shape().as_list()[2]

            # Calculate similarity matrix (batch_size, num_c_vecs, num_q_vecs)
            sim_mat = self._similarity_matrix(c_vecs, num_c_vecs, q_vecs, num_q_vecs, vec_size)

            # Calculate context-to-query attention (softmax over the question axis)
            q_mask_expanded = tf.expand_dims(q_mask, axis=1)
            _, sim_bar = masked_softmax(sim_mat, q_mask_expanded, 2)  # (batch_size, num_c_vecs, num_q_vecs)
            c2q_attn = tf.matmul(sim_bar, q_vecs)                     # (batch_size, num_c_vecs, vec_size)

            # Calculate query-to-context attention (softmax over the context axis)
            if use_q2c:
                c_mask_expanded = tf.expand_dims(c_mask, axis=2)
                _, sim_dbl_bar = masked_softmax(sim_mat, c_mask_expanded, 1)  # (batch_size, num_c_vecs, num_q_vecs)
                sim_dbl_bar = tf.transpose(sim_dbl_bar, (0, 2, 1))            # (batch_size, num_q_vecs, num_c_vecs)
                sim_sim = tf.matmul(sim_bar, sim_dbl_bar)                     # (batch_size, num_c_vecs, num_c_vecs)
                q2c_attn = tf.matmul(sim_sim, c_vecs)                         # (batch_size, num_c_vecs, vec_size)
            else:
                # Without this branch the Q2C result would be unconditionally discarded.
                q2c_attn = None

            # Apply beta function to combine the context with the attention outputs
            outputs = self._beta_fn(c_vecs, c2q_attn, q2c_attn)

        return outputs

class CharLevelEncoder(object):
    """Module for encoding words based on their character embeddings.

    Performs a 1D convolution over the character embeddings, then performs max-pooling
    over each output feature for each word.

    Based on the paper "Character-Aware Neural Language Models" by Kim et al., 2015
    """

    def __init__(self, char_emb_size, kernel_size, keep_prob):
        """Initialize a CharLevelEncoder module.

        Args:
          char_emb_size: int. Size of each character embedding; also used as the number of conv filters.
          kernel_size: int. Width of the 1D convolution kernel over characters.
          keep_prob: float. Probability of keeping a node, passed to tf.nn.dropout.
        """
        self.char_emb_size = char_emb_size
        self.kernel_size = kernel_size
        self.keep_prob = keep_prob
        # Treated as training whenever dropout is effectively enabled (keep_prob below ~1).
        self.is_training = self.keep_prob < (1. - 1e-5)

    def build_graph(self, char_embeddings, seq_len, word_len, scope="CharLevelEncoder", reuse=None):
        """Compute the char-level word embeddings for the given char_embeddings sequence.

        Args:
          char_embeddings: tensor. Shape (batch_size, seq_len, word_len, char_emb_size).
          seq_len: tensor. Max sequence length. Shape ().
          word_len: int. Max word length.
          scope: string. Name of scope to use for TensorFlow variables.
          reuse: bool or None. Passed to tf.variable_scope and std_conv.

        Returns:
          Tensor shape (batch_size, seq_len, char_emb_size). The char-level word embedding
          for each word in the input tensor.
        """
        with tf.variable_scope(scope, reuse=reuse):
            # Fold the sequence axis into the batch so each word is convolved independently.
            char_embeddings = tf.reshape(char_embeddings, [-1, word_len, self.char_emb_size])
            char_embeddings = tf.nn.dropout(char_embeddings, self.keep_prob)
            # VALID padding outperforms SAME here. Perhaps it dilutes the importance of prefixes and suffixes in words.
            char_embeddings = std_conv(char_embeddings, self.char_emb_size, self.kernel_size, padding="VALID",
                                       activation_fn=tf.nn.relu, reuse=reuse)  # (bs * seq_len, out_len, emb_size)
            # Max-pool over the character positions of each word.
            char_embeddings = tf.reduce_max(char_embeddings, axis=1)
            char_embeddings = tf.reshape(char_embeddings, [-1, seq_len, self.char_emb_size])

        return char_embeddings

class EncoderBlock(object):
    """Module for an encoder block, which uses convolution and self-attention to encode a sequence.

    Based on the paper "Fast and Accurate Reading Comprehension" by Yu et al.

    Each block consists of [CONV x *] + [SELF-ATTN] + [FEED-FWD], where each sublayer in brackets
    is a layer-norm residual block mapping x to sublayer(layer_norm(x)) + x.
    We apply layer normalization at the pre-processing step of each layer.
    We apply dropout, a residual connection, and stochastic depth dropout at the post-processing step of each sublayer.
    """

    def __init__(self, num_blocks, keep_prob, kernel_size, d_model, num_conv_layers, num_heads, d_ff, l2_lambda=3e-7):
        """Initialize an EncoderBlock module.

        Args:
          num_blocks: int. Number of stacked blocks.
          keep_prob: float. Probability of keeping a node, passed to tf.nn.dropout.
          kernel_size: int. Kernel size of the depthwise-separable convolutions.
          d_model: int. Model dimension (number of filters of each sublayer output).
          num_conv_layers: int. Number of convolution sublayers per block.
          num_heads: int. Number of heads for multi-head self-attention.
          d_ff: int. Hidden size of the feed-forward sublayer.
          l2_lambda: float. L2 regularization factor.
        """
        self.num_blocks = num_blocks
        self.keep_prob = keep_prob
        self.kernel_size = kernel_size
        self.d_model = d_model
        self.num_conv_layers = num_conv_layers
        self.num_heads = num_heads
        self.d_ff = d_ff
        self.l2_lambda = l2_lambda
        # Each block has num_conv_layers conv sublayers plus one self-attention and one feed-forward sublayer.
        self.num_sublayers = self.num_blocks * (self.num_conv_layers + 2)

    @staticmethod
    def _sublayer_pre_process(layer_inputs, reuse=None):
        """Perform sublayer pre-processing steps. We only apply layer_norm.

        A note from Google's tensor2tensor repo:
        "The current settings ("", "dan") are the published version
        of the transformer.  ("n", "da") seems better for harder-to-learn
        models, so it should probably be the default."
        """
        return tf_layers.layer_norm(layer_inputs, scope="LayerNorm", reuse=reuse)

    def _sublayer_post_process(self, layer_inputs, layer_outputs, sublayer_id, use_dropout=True):
        """Perform sublayer post-processing steps. We apply dropout and residual connection,
        followed by stochastic depth dropout.

        A note from Google's tensor2tensor repo:
        "The current settings ("", "dan") are the published version
        of the transformer.  ("n", "da") seems better for harder-to-learn
        models, so it should probably be the default."
        """
        if use_dropout:
            layer_outputs = tf.nn.dropout(layer_outputs, self.keep_prob)
        layer_outputs += layer_inputs
        return self._stochastic_depth_dropout(layer_inputs, layer_outputs, sublayer_id)

    def _stochastic_depth_dropout(self, sublayer_inputs, sublayer_outputs, sublayer_id):
        """Perform stochastic depth dropout on a sublayer. Earlier layers are more likely to survive.

        Sublayer l survives with probability 1 - (l/L) * (1 - p_L), where L is the total
        number of sublayers and p_L = self.keep_prob.
        If the sublayer survives, returns sublayer_outputs. Else returns sublayer_inputs.
        """
        assert 0 < sublayer_id <= self.num_sublayers, "Invalid sublayer ID: {}.".format(sublayer_id)
        # Earlier layers are more likely to be kept than later layers.
        keep_prob = 1. - float(sublayer_id) / float(self.num_sublayers) * (1. - self.keep_prob)
        # A single scalar draw: the whole sublayer is kept or skipped for the entire batch.
        do_keep = tf.random_uniform([]) < keep_prob
        return tf.cond(do_keep, lambda: sublayer_outputs, lambda: sublayer_inputs)

    def _build_positional_encoding_sublayer(self, inputs, seq_len, scope="PositionalEncoding"):
        """Add positional encoding to the inputs.

        Based on the paper "Attention is all you need" by Vaswani et al.
        Code adapted from Google's Tensor2Tensor repo on GitHub.
        """
        with tf.variable_scope(scope):
            vec_size = inputs.get_shape().as_list()[-1]
            positions = tf.expand_dims(tf.to_float(tf.range(seq_len)), 1)  # shape (seq_len, 1)
            # Half the channels carry sin, the other half cos.
            d_model = vec_size // 2
            exponent_step = math_log(10000.) / (tf.to_float(d_model) - 1)
            pos_multiplier = tf.exp(tf.to_float(tf.range(d_model)) * -exponent_step)
            pos_encoded = positions * tf.expand_dims(pos_multiplier, 0)  # shape (seq_len, d_model)
            pos_encoded = tf.concat([tf.sin(pos_encoded), tf.cos(pos_encoded)], axis=1)
            # Zero-pad one channel when vec_size is odd so the shapes line up.
            pos_encoded = tf.pad(pos_encoded, [[0, 0], [0, tf.mod(vec_size, 2)]])
            pos_encoded = tf.reshape(pos_encoded, [1, seq_len, vec_size])
            outputs = inputs + pos_encoded
            outputs = tf.nn.dropout(outputs, self.keep_prob)

        return outputs

    @staticmethod
    def _ds_conv(inputs, num_filters, kernel_size, l2_lambda=3e-7, scope="DSConv", reuse=None):
        """Depthwise-separable 1D convolution.

        Based on the paper "Xception: Deep Learning with Depthwise Separable Convolutions" by Francois Chollet.

        Args:
          inputs: tensor. Rank 3, will be expanded along dimension 2 then squeezed back.
          num_filters: int. Number of filters to use in convolution.
          kernel_size: int. Size of kernel to use in convolution.
          l2_lambda: float. L2 regularization factor for filters.
          scope: string. Name of scope to use for TensorFlow variables.
          reuse: bool or None. Passed to tf.variable_scope.

        Returns:
          Tensor of rank 3 whose last dimension is num_filters, after bias and ReLU.
        """
        with tf.variable_scope(scope, reuse=reuse):
            vec_size = inputs.get_shape().as_list()[-1]
            # NOTE(review): the trailing arguments of both get_variable calls were cut off; the
            # dtype/regularizer/initializer below are reconstructed from the adjacent comments
            # and the l2_lambda contract -- confirm.
            # Depth-wise filter. Use He initializer because a ReLU activation follows immediately.
            d_filter = tf.get_variable("d_filter",
                                       shape=(kernel_size, 1, vec_size, 1),
                                       dtype=tf.float32,
                                       regularizer=tf_layers.l2_regularizer(l2_lambda),
                                       initializer=tf_layers.variance_scaling_initializer())
            # Point-wise filter. Use He initializer because we use ReLU activation.
            p_filter = tf.get_variable("p_filter",
                                       shape=(1, 1, vec_size, num_filters),
                                       dtype=tf.float32,
                                       regularizer=tf_layers.l2_regularizer(l2_lambda),
                                       initializer=tf_layers.variance_scaling_initializer())
            # Expand dims so we can use the TensorFlow separable Conv2D implementation.
            # Note: Standard tf.nn.conv1D does an analogous thing, reshaping its inputs and calling tf.nn.conv2D.
            inputs = tf.expand_dims(inputs, axis=2)
            outputs = tf.nn.separable_conv2d(inputs, d_filter, p_filter, strides=(1, 1, 1, 1), padding="SAME")
            # Bias
            b = tf.get_variable("b", shape=(outputs.shape[-1],), initializer=tf.zeros_initializer())
            # Activation
            outputs = tf.nn.relu(outputs + b)
            outputs = tf.squeeze(outputs, axis=2)

        return outputs

    def _build_conv_sublayer(self, inputs, sublayer_id, scope=None, reuse=None):
        """Compute conv(layer_norm(x)) + x, where conv is depthwise-separable convolution.

        Args:
          inputs: tensor. The input sequence to this sublayer. Shape (batch_size, seq_len, num_filters).
          sublayer_id: int. ID for this sublayer, used for stochastic depth dropout. Bounds: [1, self.num_sublayers].
          scope: string or None. Variable scope name; defaults to "ConvSublayer<sublayer_id>".
          reuse: bool or None. Passed to tf.variable_scope.

        Returns:
          Tensor shape (batch_size, seq_len, num_filters). Result of applying the sublayer operations.
        """
        scope = scope or "ConvSublayer{}".format(sublayer_id)
        with tf.variable_scope(scope, reuse=reuse):
            outputs = self._sublayer_pre_process(inputs, reuse=reuse)
            outputs = self._ds_conv(outputs, self.d_model, self.kernel_size, self.l2_lambda, reuse=reuse)

        return self._sublayer_post_process(inputs, outputs, sublayer_id)

    def _build_self_attn_sublayer(self, inputs, seq_len, mask, sublayer_id, scope="SelfAttnSublayer", reuse=None):
        """Compute self_attn(layer_norm(x)) + x, where self_attn is multi-head self-attention
        as described in Vaswani et al., 2017.

        Args:
          inputs: tensor. The input sequence to this sublayer. Shape (batch_size, seq_len, num_filters).
          seq_len: tensor. Maximal length of each sequence. Shape ().
          mask: tensor. Mask for the sequence, forwarded to the attention module.
          sublayer_id: int. ID for this sublayer, used for stochastic depth dropout. Bounds: [1, self.num_sublayers].
          scope: string. Name of scope to use for TensorFlow variables.
          reuse: bool or None. Passed to tf.variable_scope.

        Returns:
          Tensor shape (batch_size, seq_len, num_filters). Result of applying the sublayer operations.
        """
        with tf.variable_scope(scope, reuse=reuse):
            outputs = self._sublayer_pre_process(inputs, reuse=reuse)
            attn = MultiHeadAttn(self.num_heads, self.d_model, l2_lambda=self.l2_lambda)
            # Self-attention: the normalized inputs serve as queries, keys and values.
            outputs = attn.build_graph(outputs, outputs, outputs, seq_len, mask, reuse=reuse)

        return self._sublayer_post_process(inputs, outputs, sublayer_id)

    def _build_feed_fwd_sublayer(self, inputs, sublayer_id, scope="FeedForwardSublayer", reuse=None):
        """Compute feed_fwd(layer_norm(x)) + x, where feed_fwd is a feed-forward of two 1x1 conv layers.

        Args:
          inputs: tensor. The input sequence to this sublayer. Shape (batch_size, seq_len, num_filters).
          sublayer_id: int. ID for this sublayer, used for stochastic depth dropout. Bounds: [1, self.num_sublayers].
          scope: string. Name of scope to use for TensorFlow variables.
          reuse: bool or None. Passed to tf.variable_scope.

        Returns:
          Tensor shape (batch_size, seq_len, num_filters). Result of applying the sublayer operations.
        """
        with tf.variable_scope(scope, reuse=reuse):
            outputs = self._sublayer_pre_process(inputs, reuse=reuse)
            outputs = std_conv(outputs, self.d_ff, activation_fn=tf.nn.relu, scope="Conv1", reuse=reuse)
            outputs = std_conv(outputs, self.d_model, activation_fn=None, scope="Conv2", reuse=reuse)

        return self._sublayer_post_process(inputs, outputs, sublayer_id)

    def _build_input_reduction_sublayer(self, inputs, scope="ReduceInputDim", reuse=None):
        """Project inputs down to dimension d_model. Unlike other sublayers, does not have a
        residual connection, since inputs and outputs have different dimension.
        """
        with tf.variable_scope(scope, reuse=reuse):
            outputs = std_conv(inputs, self.d_model, scope="Conv", reuse=reuse)

        return outputs

    def build_graph(self, inputs, seq_len, mask, reduce_input_dim=False, scope="EncoderBlock", reuse=None):
        """Build the compute graph for an EncoderBlock.

        EncoderBlocks are described in Yu et al., 2018 (https://openreview.net/forum?id=B14TlG-RW).
        An EncoderBlock consists of n blocks, where each block consists of:
          * m sublayers: depthwise-separable convolution
          * 1 sublayer: self-attention using multi-head attention (cf. Vaswani et al., 2017).
          * 1 sublayer: feed forward network-in-network (two standard convolutional layers with filter size of 1).
        Each sublayer computes sublayer(layer_norm(x)) + x, and we perform stochastic depth dropout on each sublayer.

        Args:
          inputs: tensor. The input sequence to this EncoderBlock. Shape (batch_size, seq_len, vec_size).
          seq_len: tensor. Maximal length of each sequence. Shape ().
          mask: tensor. Mask for the sequence. Shape (batch_size, seq_len).
          reduce_input_dim: bool. If true, project the last input dimension down from vec_size to d_model.
          scope: string. Name of scope to use for TensorFlow variables.
          reuse: bool or None. Passed to tf.variable_scope.

        Returns:
            tensor. Output of this EncoderBlock. Shape (batch_size, seq_len, d_model).
        """
        with tf.variable_scope(scope, reuse=reuse):
            # Reduce input dimension (happens only once per EncoderBlock)
            if reduce_input_dim:
                outputs = self._build_input_reduction_sublayer(inputs)
            else:
                # Without this branch the projection above would be unconditionally discarded.
                outputs = inputs

            # Keep track of sublayer ID for computing stochastic depth dropout probability
            sublayer_id = 1
            for block_id in range(self.num_blocks):
                with tf.variable_scope("Block{}".format(block_id + 1), reuse=reuse):
                    # Add positional encoding
                    outputs = self._build_positional_encoding_sublayer(outputs, seq_len)

                    # Add convolution sublayers
                    for _ in range(self.num_conv_layers):
                        outputs = self._build_conv_sublayer(outputs, sublayer_id, reuse=reuse)
                        sublayer_id += 1

                    # Add self-attention sublayer
                    outputs = self._build_self_attn_sublayer(outputs, seq_len, mask, sublayer_id, reuse=reuse)
                    sublayer_id += 1

                    # Add feed-forward sublayer
                    outputs = self._build_feed_fwd_sublayer(outputs, sublayer_id, reuse=reuse)
                    sublayer_id += 1

        return outputs

class HighwayEncoder(object):
    """Encode an input sequence using a highway network.

    Based on the paper "Highway Networks" by Srivastava et al.
    """

    def __init__(self, num_layers, keep_prob, l2_lambda=3e-7):
        """Initialize a HighwayEncoder module.

        Args:
          num_layers: int. Number of highway layers to stack.
          keep_prob: float. Probability of keeping a node, passed to tf.nn.dropout.
          l2_lambda: float. L2 regularization factor (stored on the instance).
        """
        self.num_layers = num_layers
        self.keep_prob = keep_prob
        self.l2_lambda = l2_lambda

    def build_graph(self, inputs, scope="HighwayEncoder", reuse=None):
        """Build a highway network with the number of layers specified in the initializer.

        Args:
          inputs: Tensor shape (batch_size, seq_len, vec_size)
          scope: string. Name of scope to use for TensorFlow variables.
          reuse: bool or None. Passed to tf.variable_scope and std_conv.

        Return: Tensor shape (batch_size, seq_len, vec_size) after passing through
          num_layers highway network layers.
        """
        with tf.variable_scope(scope, reuse=reuse):
            outputs = inputs
            for l in range(self.num_layers):
                # Each layer computes [gate * transform + (1 - gate) * layer_input]
                with tf.variable_scope("Layer{}".format(l+1), reuse=reuse):
                    vec_size = inputs.get_shape().as_list()[-1]

                    # Compute non-linear transform with 1x1 feed-forward Conv layer and ReLU activation
                    h = std_conv(outputs, vec_size, activation_fn=tf.nn.relu, scope="NonLinearTransform", reuse=reuse)

                    # Compute transform gate with 1x1 feed-forward Conv layer and sigmoid activation
                    t = std_conv(outputs, vec_size, activation_fn=tf.nn.sigmoid, scope="TransformGate", reuse=reuse)

                    # Combine non-linear transformation with the inputs, gated by the transform gate (we set C = 1-T).
                    outputs = h * t + outputs * (1. - t)

                    outputs = tf.nn.dropout(outputs, self.keep_prob)

        return outputs

class ScaledDotProductAttn(object):
    """Module for scaled dot-product attention.

    Based on the attention mechanism described in the paper "Attention Is All You Need" by Vaswani et al., 2017.

    Terminology: "Map[s] a query and a set of key-value pairs to an output, where the query, keys, values, and output
    are all vectors. The output is computed as a weighted sum of the values, where the weight assigned to each value
    is computed by a compatibility function of the query with the corresponding key."
    """

    def __init__(self):
        """Initialize a ScaledDotProductAttn module. The module holds no state."""
        pass

    @staticmethod
    def build_graph(queries, keys, values, mask):
        """Compute scaled dot-product attention, a weighted sum of the values, where the weight
        assigned to each value is computed by a compatibility function of the query with the corresponding key.

        Args:
          queries: tensor. Shape (batch_size, num_heads, max_seq_len, d_k).
          keys: tensor. Shape (batch_size, num_heads, max_seq_len, d_k).
          values: tensor. Shape (batch_size, num_heads, max_seq_len, d_v).
          mask: tensor. Shape (batch_size, max_seq_len).

        Returns:
          tensor. Weighted sum of the values, where each weight is computed by a compatibility function
          of a query vector and the key corresponding to a value. Shape matches that of values.
        """
        d_k = tf.shape(keys)[-1]
        # Compute QK^T
        qk_t = tf.matmul(queries, keys, transpose_b=True)  # Shape (bs, num_heads, max_seq_len, max_seq_len)
        # Scale by 1/sqrt(d_k) to give the dot products unit variance (hence the name "scaled dot-product")
        qk_t = qk_t / tf.sqrt(tf.cast(d_k, tf.float32))
        # Expand mask from shape (batch_size, max_seq_len) to (batch_size, 1, 1, max_seq_len).
        # We want to give out-of-range values 0 weight, and we sum over the last dimension for the softmax.
        mask = tf.expand_dims(mask, axis=1)
        mask = tf.expand_dims(mask, axis=2)
        # Compute the weights with softmax.
        _, weights = masked_softmax(qk_t, mask, -1)
        # Compute weighted sum of the values
        attn_outputs = tf.matmul(weights, values)

        return attn_outputs

class MultiHeadAttn(object):
    """Module for multi-head attention.

    Based on the attention mechanism described in the paper "Attention Is All You Need" by Vaswani et al., 2017.

    Calls the ScaledDotProductAttn module in parallel over a number of heads.
    """

    def __init__(self, num_heads, d_model, l2_lambda=3e-7):
        """Initialize a MultiHeadAttn module.

        Args:
          num_heads: int. Number of attention heads. Must divide d_model.
          d_model: int. Model dimension of the projected queries, keys, values and of the output.
          l2_lambda: float. L2 regularization factor.
        """
        assert d_model % num_heads == 0, "MultiHeadAttn: d_model must be divisible by num_heads"
        self.num_heads = num_heads
        self.d_model = d_model
        # Per-head key and value sizes.
        self.d_k = d_model // num_heads
        self.d_v = d_model // num_heads
        self.l2_lambda = l2_lambda

    def _project(self, q, k, v, scope="Linearity", reuse=None):
        """Project queries, keys, values with a linear layer.

        Note: We project the inputs for q, k, v *before* splitting to prepare inputs for each head.
        This differs from the order in "Attention Is All You Need," but is functionally equivalent.
        """
        def _project_one(x, d, inner_scope):
            return tf_layers.fully_connected(x, d, activation_fn=None, biases_initializer=None,
                                             scope=inner_scope, reuse=reuse)

        with tf.variable_scope(scope, reuse=reuse):
            q_projected = _project_one(q, self.d_model, "q")
            k_projected = _project_one(k, self.d_model, "k")
            v_projected = _project_one(v, self.d_model, "v")

        return q_projected, k_projected, v_projected

    def _split(self, q, k, v, seq_len):
        """Split queries, keys, values into pieces to prepare for multi-head scaled dot product."""
        def _split_one(x, d):
            x = tf.reshape(x, [-1, seq_len, self.num_heads, d])
            x = tf.transpose(x, [0, 2, 1, 3])  # Shape: (batch_size, num_heads, seq_len, d_x)
            return x

        q_split, k_split, v_split = _split_one(q, self.d_k), _split_one(k, self.d_k), _split_one(v, self.d_v)
        return q_split, k_split, v_split

    def _concat(self, attn_outputs, seq_len):
        """Concatenate heads of attention outputs.

        This happens at the end, after each head has performed scaled dot-product attention.
        """
        # Undo the head/sequence transpose done when splitting, then merge the heads.
        attn_outputs = tf.transpose(attn_outputs, [0, 2, 1, 3])
        attn_outputs = tf.reshape(attn_outputs, [-1, seq_len, self.num_heads * self.d_v])
        return attn_outputs

    def build_graph(self, q, k, v, seq_len, mask, scope="MultiHeadAttn", reuse=None):
        """Build the multi-head attention component of the compute graph.

        Args:
          q: tensor. Queries; reshaped per head using seq_len.
          k: tensor. Keys; reshaped per head using seq_len.
          v: tensor. Values; reshaped per head using seq_len.
          seq_len: tensor. Sequence length used when splitting and merging heads.
          mask: tensor. Mask forwarded to ScaledDotProductAttn.
          scope: string. Name of scope to use for TensorFlow variables.
          reuse: bool or None. Passed to tf.variable_scope and the linear layers.

        Returns:
          tensor. Attention outputs projected to the model dimension d_model.
        """
        with tf.variable_scope(scope, reuse=reuse):
            # Project each of q, k, v linearly
            q, k, v = self._project(q, k, v, reuse=reuse)
            # Split each of q, k, v to prepare for scaled dot product in parallel
            q, k, v = self._split(q, k, v, seq_len)
            # Perform scaled dot-product attention on q, k, v
            sdp_attn = ScaledDotProductAttn()
            attn_outputs = sdp_attn.build_graph(q, k, v, mask)
            # Merge the outputs of each head
            attn_outputs = self._concat(attn_outputs, seq_len)
            # Linear transform to project back to model dimension
            # NOTE(review): every argument after the first was cut off in this call. The bias-free
            # linear form mirrors _project; the regularizer and the "Output" scope name are
            # reconstructions -- confirm the scope name against existing checkpoints.
            attn_outputs = tf_layers.fully_connected(attn_outputs,
                                                     self.d_model,
                                                     activation_fn=None,
                                                     biases_initializer=None,
                                                     weights_regularizer=tf_layers.l2_regularizer(self.l2_lambda),
                                                     scope="Output",
                                                     reuse=reuse)

        return attn_outputs

class SimpleSoftmaxLayer(object):
    """Module to take set of hidden states, (e.g. one for each context location),
    and return probability distribution over those states.
    """

    def __init__(self, l2_lambda=3e-7):
        """Initialize a SimpleSoftmaxLayer module.

        Args:
          l2_lambda: float. L2 regularization factor passed to the down-projection layer.
        """
        self.l2_lambda = l2_lambda

    def build_graph(self, inputs, masks, scope="Softmax", reuse=None):
        """Applies 1D convolutional down-projection layer, then softmax.

        Args:
          inputs: Tensor shape (batch_size, seq_len, hidden_size)
          masks: Tensor shape (batch_size, seq_len)
            Has 1s where there is real input, 0s where there's padding.
          scope: string. Name of scope to use for TensorFlow variables.
          reuse: bool or None. Passed to tf.variable_scope and std_conv.

        Returns:
          logits: Tensor shape (batch_size, seq_len).
            logits is the result of the down-projection layer, but it has -1e30
            (i.e. very large negative number) in the padded locations
          prob_dist: Tensor shape (batch_size, seq_len)
            The result of taking softmax over logits.
            This should have 0 in the padded locations, and the rest should sum to 1.
        """
        with tf.variable_scope(scope, reuse=reuse):

            # Convolutional down-projection layer (one filter, no bias)
            logits = std_conv(inputs, 1, l2_lambda=self.l2_lambda, use_bias=False, scope="Logits", reuse=reuse)
            logits = tf.squeeze(logits, axis=2)  # shape: (batch_size, seq_len)

            # Take softmax over sequence
            masked_logits, prob_dist = masked_softmax(logits, masks, 1)

            return masked_logits, prob_dist

def masked_softmax(logits, mask, dim):
    """Takes masked softmax over given dimension of logits.

    Args:
      logits: Tensor. We want to take softmax over dimension dim.
      mask: Tensor whose shape is the same as, or broadcastable to, that of logits.
        Has 1s where there's real data in logits, 0 where there's padding
      dim: int. dimension over which to take softmax

    Returns:
      masked_logits: Tensor same shape as logits.
        This is the same as logits, but with 1e30 subtracted
        (i.e. very large negative number) in the padding locations.
      prob_dist: Tensor same shape as logits.
        The result of taking softmax over masked_logits in given dimension.
        Should be 0 in padding locations.
        Should sum to 1 over given dimension.
    """
    exp_mask = (1 - tf.cast(mask, 'float')) * (-1e30)  # -large where there's padding, 0 elsewhere
    masked_logits = tf.add(logits, exp_mask)  # where there's padding, set logits to -large
    prob_dist = tf.nn.softmax(masked_logits, dim)
    return masked_logits, prob_dist

def std_conv(inputs, num_filters, kernel_size=1, padding="SAME", activation_fn=None, l2_lambda=3e-7, use_bias=True,
             scope="Conv", reuse=None):
    """Standard 1D convolution, using SAME padding by default.

    Args:
      inputs: tensor. Input to the 1D conv layer. Shape (batch_size, seq_len, vec_size).
      num_filters: int. Depth of filter stack to use in 1D conv.
      kernel_size: int. Spatial extent of 1D kernel (i.e., number of timesteps the kernel covers per application).
      padding: string. Padding to use for 1D convolution. Defaults to "SAME".
      activation_fn: function. Activation function to apply to outputs before returning. If None, no activation.
      l2_lambda: float. L2 regularization factor to apply to the kernel weights.
      use_bias: bool. If true, apply a bias to the convolution outputs. Else, no bias.
      scope: string. Name of scope to use for TensorFlow variables.
      reuse: bool or None. Passed to tf.variable_scope.

    Returns:
      outputs: tensor. Outputs after convolution, bias (if any), and activation (if any) are applied.
      Shape (batch_size, out_seq_len, num_filters), where out_seq_len depends on the padding.
    """
    with tf.variable_scope(scope, reuse=reuse):
        vec_size = inputs.get_shape()[-1]
        # Use Xavier initializer if no activation, otherwise use He.
        initializer = tf_layers.xavier_initializer if activation_fn is None else tf_layers.variance_scaling_initializer
        # NOTE(review): the trailing arguments of this call were cut off; dtype/regularizer are
        # reconstructed from the l2_lambda contract. `initializer` above is a factory, so it is
        # instantiated here.
        filters = tf.get_variable("filters",
                                  shape=(kernel_size, vec_size, num_filters),
                                  dtype=tf.float32,
                                  regularizer=tf_layers.l2_regularizer(l2_lambda),
                                  initializer=initializer())
        outputs = tf.nn.conv1d(inputs, filters, stride=1, padding=padding)
        if use_bias:
            b = tf.get_variable("b", shape=(num_filters,), dtype=tf.float32, initializer=tf.zeros_initializer())
            outputs += b

    return outputs if activation_fn is None else activation_fn(outputs)