Python tensorflow.matrix_band_part() Examples

The following are 30 code examples of tensorflow.matrix_band_part(), drawn from open-source projects. You can go to the original project or source file via the reference above each example, or browse the other available functions and classes of the tensorflow module.
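Before the examples, a minimal sketch of what the op does (TensorFlow 1.x API; the same op is exposed as tf.linalg.band_part in TensorFlow 2.x): tf.matrix_band_part(input, num_lower, num_upper) keeps entry (i, j) of each innermost matrix when i - j <= num_lower and j - i <= num_upper, and a negative bound keeps that entire half of the matrix.

import tensorflow as tf

# Keep one subdiagonal and two superdiagonals of a 4x4 matrix of ones.
x = tf.ones([4, 4])
band = tf.matrix_band_part(x, 1, 2)
# band evaluates to:
# [[1. 1. 1. 0.]
#  [1. 1. 1. 1.]
#  [0. 1. 1. 1.]
#  [0. 0. 1. 1.]]

Passing num_lower=-1 and num_upper=0 keeps the lower triangle, which is why the op appears so often in the causal attention masks below.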
Example #1
Source File: transformer_attentions.py    From Counterfactual-StoryRW with MIT License
def _ones_matrix_band_part(rows, cols, num_lower, num_upper,
    out_shape=None):
    """Matrix band part of ones.
    """
    if all([isinstance(el, int) for el in [rows, cols, num_lower,
        num_upper]]):
        # Needed info is constant, so we construct in numpy
        if num_lower < 0:
            num_lower = rows - 1
        if num_upper < 0:
            num_upper = cols - 1
        lower_mask = np.tri(cols, rows, num_lower).T
        upper_mask = np.tri(rows, cols, num_upper)
        band = np.ones((rows, cols)) * lower_mask * upper_mask
        if out_shape:
            band = band.reshape(out_shape)
        band = tf.constant(band, tf.float32)
    else:
        band = tf.matrix_band_part(tf.ones([rows, cols]),
                                   tf.cast(num_lower, tf.int64),
                                   tf.cast(num_upper, tf.int64))
        if out_shape:
            band = tf.reshape(band, out_shape)
    return band 
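A quick usage sketch (assuming the helper above is in scope): constant Python ints take the NumPy branch, and num_lower=-1 with num_upper=0 yields the usual lower-triangular causal mask.

# Constant arguments take the NumPy path; tensor arguments fall back to
# tf.matrix_band_part.
mask = _ones_matrix_band_part(3, 3, num_lower=-1, num_upper=0)
# [[1. 0. 0.]
#  [1. 1. 0.]
#  [1. 1. 1.]]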
Example #2
Source File: vgp.py    From VFF with Apache License 2.0
def _build_predict_train(self):
        Kuf = self._Kuf
        Kuu = [make_Kuu(kern, a, b, self.ms) for kern, a, b, in zip(self.kerns, self.a, self.b)]
        KiKuf = [Kuu_d.solve(Kuf_d) for Kuu_d, Kuf_d in zip(Kuu, Kuf)]
        KfuKi = [tf.transpose(mat) for mat in KiKuf]

        mu = kvs_dot_vec(KfuKi, self.q_mu)
        L = tf.matrix_band_part(self.q_sqrt, -1, 0)
        tmp1 = kvs_dot_mat(KfuKi, L, num_cols=np.prod(self.Ms))

        # Kff:
        var = reduce(tf.multiply, [k.Kdiag(self.X[:, i:i+1]) for i, k in enumerate(self.kerns)])
        # Projected variance Kfu Ki [WWT] Ki Kuf
        # var = var + reduce(tf.multiply, [tf.reduce_sum(tf.square(tmp1_d), 0) for tmp1_d in tmp1])
        var = var + tf.reduce_sum(tf.square(tmp1), 1)
        # Qff
        var = var - reduce(tf.multiply, [tf.reduce_sum(Kuf_d * KiKuf_d, 0) for Kuf_d, KiKuf_d in zip(Kuf, KiKuf)])
        var = tf.reshape(var, (-1, 1))

        return mu, var 
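In these VFF models, tf.matrix_band_part(self.q_sqrt, -1, 0) simply takes the lower triangle of an unconstrained square parameter, so the variational covariance is parameterised through a Cholesky-like factor L with S = L Lᵀ. A minimal sketch of that trick (the variable name is illustrative, not from the source):

q_sqrt = tf.get_variable("q_sqrt", shape=[5, 5])  # unconstrained square matrix
L = tf.matrix_band_part(q_sqrt, -1, 0)            # keep only the lower triangle
S = tf.matmul(L, L, transpose_b=True)             # positive semi-definite covariance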
Example #3
Source File: utils.py    From OpenSeq2Seq with Apache License 2.0
def get_decoder_self_attention_bias(length, dtype=tf.float32):
  """Calculate bias for decoder that maintains model's autoregressive property.

  Creates a tensor that masks out locations that correspond to illegal
  connections, so prediction at position i cannot draw information from future
  positions.

  Args:
    length: int length of sequences in batch.
    dtype: requested dtype of the bias (the mask itself is currently built
      in float32).

  Returns:
    float tensor of shape [1, 1, length, length]
  """
  #print("get_decoder_self_attention_bias", dtype)

  with tf.name_scope("decoder_self_attention_bias"):
    #valid_locs = tf.matrix_band_part(tf.ones([length, length], dtype=dtype), -1, 0)
    valid_locs = tf.matrix_band_part(tf.ones([length, length], dtype=tf.float32), -1, 0)
    valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
    neg_inf=_NEG_INF #if (dtype==tf.float32) else _NEG_INF_FP16
    bias = neg_inf * (1.0 - valid_locs)
    #bias=tf.saturate_cast(bias, dtype=dtype)
  return bias 
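For illustration (assuming _NEG_INF is -1e9, as in the Transformer implementations this file follows), with length=3 the bias added to the attention logits is zero on and below the diagonal and -1e9 above it, so softmax gives effectively zero weight to future positions:

bias = get_decoder_self_attention_bias(3)
with tf.Session() as sess:
    print(sess.run(bias)[0, 0])
# [[ 0.e+00 -1.e+09 -1.e+09]
#  [ 0.e+00  0.e+00 -1.e+09]
#  [ 0.e+00  0.e+00  0.e+00]]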
Example #4
Source File: matrix_band_part_op_test.py    From deep_image_model with Apache License 2.0
def _GetMatrixBandPartTest(dtype_, batch_shape_, shape_):

  def Test(self):
    mat = np.ones(shape_).astype(dtype_)
    batch_mat = np.tile(mat, batch_shape_ + (1, 1))
    with self.test_session(use_gpu=True):
      for lower in -1, 0, 1, shape_[-2] - 1:
        for upper in -1, 0, 1, shape_[-1] - 1:
          band_np = mat
          if lower >= 0:
            band_np = np.triu(band_np, -lower)
          if upper >= 0:
            band_np = np.tril(band_np, upper)
          if batch_shape_ != ():
            band_np = np.tile(band_np, batch_shape_ + (1, 1))
          band = tf.matrix_band_part(batch_mat, lower, upper)
          self.assertAllEqual(band_np, band.eval())

  return Test 
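The test relies on the same NumPy identity used by the numpy branches above: a band part is the intersection of np.triu(mat, -num_lower) and np.tril(mat, num_upper). A standalone reference sketch, independent of the test harness:

import numpy as np

def band_part_np(mat, lower, upper):
    # Keep (i, j) when i - j <= lower (or lower < 0) and j - i <= upper (or upper < 0).
    out = mat.copy()
    if lower >= 0:
        out = np.triu(out, -lower)
    if upper >= 0:
        out = np.tril(out, upper)
    return out

a = np.arange(16, dtype=np.float32).reshape(4, 4)
print(band_part_np(a, 1, 0))  # main diagonal plus one subdiagonal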
Example #5
Source File: transformer_attentions.py    From texar with Apache License 2.0
def _ones_matrix_band_part(rows, cols, num_lower, num_upper,
    out_shape=None):
    """Matrix band part of ones.
    """
    if all([isinstance(el, int) for el in [rows, cols, num_lower,
        num_upper]]):
        # Needed info is constant, so we construct in numpy
        if num_lower < 0:
            num_lower = rows - 1
        if num_upper < 0:
            num_upper = cols - 1
        lower_mask = np.tri(cols, rows, num_lower).T
        upper_mask = np.tri(rows, cols, num_upper)
        band = np.ones((rows, cols)) * lower_mask * upper_mask
        if out_shape:
            band = band.reshape(out_shape)
        band = tf.constant(band, tf.float32)
    else:
        band = tf.matrix_band_part(tf.ones([rows, cols]),
                                   tf.cast(num_lower, tf.int64),
                                   tf.cast(num_upper, tf.int64))
        if out_shape:
            band = tf.reshape(band, out_shape)
    return band 
Example #6
Source File: model_utils.py    From models with Apache License 2.0
def get_decoder_self_attention_bias(length):
  """Calculate bias for decoder that maintains model's autoregressive property.

  Creates a tensor that masks out locations that correspond to illegal
  connections, so prediction at position i cannot draw information from future
  positions.

  Args:
    length: int length of sequences in batch.

  Returns:
    float tensor of shape [1, 1, length, length]
  """
  with tf.name_scope("decoder_self_attention_bias"):
    valid_locs = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
    valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
    decoder_bias = _NEG_INF * (1.0 - valid_locs)
  return decoder_bias 
Example #7
Source File: model_utils.py    From models with Apache License 2.0
def get_decoder_self_attention_bias(length):
  """Calculate bias for decoder that maintains model's autoregressive property.

  Creates a tensor that masks out locations that correspond to illegal
  connections, so prediction at position i cannot draw information from future
  positions.

  Args:
    length: int length of sequences in batch.

  Returns:
    float tensor of shape [1, 1, length, length]
  """
  with tf.name_scope("decoder_self_attention_bias"):
    valid_locs = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
    valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
    decoder_bias = _NEG_INF * (1.0 - valid_locs)
  return decoder_bias 
Example #8
Source File: model_utils.py    From models with Apache License 2.0
def get_decoder_self_attention_bias(length):
  """Calculate bias for decoder that maintains model's autoregressive property.

  Creates a tensor that masks out locations that correspond to illegal
  connections, so prediction at position i cannot draw information from future
  positions.

  Args:
    length: int length of sequences in batch.

  Returns:
    float tensor of shape [1, 1, length, length]
  """
  with tf.name_scope("decoder_self_attention_bias"):
    valid_locs = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
    valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
    decoder_bias = _NEG_INF * (1.0 - valid_locs)
  return decoder_bias 
Example #9
Source File: model_utils.py    From models with Apache License 2.0
def get_decoder_self_attention_bias(length):
  """Calculate bias for decoder that maintains model's autoregressive property.

  Creates a tensor that masks out locations that correspond to illegal
  connections, so prediction at position i cannot draw information from future
  positions.

  Args:
    length: int length of sequences in batch.

  Returns:
    float tensor of shape [1, 1, length, length]
  """
  with tf.name_scope("decoder_self_attention_bias"):
    valid_locs = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
    valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
    decoder_bias = _NEG_INF * (1.0 - valid_locs)
  return decoder_bias 
Example #10
Source File: common_layers.py    From fine-lm with MIT License
def ones_matrix_band_part(rows, cols, num_lower, num_upper, out_shape=None):
  """Matrix band part of ones."""
  if all([isinstance(el, int) for el in [rows, cols, num_lower, num_upper]]):
    # Needed info is constant, so we construct in numpy
    if num_lower < 0:
      num_lower = rows - 1
    if num_upper < 0:
      num_upper = cols - 1
    lower_mask = np.tri(cols, rows, num_lower).T
    upper_mask = np.tri(rows, cols, num_upper)
    band = np.ones((rows, cols)) * lower_mask * upper_mask
    if out_shape:
      band = band.reshape(out_shape)
    band = tf.constant(band, tf.float32)
  else:
    band = tf.matrix_band_part(
        tf.ones([rows, cols]), tf.cast(num_lower, tf.int64),
        tf.cast(num_upper, tf.int64))
    if out_shape:
      band = tf.reshape(band, out_shape)

  return band 
Example #11
Source File: common_layers.py    From ASR with Apache License 2.0
def mask_leq(target_length, source_length):
  """A mask with 1.0 wherever source_pos <= target_pos and 0.0 elsewhere.

  Args:
    target_length: an integer
    source_length: an integer
  Returns:
    a Tensor with shape [1, target_length, source_length]
  """
  return tf.expand_dims(
      tf.matrix_band_part(tf.ones([target_length, source_length]), -1, 0), 0) 
Example #12
Source File: layers.py    From NeuralEDUSeg with Apache License 2.0
def self_attention(inputs, lengths, window_size=-1, scope='bilinear_attention', reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # logits = tf.matmul(inputs, inputs, transpose_b=True)  # Q * K
        logits = trilinear_similarity(inputs, inputs)
        mask = tf.sequence_mask(lengths, tf.shape(inputs)[1], tf.float32)
        mask = tf.expand_dims(mask, 1)
        if window_size > 0:
            restricted_mask = tf.matrix_band_part(tf.ones_like(logits, dtype=tf.float32), window_size, window_size)
            mask = mask * restricted_mask
        logits = mask_logits(logits, mask)
        weights = tf.nn.softmax(logits, name='attn_weights')
        return tf.matmul(weights, inputs), weights 
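The window_size branch gives local attention: tf.matrix_band_part(ones, window_size, window_size) is 1 only where |i - j| <= window_size, so multiplying it into the length mask restricts each token to a neighbourhood of that size. A small check of the band itself (a sketch, not from the source):

band = tf.matrix_band_part(tf.ones([4, 4]), 1, 1)
# band evaluates to:
# [[1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [0. 1. 1. 1.]
#  [0. 0. 1. 1.]]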
Example #13
Source File: xlnet_encoder.py    From texar with Apache License 2.0
def _create_mask(self, qlen, mlen, dtype=tf.float32, same_length=False):
        r"""Create causal attention mask."""
        attn_mask = tf.ones([qlen, qlen], dtype=dtype)
        mask_u = tf.matrix_band_part(attn_mask, 0, -1)
        mask_dia = tf.matrix_band_part(attn_mask, 0, 0)
        attn_mask_pad = tf.zeros([qlen, mlen], dtype=dtype)
        ret = tf.concat([attn_mask_pad, mask_u - mask_dia], axis=1)
        if same_length:
            mask_l = tf.matrix_band_part(attn_mask, -1, 0)
            ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]],
                            axis=1)

        return ret 
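A worked illustration (values written out by hand; `encoder` stands for a hypothetical XLNetEncoder instance): for qlen=3 and mlen=2 with same_length=False, the mask is [3, 5] with 1.0 marking positions a query may not attend to. The mlen memory columns are never masked, and each query can attend to itself and to earlier query positions.

mask = encoder._create_mask(qlen=3, mlen=2)
with tf.Session() as sess:
    print(sess.run(mask))
# [[0. 0. 0. 1. 1.]
#  [0. 0. 0. 0. 1.]
#  [0. 0. 0. 0. 0.]]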
Example #14
Source File: continuous_actions.py    From tensorflow-rl with Apache License 2.0
def _build_q_head(self, input_state):
        self.w_value, self.b_value, self.value = layers.fc('fc_value', input_state, 1, activation='linear')
        self.w_L, self.b_L, self.L_full = layers.fc('L_full', input_state, self.num_actions, activation='linear')
        self.w_mu, self.b_mu, self.mu = layers.fc('mu', input_state, self.num_actions, activation='linear')

        # elements above the main diagonal in L_full are unused
        D = tf.matrix_band_part(tf.exp(self.L_full) - self.L_full, 0, 0)
        L = tf.matrix_band_part(self.L_full, -1, 0) + D

        LT_u_minus_mu = tf.einsum('ikj,ik', L, self.selected_action_ph  - self.mu)
        self.advantage = tf.einsum('ijk,ikj->i', LT_u_minus_mu, LT_u_minus_mu)

        q_selected_action = self.value + self.advantage
        diff = tf.subtract(self.target_ph, q_selected_action)
        return self._value_function_loss(diff) 
Example #15
Source File: common_attention.py    From ASR with Apache License 2.0
def attention_bias_lower_triangle(length):
    """Create an bias tensor to be added to attention logits.
  
    Args:
     length: a Scalar.
  
    Returns:
      a `Tensor` with shape [1, 1, length, length].
    """
    lower_triangle = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
    ret = -1e9 * (1.0 - lower_triangle)
    return tf.reshape(ret, [1, 1, length, length]) 
Example #16
Source File: model.py    From galois-autocompleter with MIT License
def attention_mask(nd, ns, *, dtype):
    """1's in the lower triangle, counting from the lower right corner.

    Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
    """
    i = tf.range(nd)[:,None]
    j = tf.range(ns)
    m = i >= j - ns + nd
    return tf.cast(m, dtype) 
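A quick consistency check (a sketch, assuming the function above is in scope and a TF 1.x session): the comparison-based mask matches the band-part formulation quoted in its docstring.

nd, ns = 3, 5
via_band = tf.matrix_band_part(tf.ones([nd, ns]), -1, ns - nd)
via_compare = attention_mask(nd, ns, dtype=tf.float32)
same = tf.reduce_all(tf.equal(via_band, via_compare))
with tf.Session() as sess:
    print(sess.run(same))  # True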
Example #17
Source File: common_layers.py    From NMT_GAN with Apache License 2.0
def mask_leq(target_length, source_length):
  """A mask with 1.0 wherever source_pos <= target_pos and 0.0 elsewhere.

  Args:
    target_length: an integer
    source_length: an integer
  Returns:
    a Tensor with shape [1, target_length, source_length]
  """
  return tf.expand_dims(
      tf.matrix_band_part(tf.ones([target_length, source_length]), -1, 0), 0) 
Example #18
Source File: attention.py    From Document-Transformer with BSD 3-Clause "New" or "Revised" License
def attention_bias(inputs, mode, inf=-1e9, name=None):
    """ A bias tensor used in attention mechanism
    :param inputs: A tensor
    :param mode: one of "causal", "masking", "proximal" or "distance"
    :param inf: A floating value
    :param name: optional string
    :returns: A 4D tensor with shape [batch, heads, queries, memories]
    """

    with tf.name_scope(name, default_name="attention_bias", values=[inputs]):
        if mode == "causal":
            length = inputs
            lower_triangle = tf.matrix_band_part(
                tf.ones([length, length]), -1, 0
            )
            ret = inf * (1.0 - lower_triangle)
            return tf.reshape(ret, [1, 1, length, length])
        elif mode == "masking":
            mask = inputs
            ret = (1.0 - mask) * inf
            return tf.expand_dims(tf.expand_dims(ret, 1), 1)
        elif mode == "proximal":
            length = inputs
            r = tf.to_float(tf.range(length))
            diff = tf.expand_dims(r, 0) - tf.expand_dims(r, 1)
            m = tf.expand_dims(tf.expand_dims(-tf.log(1 + tf.abs(diff)), 0), 0)
            return m
        elif mode == "distance":
            length, distance = inputs
            distance = tf.where(distance > length, 0, distance)
            distance = tf.cast(distance, tf.int64)
            lower_triangle = tf.matrix_band_part(
                tf.ones([length, length]), -1, 0
            )
            mask_triangle = 1.0 - tf.matrix_band_part(
                tf.ones([length, length]), distance - 1, 0
            )
            ret = inf * (1.0 - lower_triangle + mask_triangle)
            return tf.reshape(ret, [1, 1, length, length])
        else:
            raise ValueError("Unknown mode %s" % mode) 
Example #19
Source File: transformer.py    From deep_dialog_tutorial with MIT License
def _create_dec_self_attention_mask(self, decoder_input: tf.Tensor):
        with tf.name_scope('dec_self_attention_mask'):
            batch_size, length = tf.unstack(tf.shape(decoder_input))
            pad_array = tf.equal(decoder_input, PAD_ID)  # [batch_size, m_length]
            pad_array = tf.reshape(pad_array, [batch_size, 1, 1, length])

            autoregression_array = tf.logical_not(
                tf.matrix_band_part(tf.ones([length, length], dtype=tf.bool), -1, 0))  # lower triangle is False
            autoregression_array = tf.reshape(autoregression_array, [1, 1, length, length])

            return tf.logical_or(pad_array, autoregression_array) 
Example #20
Source File: vgp.py    From VFF with Apache License 2.0
def _build_predict(self, X, full_cov=False):
        # given self.q(v), compute q(f)

        Kuf = [make_Kuf(k, X[:, i:i+1], a, b, self.ms) for i, (k, a, b) in enumerate(zip(self.kerns, self.a, self.b))]
        Kuu = [make_Kuu(kern, a, b, self.ms) for kern, a, b, in zip(self.kerns, self.a, self.b)]
        KiKuf = [Kuu_d.solve(Kuf_d) for Kuu_d, Kuf_d in zip(Kuu, Kuf)]
        KfuKi = [tf.transpose(mat) for mat in KiKuf]

        mu = kvs_dot_vec(KfuKi, self.q_mu)

        L = tf.matrix_band_part(self.q_sqrt, -1, 0)
        tmp1 = kvs_dot_mat(KfuKi, L, np.prod(self.Ms))

        if full_cov:
            raise NotImplementedError
        else:
            # Kff:
            var = reduce(tf.multiply, [k.Kdiag(X[:, i:i+1]) for i, k in enumerate(self.kerns)])

            # Projected variance Kfu Ki [WWT] Ki Kuf
            # var = var + reduce(tf.multiply, [tf.reduce_sum(tf.square(tmp1_d), 0) for tmp1_d in tmp1])
            var = var + tf.reduce_sum(tf.square(tmp1), 1)

            # Qff
            var = var - reduce(tf.multiply, [tf.reduce_sum(Kuf_d * KiKuf_d, 0) for Kuf_d, KiKuf_d in zip(Kuf, KiKuf)])

            var = tf.reshape(var, (-1, 1))

        return mu, var 
Example #21
Source File: vgp.py    From VFF with Apache License 2.0
def _build_predict_train(self):
        Kuf = self._Kuf

        Kuu = [make_Kuu(kern, a, b, self.ms) for kern, a, b, in zip(self.kerns, self.a, self.b)]
        KiKuf = [Kuu_d.solve(Kuf_d) for Kuu_d, Kuf_d in zip(Kuu, Kuf)]
        KfuKi = [tf.transpose(mat) for mat in KiKuf]

        mu = kvs_dot_vec(KfuKi, self.q_mu)

        # Kff:
        var = reduce(tf.multiply, [k.Kdiag(self.X[:, i:i+1]) for i, k in enumerate(self.kerns)])

        # Projected variance Kfu Ki [WWT] Ki Kuf
        Ls = [tf.matrix_band_part(q_sqrt_d, -1, 0) for q_sqrt_d in self.q_sqrt_kron]
        tmp = [tf.matmul(tf.transpose(L), KiKuf_d) for L, KiKuf_d in zip(Ls, KiKuf)]
        var = var + reduce(tf.multiply, [tf.reduce_sum(tf.square(tmp_d), 0) for tmp_d in tmp])

        if self.use_two_krons:
            Ls = [tf.matrix_band_part(q_sqrt_d, -1, 0) for q_sqrt_d in self.q_sqrt_kron_2]
            tmp = [tf.matmul(tf.transpose(L), KiKuf_d) for L, KiKuf_d in zip(Ls, KiKuf)]
            var = var + reduce(tf.multiply, [tf.reduce_sum(tf.square(tmp_d), 0) for tmp_d in tmp])
        elif self.use_extra_ranks:
            for i in range(self.use_extra_ranks):
                tmp = kvs_dot_vec(KfuKi, self.q_sqrt_W[:, i:i+1])
                var = var + tf.reduce_sum(tf.square(tmp), 1)

        # Qff
        var = var - reduce(tf.multiply, [tf.reduce_sum(Kuf_d * KiKuf_d, 0) for Kuf_d, KiKuf_d in zip(Kuf, KiKuf)])

        return mu, tf.reshape(var, [-1, 1]) 
Example #22
Source File: vgp.py    From VFF with Apache License 2.0
def _build_predict(self, X, full_cov=False):
        # given self.q(v), compute q(f)

        Kuf = [make_Kuf(k, X[:, i:i+1], a, b, self.ms) for i, (k, a, b) in enumerate(zip(self.kerns, self.a, self.b))]
        Kuu = [make_Kuu(kern, a, b, self.ms) for kern, a, b, in zip(self.kerns, self.a, self.b)]
        KiKuf = [Kuu_d.solve(Kuf_d) for Kuu_d, Kuf_d in zip(Kuu, Kuf)]
        KfuKi = [tf.transpose(mat) for mat in KiKuf]

        mu = kvs_dot_vec(KfuKi, self.q_mu)

        if full_cov:
            raise NotImplementedError
        else:
            # Kff:
            var = reduce(tf.multiply, [k.Kdiag(X[:, i:i+1]) for i, k in enumerate(self.kerns)])

            # Projected variance Kfu Ki [WWT] Ki Kuf
            Ls = [tf.matrix_band_part(q_sqrt_d, -1, 0) for q_sqrt_d in self.q_sqrt_kron]
            tmp = [tf.matmul(tf.transpose(L), KiKuf_d) for L, KiKuf_d in zip(Ls, KiKuf)]
            var = var + reduce(tf.multiply, [tf.reduce_sum(tf.square(tmp_d), 0) for tmp_d in tmp])

            if self.use_two_krons:
                Ls = [tf.matrix_band_part(q_sqrt_d, -1, 0) for q_sqrt_d in self.q_sqrt_kron_2]
                tmp = [tf.matmul(tf.transpose(L), KiKuf_d) for L, KiKuf_d in zip(Ls, KiKuf)]
                var = var + reduce(tf.multiply, [tf.reduce_sum(tf.square(tmp_d), 0) for tmp_d in tmp])
            elif self.use_extra_ranks:
                for i in range(self.use_extra_ranks):
                    tmp = kvs_dot_vec(KfuKi, self.q_sqrt_W[:, i:i+1])
                    var = var + tf.reduce_sum(tf.square(tmp), 1)

            # Qff
            var = var - reduce(tf.multiply, [tf.reduce_sum(Kuf_d * KiKuf_d, 0) for Kuf_d, KiKuf_d in zip(Kuf, KiKuf)])

            var = tf.reshape(var, (-1, 1))

        return mu, var 
Example #23
Source File: common_attention.py    From NJUNMT-tf with Apache License 2.0
def attention_bias_lower_triangle(length):
    """ Create a bias tensor to be added to attention logits.

      Allows a query to attend to all positions up to and including its own.
    Args:
        length: A scalar.

    Returns: A float Tensor of shape [1, 1, length, length], with -1e9 in
      positions above the diagonal (future positions) and 0 elsewhere.

    """
    lower_triangle = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
    ret = FLOAT_MIN * (1. - lower_triangle)
    return tf.reshape(ret, [1, 1, length, length]) 
Example #24
Source File: modeling_gpt2.py    From Decoders-Chinese-TF2.0 with MIT License
def causal_attention_mask(nd, ns, dtype):
        """1's in the lower triangle, counting from the lower right corner.
        Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
        """
        i = tf.range(nd)[:, None]
        j = tf.range(ns)
        m = i >= j - ns + nd
        return tf.cast(m, dtype) 
Example #25
Source File: modeling_tf_gpt2.py    From exbert with Apache License 2.0
def causal_attention_mask(nd, ns, dtype):
        """1's in the lower triangle, counting from the lower right corner.
        Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
        """
        i = tf.range(nd)[:, None]
        j = tf.range(ns)
        m = i >= j - ns + nd
        return tf.cast(m, dtype) 
Example #26
Source File: modeling_tf_openai.py    From exbert with Apache License 2.0
def causal_attention_mask(nd, ns, dtype):
        """1's in the lower triangle, counting from the lower right corner.
        Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
        """
        i = tf.range(nd)[:, None]
        j = tf.range(ns)
        m = i >= j - ns + nd
        return tf.cast(m, dtype) 
Example #27
Source File: modeling_tf_xlnet.py    From exbert with Apache License 2.0
def create_mask(self, qlen, mlen, dtype=tf.float32):
        """
        Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.

        Args:
            qlen: length of the current query segment.
            mlen: length of the cached memory segment.

        ::

                  same_length=False:      same_length=True:
                  <mlen > <  qlen >       <mlen > <  qlen >
               ^ [0 0 0 0 0 1 1 1 1]     [0 0 0 0 0 1 1 1 1]
                 [0 0 0 0 0 0 1 1 1]     [1 0 0 0 0 0 1 1 1]
            qlen [0 0 0 0 0 0 0 1 1]     [1 1 0 0 0 0 0 1 1]
                 [0 0 0 0 0 0 0 0 1]     [1 1 1 0 0 0 0 0 1]
               v [0 0 0 0 0 0 0 0 0]     [1 1 1 1 0 0 0 0 0]

        """
        attn_mask = tf.ones([qlen, qlen], dtype=dtype)
        mask_u = tf.matrix_band_part(attn_mask, 0, -1)
        mask_dia = tf.matrix_band_part(attn_mask, 0, 0)
        attn_mask_pad = tf.zeros([qlen, mlen], dtype=dtype)
        ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
        if self.same_length:
            mask_l = tf.matrix_band_part(attn_mask, -1, 0)
            ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
        return ret 
Example #28
Source File: model.py    From QANet_dureader with MIT License
def _decode(self):

        N, PL, QL, CL, d, dc, nh = self._params()

        if self.config.use_position_attn:
            start_logits = tf.squeeze(
                conv(self._attention(tf.concat([self.enc[1], self.enc[2]], axis = -1), name="attn1"), 1, bias = False, name = "start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(self._attention(tf.concat([self.enc[1], self.enc[3]], axis = -1), name="attn2"), 1, bias = False, name = "end_pointer"), -1)
        else:
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis = -1), 1, bias = False, name = "start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis = -1), 1, bias = False, name = "end_pointer"), -1)

        self.logits = [mask_logits(start_logits, mask = tf.reshape(self.c_mask, [N, -1])),
                        mask_logits(end_logits, mask = tf.reshape(self.c_mask, [N, -1]))]

        self.logits1, self.logits2 = [l for l in self.logits]

        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(self.logits2), axis=1))

        outer = tf.matrix_band_part(outer, 0, self.max_a_len)
        self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) 
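Here the band part is applied to the [batch, length, length] outer product of start and end probabilities rather than to an attention mask: keeping only 0 <= end - start <= max_a_len zeroes every illegal span (end before start, or answer longer than max_a_len), and the argmax then selects the most probable legal span. A standalone sketch of that step (names are illustrative, not from the source):

probs_start = tf.nn.softmax(tf.random_normal([2, 6]))  # [batch, passage_len]
probs_end = tf.nn.softmax(tf.random_normal([2, 6]))
max_a_len = 3

outer = tf.matmul(tf.expand_dims(probs_start, 2), tf.expand_dims(probs_end, 1))
outer = tf.matrix_band_part(outer, 0, max_a_len)  # keep 0 <= end - start <= 3
start_idx = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
end_idx = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)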
Example #29
Source File: common_attention.py    From NMT_GAN with Apache License 2.0
def attention_bias_lower_triangle(length):
  """Create an bias tensor to be added to attention logits.

  Args:
   length: a Scalar.

  Returns:
    a `Tensor` with shape [1, 1, length, length].
  """
  lower_triangle = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
  ret = -1e9 * (1.0 - lower_triangle)
  return tf.reshape(ret, [1, 1, length, length]) 
Example #30
Source File: transformer_decoder.py    From bert-multitask-learning with MIT License
def get_decoder_self_attention_mask(self, length):
        """Calculate bias for decoder that maintains model's autoregressive property.
        Creates a tensor that masks out locations that correspond to illegal
        connections, so prediction at position i cannot draw information from future
        positions.
        Args:
            length: int length of sequences in batch.
        Returns:
            float tensor of shape [1, length, length]
        """
        with tf.name_scope("decoder_self_attention_mask"):
            valid_locs = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
            valid_locs = tf.reshape(valid_locs, [1, length, length])
        return valid_locs