Python tensorflow.einsum() Examples

The following are 30 code examples of tensorflow.einsum(), collected from open-source projects. The project and source file for each example are noted above it.
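Before the examples, here is a minimal orientation sketch of the einsum notation they all rely on. The shapes and names below are illustrative only (written in eager/TF 2.x style) and are not taken from any of the listed projects.

import tensorflow as tf

a = tf.random.normal([2, 3])
b = tf.random.normal([3, 4])
m = tf.random.normal([5, 2, 3])
v = tf.random.normal([2])

tf.einsum('ij,jk->ik', a, b)    # shared index j is contracted: matrix product, shape [2, 4]
tf.einsum('bij,jk->bik', m, b)  # leading batch index b is carried through: per-example product, [5, 2, 4]
tf.einsum('ij,i->ij', a, v)     # all indices survive on the right, so nothing is summed: row-wise scaling
tf.einsum('bij->bj', m)         # indices missing from the output (i) are summed out, shape [5, 3]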
Example #1
Source File: matmul.py    From spektral with MIT License
def mixed_mode_dot(a, b):
    """
    Computes the equivalent of `tf.einsum('ij,bjk->bik', a, b)`, but
    works for both dense and sparse inputs.
    :param a: Tensor or SparseTensor with rank 2.
    :param b: Tensor or SparseTensor with rank 3.
    :return: Tensor or SparseTensor with rank 3.
    """
    s_0_, s_1_, s_2_ = K.int_shape(b)
    B_T = ops.transpose(b, (1, 2, 0))
    B_T = ops.reshape(B_T, (s_1_, -1))
    output = dot(a, B_T)
    output = ops.reshape(output, (s_1_, s_2_, -1))
    output = ops.transpose(output, (2, 0, 1))

    return output 
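For dense inputs, the transpose/reshape path above should reproduce the einsum named in the docstring. A small sanity check under assumed shapes; note that `a` is taken to be a square N x N matrix, which is the graph setting this helper targets, and every name below is hypothetical.

import tensorflow as tf

a = tf.random.normal([4, 4])     # rank-2 operand, e.g. an N x N adjacency
b = tf.random.normal([8, 4, 5])  # rank-3 operand: batch x N x F

direct = tf.einsum('ij,bjk->bik', a, b)

# the same computation via the transpose/reshape trick used by mixed_mode_dot
b_t = tf.reshape(tf.transpose(b, (1, 2, 0)), (4, -1))  # N x (F * batch)
out = tf.reshape(tf.matmul(a, b_t), (4, 5, -1))        # N x F x batch
out = tf.transpose(out, (2, 0, 1))                     # batch x N x F
# tf.reduce_max(tf.abs(direct - out)) should be ~0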
Example #2
Source File: gaussian_process.py    From BERT with Apache License 2.0
def laplace_attention(q, k, v, scale, normalise):
  """Computes laplace exponential attention.

  Args:
    q: queries. Tensor of shape [batch_size, m, d_k].
    k: keys. Tensor of shape [batch_size, n, d_k].
    v: values. Tensor of shape [batch_size, n, d_v].
    scale: float that scales the L1 distance.
    normalise: Boolean that determines whether weights sum to 1.

  Returns:
    Tensor of shape [batch_size, m, d_v].
  """
  k = tf.expand_dims(k, axis=1)  # [batch_size, 1, n, d_k]
  q = tf.expand_dims(q, axis=2)  # [batch_size, m, 1, d_k]
  unnorm_weights = - tf.abs((k - q) / scale)  # [batch_size, m, n, d_k]
  unnorm_weights = tf.reduce_sum(unnorm_weights, axis=-1)  # [batch_size, m, n]
  if normalise:
    weight_fn = tf.nn.softmax
  else:
    weight_fn = lambda x: 1 + tf.tanh(x)
  weights = weight_fn(unnorm_weights)  # [batch_size, m, n]
  rep = tf.einsum('bik,bkj->bij', weights, v)  # [batch_size, m, d_v]
  return rep 
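The final einsum in this snippet is just a batched matrix product over the key axis n; a minimal sketch of the equivalence with toy shapes (names are hypothetical):

import tensorflow as tf

weights = tf.random.normal([2, 3, 4])  # [batch_size, m, n]
v = tf.random.normal([2, 4, 6])        # [batch_size, n, d_v]

rep_einsum = tf.einsum('bik,bkj->bij', weights, v)  # contract over n
rep_matmul = tf.matmul(weights, v)                  # same values, shape [2, 3, 6]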
Example #3
Source File: input_moe_model.py    From youtube-8m with Apache License 2.0
def create_model(self,
                   model_input,
                   vocab_size,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   sub_scope="",
                   original_input=None, 
                   **unused_params):

    num_methods = model_input.get_shape().as_list()[-1]
    num_features = model_input.get_shape().as_list()[-2]

    original_input = tf.nn.l2_normalize(original_input, dim=1)
    gate_activations = slim.fully_connected(
        original_input,
        num_methods,
        activation_fn=tf.nn.softmax,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="gates"+sub_scope)

    output = tf.einsum("ijk,ik->ij", model_input, gate_activations)
    return {"predictions": output} 
Example #4
Source File: losses.py    From youtube-8m with Apache License 2.0
def calculate_loss(self, predictions, labels, weights=None, **unused_params):
    with tf.name_scope("loss_xent"):
      epsilon = 10e-6
      if FLAGS.label_smoothing:
        float_labels = smoothing(labels)
      else:
        float_labels = tf.cast(labels, tf.float32)
      cross_entropy_loss = float_labels * tf.log(predictions + epsilon) + (
          1 - float_labels) * tf.log(1 - predictions + epsilon)
      cross_entropy_loss = tf.negative(cross_entropy_loss)
      if weights is not None:
        print(cross_entropy_loss, weights)
        weighted_loss = tf.einsum("ij,i->ij", cross_entropy_loss, weights)
        print("create weighted_loss", weighted_loss)
        return tf.reduce_mean(tf.reduce_sum(weighted_loss, 1))
      else:
        return tf.reduce_mean(tf.reduce_sum(cross_entropy_loss, 1)) 
Example #5
Source File: linear_regression_model.py    From youtube-8m with Apache License 2.0
def create_model(self, model_input, vocab_size, l2_penalty=1e-8, original_input=None, **unused_params):
    """Creates a linear regression model.

    Args:
      model_input: 'batch' x 'num_features' x 'num_methods' tensor of input features.
      vocab_size: The number of classes in the dataset.

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      batch_size x num_classes."""
    num_methods = model_input.get_shape().as_list()[-1]
    weight = tf.get_variable("ensemble_weight", 
        shape=[num_methods],
        regularizer=slim.l2_regularizer(l2_penalty))
    weight = tf.nn.softmax(weight)
    output = tf.einsum("ijk,k->ij", model_input, weight)
    return {"predictions": output} 
Example #6
Source File: losses.py    From youtube-8m with Apache License 2.0
def calculate_loss(self, predictions, labels, weights=None, **unused_params):
    with tf.name_scope("loss_xent"):
      epsilon = 10e-6
      if FLAGS.label_smoothing:
        float_labels = smoothing(labels)
      else:
        float_labels = tf.cast(labels, tf.float32)
      cross_entropy_loss = float_labels * tf.log(predictions + epsilon) + (
          1 - float_labels) * tf.log(1 - predictions + epsilon)
      cross_entropy_loss = tf.negative(cross_entropy_loss)
      if weights is not None:
        print(cross_entropy_loss, weights)
        weighted_loss = tf.einsum("ij,i->ij", cross_entropy_loss, weights)
        print("create weighted_loss", weighted_loss)
        return tf.reduce_mean(tf.reduce_sum(weighted_loss, 1))
      else:
        return tf.reduce_mean(tf.reduce_sum(cross_entropy_loss, 1)) 
Example #7
Source File: bnn_vi.py    From zhusuan with MIT License
def build_bnn(x, layer_sizes, n_particles):
    bn = zs.BayesianNet()
    h = tf.tile(x[None, ...], [n_particles, 1, 1])
    for i, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        w = bn.normal("w" + str(i), tf.zeros([n_out, n_in + 1]), std=1.,
                      group_ndims=2, n_samples=n_particles)
        h = tf.concat([h, tf.ones(tf.shape(h)[:-1])[..., None]], -1)
        h = tf.einsum("imk,ijk->ijm", w, h) / tf.sqrt(
            tf.cast(tf.shape(h)[2], tf.float32))
        if i < len(layer_sizes) - 2:
            h = tf.nn.relu(h)

    y_mean = bn.deterministic("y_mean", tf.squeeze(h, 2))
    y_logstd = tf.get_variable("y_logstd", shape=[],
                               initializer=tf.constant_initializer(0.))
    bn.normal("y", y_mean, logstd=y_logstd)
    return bn 
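The einsum "imk,ijk->ijm" inside the layer loop is a per-particle matrix product of the activations against the transposed weights; a minimal shape sketch under assumed sizes (names are hypothetical):

import tensorflow as tf

n_particles, batch, n_in, n_out = 10, 32, 5, 7
w = tf.random.normal([n_particles, n_out, n_in + 1])  # sampled weights, bias column included
h = tf.random.normal([n_particles, batch, n_in + 1])  # activations with a constant 1 appended

out_einsum = tf.einsum('imk,ijk->ijm', w, h)
out_matmul = tf.matmul(h, w, transpose_b=True)        # same result: [n_particles, batch, n_out]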
Example #8
Source File: bnn_sgmcmc.py    From zhusuan with MIT License
def build_bnn(x, layer_sizes, logstds, n_particles):
    bn = zs.BayesianNet()
    h = tf.tile(x[None, ...], [n_particles, 1, 1])
    for i, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        w = bn.normal("w" + str(i), tf.zeros([n_out, n_in + 1]),
                      logstd=logstds[i], group_ndims=2, n_samples=n_particles)
        h = tf.concat([h, tf.ones(tf.shape(h)[:-1])[..., None]], -1)
        h = tf.einsum("imk,ijk->ijm", w, h) / tf.sqrt(
            tf.cast(tf.shape(h)[2], tf.float32))
        if i < len(layer_sizes) - 2:
            h = tf.nn.relu(h)

    y_mean = bn.deterministic("y_mean", tf.squeeze(h, 2))
    y_logstd = -0.95
    bn.normal("y", y_mean, logstd=y_logstd)
    return bn 
Example #9
Source File: layers.py    From Pixel2MeshPlusPlus with BSD 3-Clause "New" or "Revised" License
def _call(self, inputs):
        x = inputs  # N, S, VF
        # dropout
        x = tf.nn.dropout(x, 1 - self.dropout)
        # convolve
        supports = list()
        for i in range(len(self.support)):
            pre_sup = tf.einsum('ijk,kl->ijl', x, self.vars['weights_' + str(i)])
            support = tf.einsum('ij,kjl->kil', self.support[i], pre_sup)
            supports.append(support)
        output = tf.add_n(supports)
        # bias
        if self.bias:
            output += self.vars['bias']

        return self.act(output) 
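The two einsums above split the graph convolution into a feature transform ('ijk,kl->ijl') followed by neighbourhood aggregation with a support matrix shared across the batch ('ij,kjl->kil'); a small sketch with assumed shapes:

import tensorflow as tf

x = tf.random.normal([4, 6, 16])  # batch x nodes x in_features
w = tf.random.normal([16, 8])     # in_features x out_features
adj = tf.random.normal([6, 6])    # nodes x nodes support, shared by every graph in the batch

pre_sup = tf.einsum('ijk,kl->ijl', x, w)      # same as tf.tensordot(x, w, axes=1)
out = tf.einsum('ij,kjl->kil', adj, pre_sup)  # aggregate node features per batch element: [4, 6, 8]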
Example #10
Source File: gaussian_process.py    From BERT with Apache License 2.0
def fit(self, x=None, y=None):
    # p(coeffs | x, y) = Normal(coeffs |
    #   mean = (1/noise_variance) (1/noise_variance x^T x + I)^{-1} x^T y,
    #   covariance = (1/noise_variance x^T x + I)^{-1})
    # TODO(trandustin): We newly fit the data at each call. Extend to do
    # Bayesian updating.
    kernel_matrix = tf.matmul(x, x, transpose_a=True) / self.noise_variance
    coeffs_precision = tf.matrix_set_diag(
        kernel_matrix, tf.matrix_diag_part(kernel_matrix) + 1.)
    coeffs_precision_tril = tf.linalg.cholesky(coeffs_precision)
    self.coeffs_precision_tril_op = tf.linalg.LinearOperatorLowerTriangular(
        coeffs_precision_tril)
    self.coeffs_mean = self.coeffs_precision_tril_op.solvevec(
        self.coeffs_precision_tril_op.solvevec(tf.einsum('nm,n->m', x, y)),
        adjoint=True) / self.noise_variance
    # TODO(trandustin): To be fully Keras-compatible, return History object.
    return 
Example #11
Source File: gaussian_process.py    From BERT with Apache License 2.0
def call(self, inputs):
    if self.coeffs_mean is None and self.coeffs_precision_tril_op is None:
      # p(mean(ynew) | xnew) = Normal(ynew | mean = 0, variance = xnew xnew^T)
      predictive_mean = 0.
      predictive_variance = tf.reduce_sum(tf.square(inputs), -1)
    else:
      # p(mean(ynew) | xnew, x, y) = Normal(ynew |
      #   mean = xnew (1/noise_variance) (1/noise_variance x^T x + I)^{-1}x^T y,
      #   variance = xnew (1/noise_variance x^T x + I)^{-1} xnew^T)
      predictive_mean = tf.einsum('nm,m->n', inputs, self.coeffs_mean)
      predictive_covariance = tf.matmul(
          inputs,
          self.coeffs_precision_tril_op.solve(
              self.coeffs_precision_tril_op.solve(inputs, adjoint_arg=True),
              adjoint=True))
      predictive_variance = tf.diag_part(predictive_covariance)
    return ed.Normal(loc=predictive_mean, scale=tf.sqrt(predictive_variance)) 
Example #12
Source File: graph_attention.py    From spektral with MIT License
def _call_dense(self, X, A):
        shape = tf.shape(A)[:-1]
        A = tf.linalg.set_diag(A, tf.zeros(shape, A.dtype))
        A = tf.linalg.set_diag(A, tf.ones(shape, A.dtype))
        X = tf.einsum("...NI , IHO -> ...NHO", X, self.kernel)
        attn_for_self = tf.einsum("...NHI , IHO -> ...NHO", X, self.attn_kernel_self)
        attn_for_neighs = tf.einsum("...NHI , IHO -> ...NHO", X, self.attn_kernel_neighs)
        attn_for_neighs = tf.einsum("...ABC -> ...CBA", attn_for_neighs)

        attn_coef = attn_for_self + attn_for_neighs
        attn_coef = tf.nn.leaky_relu(attn_coef, alpha=0.2)

        mask = -10e9 * (1.0 - A)
        attn_coef += mask[..., None, :]
        attn_coef = tf.nn.softmax(attn_coef, axis=-1)
        attn_coef_drop = self.dropout(attn_coef)

        output = tf.einsum("...NHM , ...MHI -> ...NHI", attn_coef_drop, X)

        return output, attn_coef 
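The ellipsis ("...") in these equations stands for any number of leading batch dimensions, so the same layer handles both single and batched graphs; a minimal sketch with assumed shapes:

import tensorflow as tf

x = tf.random.normal([2, 5, 16])       # "..." = a batch of 2 graphs, N=5 nodes, I=16 features
kernel = tf.random.normal([16, 4, 8])  # I x H (heads) x O (channels per head)

xp = tf.einsum('...NI,IHO->...NHO', x, kernel)  # [2, 5, 4, 8]; the batch axis rides along in "..."
flipped = tf.einsum('...ABC->...CBA', xp)       # reverse only the last three axes: [2, 8, 4, 5]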
Example #13
Source File: modeling.py    From XLnet-gen with MIT License
def post_attention(h, attn_vec, d_model, n_head, d_head, dropout, is_training,
                   kernel_initializer, residual=True):
  """Post-attention processing."""
  # post-attention projection (back to `d_model`)
  proj_o = tf.get_variable('o/kernel', [d_model, n_head, d_head],
                           dtype=h.dtype, initializer=kernel_initializer)
  attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, proj_o)

  attn_out = tf.layers.dropout(attn_out, dropout, training=is_training)
  if residual:
    output = tf.contrib.layers.layer_norm(attn_out + h, begin_norm_axis=-1,
                                          scope='LayerNorm')
  else:
    output = tf.contrib.layers.layer_norm(attn_out, begin_norm_axis=-1,
                                          scope='LayerNorm')

  return output 
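The projection einsum 'ibnd,hnd->ibh' folds the per-head outputs back into the model dimension; it is equivalent to flattening the head axes and contracting against a 2-D kernel, as in this sketch with assumed sizes (all names hypothetical):

import tensorflow as tf

qlen, bsz, n_head, d_head, d_model = 7, 2, 4, 8, 24
attn_vec = tf.random.normal([qlen, bsz, n_head, d_head])
proj_o = tf.random.normal([d_model, n_head, d_head])

out = tf.einsum('ibnd,hnd->ibh', attn_vec, proj_o)  # [qlen, bsz, d_model]

# equivalent: flatten the heads and contract against a [d_model, n_head * d_head] kernel
flat = tf.reshape(attn_vec, [qlen, bsz, n_head * d_head])
out2 = tf.tensordot(flat, tf.reshape(proj_o, [d_model, n_head * d_head]), axes=[[2], [1]])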
Example #14
Source File: bert_esim.py    From BERT with Apache License 2.0
def bert_layer_aggerate(encoding_lst, max_len, scope, reuse):
	with tf.variable_scope(scope, reuse=reuse):
		valid_tensor = tf.stack(encoding_lst, axis=1) # batch x num_layer x seq x dim
		attn = tf.get_variable(scope+"/layer_attention",
										dtype=tf.float32,
										shape=[len(encoding_lst),],
										initializer=tf.initializers.random_uniform(0,1))

		prob = tf.exp(tf.nn.log_softmax(attn))

		layer_repres = tf.einsum("abcd,b->acd", valid_tensor, prob)
		# layer_repres = encoding_lst[-1]
		# since input_target_a means b->a 
		# and input_target_b means a->b
		
		layer_repres = layer_repres[:,0:max_len,:]
		
		# print(" bert layer output shape w{}".format(layer_repres.get_shape()))
		return layer_repres 
Example #15
Source File: bert_esim_v1.py    From BERT with Apache License 2.0
def bert_layer_aggerate(encoding_lst, 
						scope, reuse):
	with tf.variable_scope(scope, reuse=reuse):
		valid_tensor = tf.stack(encoding_lst, axis=1) # batch x num_layer x seq x dim
		attn = tf.get_variable(scope+"/layer_attention",
										dtype=tf.float32,
										shape=[len(encoding_lst),],
										initializer=tf.initializers.random_uniform(-0.01,0.01))

		prob = tf.exp(tf.nn.log_softmax(attn))

		layer_repres = tf.einsum("abcd,b->acd", valid_tensor, prob)
		# since input_target_a means b->a 
		# and input_target_b means a->b
		
		# print(" bert layer output shape w{}".format(layer_repres.get_shape()))
		return layer_repres 
Example #16
Source File: textcnn.py    From BERT with Apache License 2.0
def build_output_logits(self, **kargs):
		input_tensor = self.sequence_output
		input_shape_list = bert_utils.get_shape_list(self.sequence_output, expected_rank=3)
		batch_size = input_shape_list[0]
		seq_length = input_shape_list[1]
		hidden_dims = input_shape_list[2]

		embedding_projection = kargs.get('embedding_projection', None)

		scope = kargs.get('scope', None)
		if scope:
			scope = scope + '/' + 'cls/predictions'
		else:
			scope = 'cls/predictions'

		tf.logging.info("**** mlm generator scope **** %s", str(scope))

		# with tf.variable_scope("cls/predictions", reuse=tf.AUTO_REUSE):
		with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):

			projection_width = self.config.emb_size

			with tf.variable_scope("transform"):
				input_tensor = tf.layers.dense(
						input_tensor,
						units=projection_width,
						activation=bert_modules.get_activation(self.config.hidden_act),
						kernel_initializer=bert_modules.create_initializer(
								self.config.initializer_range))

			output_bias = tf.get_variable(
					"output_bias",
					shape=[self.config.vocab_size],
					initializer=tf.zeros_initializer())
			# input_tensor: batch x seq x emb_size, emb_mat: vocab x emb_size -> logits: batch x seq x vocab
			logits = tf.einsum("abc,dc->abd", input_tensor, self.emb_mat)
			self.logits = tf.nn.bias_add(logits, output_bias) 
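The einsum "abc,dc->abd" scores every position against the tied embedding table, i.e. a matrix product with the transposed embedding matrix; a minimal sketch with assumed sizes (names are illustrative, not from the model):

import tensorflow as tf

hidden = tf.random.normal([2, 7, 128])    # batch x seq x emb_size
emb_mat = tf.random.normal([30000, 128])  # vocab_size x emb_size, the tied embedding table

logits = tf.einsum('abc,dc->abd', hidden, emb_mat)        # batch x seq x vocab_size
logits2 = tf.tensordot(hidden, emb_mat, axes=[[2], [1]])  # same values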
Example #17
Source File: PNN_TensorFlow.py    From Awesome-RecSystem-Models with MIT License
def call(self, feat_index, feat_value, use_dropout=True):
        # embedding part
        feat_embedding = self.feat_embeddings(feat_index)          # Batch * N * M

        # linear part
        lz = tf.einsum('bnm,dnm->bd', feat_embedding, self.linear_weights)  # Batch * D1

        # quadratic part
        if self.product_type == 'inner':
            theta = tf.einsum('bnm,dn->bdnm', feat_embedding, self.theta)   # Batch * D1 * N * M
            lp = tf.einsum('bdnm,bdnm->bd', theta, theta)
        else:
            embed_sum = tf.reduce_sum(feat_embedding, axis=1)
            p = tf.einsum('bm,bn->bmn', embed_sum, embed_sum)
            lp = tf.einsum('bmn,dmn->bd', p, self.quadratic_weights)  # Batch * D1

        y_deep = tf.concat((lz, lp), axis=1)
        if use_dropout:
            y_deep = tf.keras.layers.Dropout(self.dropout_deep[0])(y_deep)

        for i in range(len(self.deep_layer_sizes)):
            y_deep = getattr(self, 'dense_' + str(i))(y_deep)
            y_deep = getattr(self, 'batchNorm_' + str(i))(y_deep)
            y_deep = getattr(self, 'activation_' + str(i))(y_deep)
            if use_dropout:
                y_deep = getattr(self, 'dropout_' + str(i))(y_deep)

        output = self.fc(y_deep)
        return output 
Example #18
Source File: match_pyramid.py    From BERT with Apache License 2.0
def _encode(self, input_ids, input_char_ids,
				is_training, **kargs):

		reuse = kargs.get("reuse", None)

		with tf.variable_scope(self.config.scope+"_semantic_encode", reuse=reuse):

			emb_seq = self._embd_seq(input_ids, input_char_ids, is_training, reuse=reuse)

			if self.config.compress_emb:

				eW = tf.get_variable(self.scope+"_eW",
							 initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.2, dtype=tf.float32),
							 dtype=tf.float32,
							 shape=[emb_seq.shape[-1].value,
									self.config["embedding_dim_compressed"]])

				emb_seq = tf.einsum("abd,dc->abc", emb_seq, eW)

			input_dim = emb_seq.shape[-1].value
			input_mask = tf.cast(input_ids, tf.bool)
			input_len = tf.reduce_sum(tf.cast(input_mask, tf.int32), -1)

			enc_seq = encode(emb_seq, method=self.config["encode_method"],
								 input_dim=input_dim,
								 params=self.config,
								 sequence_length=input_len,
								 mask_zero=self.config["embedding_mask_zero"],
								 scope_name=self.scope + "enc_seq", 
								 reuse=reuse,
								 training=is_training)

		return emb_seq, enc_seq 
Example #19
Source File: match_pyramid.py    From BERT with Apache License 2.0
def _semantic_interaction(self, input_ids_a, input_char_ids_a, 
				input_ids_b, input_char_ids_b,
				emb_seq_a, enc_seq_a, emb_seq_b, enc_seq_b,
				is_training, **kargs):

		emb_match_matrix_dot_product = tf.einsum("abd,acd->abc", emb_seq_a, emb_seq_b)
		emb_match_matrix_dot_product = tf.expand_dims(emb_match_matrix_dot_product, axis=-1) # batch x seq_len_a x seq_len_b x 1

		match_matrix_identity = tf.expand_dims(tf.cast(
			tf.equal(
				tf.expand_dims(input_ids_a, 2),
				tf.expand_dims(input_ids_b, 1)
			), tf.float32), axis=-1) # batch x seq_len_a x seq_len_b x 1

		input_mask_a = tf.expand_dims(tf.cast(tf.cast(input_ids_a, tf.bool), tf.float32), axis=2) # batch x seq_len_a x 1
		input_mask_b = tf.expand_dims(tf.cast(tf.cast(input_ids_b, tf.bool), tf.float32), axis=1) # batch x 1 x seq_len_b

		match_matrix_identity *= tf.expand_dims(input_mask_a*input_mask_b, axis=-1)

		emb_match_matrix_element_product = tf.expand_dims(emb_seq_a, 2) * tf.expand_dims(
			emb_seq_b, 1)
		# emb_match_matrix_element_product *= tf.expand_dims(input_mask_a*input_mask_b, axis=-1)

		enc_match_matrix_dot_product = tf.expand_dims(
			tf.einsum("abd,acd->abc", enc_seq_a, enc_seq_b), axis=-1)
		# enc_match_matrix_dot_product *= tf.expand_dims(input_mask_a*input_mask_b, axis=-1)

		enc_match_matrix_element_product = tf.expand_dims(enc_seq_a, 2) * tf.expand_dims(
			enc_seq_b, 1)
		# enc_match_matrix_element_product *= tf.expand_dims(input_mask_a*input_mask_b, axis=-1)

		match_matrix = tf.concat([
			emb_match_matrix_dot_product,
			match_matrix_identity,
			emb_match_matrix_element_product,
			enc_match_matrix_dot_product,
			enc_match_matrix_element_product
		], axis=-1)

		return match_matrix 
Example #20
Source File: textcnn.py    From BERT with Apache License 2.0
def build_other_output_logits(self, sequence_output, **kargs):
		input_tensor = sequence_output
		input_shape_list = bert_utils.get_shape_list(sequence_output, expected_rank=3)
		batch_size = input_shape_list[0]
		seq_length = input_shape_list[1]
		hidden_dims = input_shape_list[2]

		embedding_projection = kargs.get('embedding_projection', None)

		scope = kargs.get('scope', None)
		if scope:
			scope = scope + '/' + 'cls/predictions'
		else:
			scope = 'cls/predictions'

		tf.logging.info("**** mlm generator scope **** %s", str(scope))

		# with tf.variable_scope("cls/predictions", reuse=tf.AUTO_REUSE):
		with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):

			projection_width = self.config.emb_size

			with tf.variable_scope("transform"):
				input_tensor = tf.layers.dense(
						input_tensor,
						units=projection_width,
						activation=bert_modules.get_activation(self.config.hidden_act),
						kernel_initializer=bert_modules.create_initializer(
								self.config.initializer_range))

			output_bias = tf.get_variable(
					"output_bias",
					shape=[self.config.vocab_size],
					initializer=tf.zeros_initializer())
			# input_tensor: batch x seq x emb_size, emb_mat: vocab x emb_size -> logits: batch x seq x vocab
			logits = tf.einsum("abc,dc->abd", input_tensor, self.emb_mat)
			logits = tf.nn.bias_add(logits, output_bias)
			return logits 
Example #21
Source File: attention_rectified_linear_model.py    From youtube-8m with Apache License 2.0
def create_model(self,
                   model_input,
                   vocab_size,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   sub_scope="",
                   original_input=None, 
                   **unused_params):

    num_methods = model_input.get_shape().as_list()[-1]
    num_features = model_input.get_shape().as_list()[-2]
    num_mixtures = FLAGS.moe_num_mixtures

    # gating coefficients
    original_input = tf.nn.l2_normalize(original_input, dim=1)
    mean_output = tf.reduce_mean(model_input, axis=2)
    ## batch_size x moe_num_mixtures
    gate_activations = slim.fully_connected(
        tf.concat([original_input, mean_output], axis=1),
        num_mixtures,
        activation_fn=tf.nn.softmax,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="gates"+sub_scope)

    # matrix
    weight_var = tf.get_variable("ensemble_weight",
        shape=[num_mixtures, num_methods],
        regularizer=slim.l2_regularizer(l2_penalty))

    # weight
    gated_weight = tf.einsum("ij,jk->ik", gate_activations, weight_var)
    rl_gated_weight = tf.nn.relu(gated_weight) + 1e-9
    sum_gated_weight = tf.reduce_sum(rl_gated_weight, axis=1, keep_dims=True)
    weight = rl_gated_weight / sum_gated_weight
    
    # weighted output
    output = tf.einsum("ik,ijk->ij", weight, model_input)
    return {"predictions": output} 
Example #22
Source File: attention_linear_model.py    From youtube-8m with Apache License 2.0
def create_model(self,
                   model_input,
                   vocab_size,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   sub_scope="",
                   original_input=None, 
                   **unused_params):

    num_methods = model_input.get_shape().as_list()[-1]
    num_features = model_input.get_shape().as_list()[-2]
    num_mixtures = FLAGS.moe_num_mixtures

    # gating coefficients
    original_input = tf.nn.l2_normalize(original_input, dim=1)
    mean_output = tf.reduce_mean(model_input, axis=2)
    ## batch_size x moe_num_mixtures
    gate_activations = slim.fully_connected(
        tf.concat([original_input, mean_output], axis=1),
        num_mixtures,
        activation_fn=tf.nn.softmax,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="gates"+sub_scope)

    # matrix
    weight_var = tf.get_variable("ensemble_weight",
        shape=[num_mixtures, num_methods],
        regularizer=slim.l2_regularizer(l2_penalty))

    # weight
    gated_weight = tf.einsum("ij,jk->ik", gate_activations, weight_var)
    weight = tf.nn.softmax(gated_weight)
    
    # weighted output
    output = tf.einsum("ik,ijk->ij", weight, model_input)
    return {"predictions": output} 
Example #23
Source File: attention_moe_model.py    From youtube-8m with Apache License 2.0
def create_model(self,
                   model_input,
                   vocab_size,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   sub_scope="",
                   original_input=None, 
                   **unused_params):

    num_relu = FLAGS.attention_relu_cells
    num_methods = model_input.get_shape().as_list()[-1]
    num_features = model_input.get_shape().as_list()[-2]

    original_input = tf.nn.l2_normalize(original_input, dim=1)
    model_input_list = tf.unstack(model_input, axis=2)
    
    relu_units = [self.relu(original_input, num_relu, sub_scope="input")]
    i = 0
    for mi in model_input_list:
      relu_units.append(self.relu(mi, num_relu, sub_scope="sub"+str(i)))
      i += 1

    gate_activations = slim.fully_connected(
        tf.concat(relu_units, axis=1),
        num_methods,
        activation_fn=None,
        biases_initializer=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="gate")
    gate = tf.nn.softmax(gate_activations)
    output = tf.einsum("ijk,ik->ij", model_input, gate)
    return {"predictions": output} 
Example #24
Source File: man_utils.py    From BERT with Apache License 2.0
def minus_attention(query, context, 
                query_mask, context_mask, dropout_ratio,
                scope, reuse=None):
    
    hidden_dim = query.get_shape()[-1]
    Wm = tf.get_variable("Wm", dtype=tf.float32,
                                    shape=[hidden_dim, hidden_dim],
                                    initializer=initializer)

    Vm = tf.get_variable("Vm", dtype=tf.float32,
                                    shape=[hidden_dim, 1],
                                    initializer=initializer)

    # batch x len_query x 1 x hidden_dim
    query_ = tf.expand_dims(query, 2)
    # batch x 1 x len_context x hidden_dim
    context_ = tf.expand_dims(context, 1)

    # batch x len_query x len_context x hidden_dim
    minus_attention = tf.abs(query_ - context_)

    minus_attention = tf.einsum("abcd,de->abce", minus_attention, Wm)
    minus_attention = tf.einsum("abce,ef->abcf", minus_attention, Vm)

    # batch x len_query x len_context
    S = tf.squeeze(minus_attention, -1)
    mask_q = tf.expand_dims(query_mask, 1) # batch x 1 x query_len
    mask_c = tf.expand_dims(context_mask, 1) # batch x 1 x context_len

    S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask = mask_c))
    c2q = tf.matmul(S_, context) 

    S_T = tf.nn.softmax(qanet_layers.mask_logits(tf.transpose(S, [0,2,1]), mask = mask_q))
    q2c = tf.matmul(S_T, query)

    return c2q, q2c 
Example #25
Source File: man_utils.py    From BERT with Apache License 2.0
def dot_attention(query, context,
                query_mask, context_mask, dropout_ratio,
                scope, reuse=None):

    hidden_dim = query.get_shape()[-1]
    Wd = tf.get_variable("Wd", dtype=tf.float32,
                                    shape=[hidden_dim, hidden_dim],
                                    initializer=initializer)

    Vd = tf.get_variable("Vd", dtype=tf.float32,
                                    shape=[hidden_dim, 1],
                                    initializer=initializer)

    # batch x len_query x 1 x hidden_dim
    query_ = tf.expand_dims(query, 2)
    # batch x 1 x len_context x hidden_dim
    context_ = tf.expand_dims(context, 1)

    # batch x len_query x len_context x hidden_dim
    dot_attention = query_ * context_
    dot_attention = tf.einsum("abcd,de->abce", dot_attention, Wd)
    dot_attention = tf.einsum("abce,ef->abcf", dot_attention, Vd)

    # batch x len_query x len_context
    S = tf.squeeze(dot_attention, -1)
    mask_q = tf.expand_dims(query_mask, 1) # batch x 1 x query_len
    mask_c = tf.expand_dims(context_mask, 1) # batch x 1 x context_len

    S_ = tf.nn.softmax(qanet_layers.mask_logits(S, mask = mask_c))
    c2q = tf.matmul(S_, context) 

    S_T = tf.nn.softmax(qanet_layers.mask_logits(tf.transpose(S, [0,2,1]), mask = mask_q))
    q2c = tf.matmul(S_T, query)

    return c2q, q2c 
Example #26
Source File: generator.py    From UROP-Adversarial-Feature-Matching-for-Text-Generation with GNU Affero General Public License v3.0
def lstm(self, prev_y, prev_h, prev_c, z):
		hs = self.hidden_size

		preact = tf.einsum('ijk,ka->ija', prev_h, self.h2h_W) + \
				 tf.einsum('ijk,ka->ija', prev_y, self.i2h_W) + \
				 tf.matmul(z, self.z2h_W) + \
				 self.b # preactivation
		# [1, batch_size, hidden_size * 4]
		i = tf.sigmoid(preact[:, :, 0*hs: 1*hs])
		f = tf.sigmoid(preact[:, :, 1*hs: 2*hs])
		o = tf.sigmoid(preact[:, :, 2*hs: 3*hs])
		c = tf.tanh(preact[:, :, 3*hs: 4*hs])
		c = f * prev_c + i * c # [1, batch_size, hidden_size] (element-wise multiply)
		h = o * tf.tanh(c) # [1, batch_size, hidden_size]
		y = tf.einsum('ijk,ka->ija', h, self.Vhid) + self.bhid # [1, batch_size, vocab_size]

		# Author doesn't mention this part in his paper, but it appears in his code
		# So I assume this is part of his soft-max approx. strategy ---|
		max_y = tf.reduce_max(y, axis=1, keep_dims=True) # [1, 1, vocab_size]
		e = tf.exp((y - max_y) * self.L)  # [1, batch_size, vocab_size]
		w = e / tf.reduce_sum(e, axis=1, keep_dims=True) # [1, batch_size, vocab_size]
		# Assumption ends here ----------------------------------------|

		y = tf.einsum('ijk,ka->ija', w, self.Wemb) # [1, batch_size, input_dim]
		
		return y, h, c 
Example #27
Source File: trf_bert_ebm_gpt.py    From BERT with Apache License 2.0
def ebm_logz_length_cond_loss(config, features, ebm_all_loss, valid_mask=None):
	"""
	we group by length and mean over loss by length
	and apply sgd to optimize logz's parameters just like center-loss for center updating
	"""
	input_mask = features['input_mask']
	shape = bert_utils.get_shape_list(input_mask)
	valid_seq_length = tf.cast(tf.reduce_sum(input_mask, axis=-1), tf.int32) # batch_size
	onehot_length_ids = tf.one_hot(valid_seq_length, config.max_position_embeddings)
	onehot_length_ids = tf.cast(onehot_length_ids, tf.float32)

	if_provided = 1
	if valid_mask is None:
		valid_mask = tf.ones(shape=[shape[0]])
		if_provided = 0
		tf.logging.info("====ones valid mask ====")
	if if_provided == 1:
		tf.logging.info("====provided valid mask ====")

	valid_mask = tf.expand_dims(tf.cast(valid_mask, tf.float32), axis=-1) # batch_size x 1

	length_accumulate_loss = tf.einsum("ab,a->ab", onehot_length_ids, ebm_all_loss)
	length_loss = tf.reduce_sum(length_accumulate_loss*valid_mask, axis=0)

	length_appear_time = tf.reduce_sum(onehot_length_ids*valid_mask, axis=0) + 1

	logz_length_attribute_loss = length_loss / length_appear_time # 1 x max_position_embeddings
	logz_length_loss = tf.reduce_sum(logz_length_attribute_loss)
	return logz_length_loss 
Example #28
Source File: classifier_adapter.py    From BERT with Apache License 2.0
def multi_choice_classifier(config, pooled_output, 
		num_labels, labels, dropout_prob):
	output_layer = pooled_output
	
	final_hidden_shape = bert_utils.get_shape_list(output_layer, 
								expected_rank=2)

	print(final_hidden_shape, "====multi-choice shape====")

	output_layer = tf.reshape(output_layer, 
								[-1,
								num_labels,
								final_hidden_shape[-1]]) # batch x num_choices x hidden_dim

	hidden_size = output_layer.shape[-1].value

	output_weights = tf.get_variable(
			"output_weights", [hidden_size],
			initializer=tf.truncated_normal_initializer(stddev=0.02))

	output_bias = tf.get_variable(
			"output_bias", [num_labels], initializer=tf.zeros_initializer())

	output_layer = tf.nn.dropout(output_layer, keep_prob=1 - dropout_prob)
	logits = tf.einsum("abc,c->ab", output_layer, output_weights)
	logits = tf.nn.bias_add(logits, output_bias) # batch x num_labels

	if config.get("loss_type", "entropy") == "focal_loss":
		per_example_loss = loss_utils.focal_loss_multi_v1(logits=logits, 
													labels=labels)
	else:
		per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
												logits=logits, 
												labels=tf.stop_gradient(labels))
	loss = tf.reduce_mean(per_example_loss)

	return (loss, per_example_loss, logits) 
Example #29
Source File: engineer_transformer.py    From youtube-8m with Apache License 2.0
def std(self, model_input_raw, num_frames, mask):
    mean_input = self.avg(model_input_raw, num_frames, mask)
    error = tf.einsum("ijk,ij->ijk", model_input_raw - mean_input, mask)
    return error 
Example #30
Source File: albert_modules_official.py    From BERT with Apache License 2.0
def dense_layer_3d_proj(input_tensor,
												hidden_size,
												head_size,
												initializer,
												activation,
												name=None):
	"""A dense layer with 3D kernel for projection.
	Args:
		input_tensor: float Tensor of shape [batch,from_seq_length,
			num_attention_heads, size_per_head].
		hidden_size: The size of hidden layer.
		head_size: The size of head.
		initializer: Kernel initializer.
		activation: Activation function.
		name: The name scope of this layer.
	Returns:
		float logits Tensor.
	"""
	input_shape = albert_utils_official.get_shape_list(input_tensor)
	num_attention_heads= input_shape[2]
	with tf.variable_scope(name):
		w = tf.get_variable(
				name="kernel",
				shape=[num_attention_heads * head_size, hidden_size],
				initializer=initializer)
		w = tf.reshape(w, [num_attention_heads, head_size, hidden_size])
		b = tf.get_variable(
				name="bias", shape=[hidden_size], initializer=tf.zeros_initializer)
		ret = tf.einsum("BFND,NDH->BFH", input_tensor, w)
		ret += b
	if activation is not None:
		return activation(ret)
	else:
		return ret
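The projection equation "BFND,NDH->BFH" contracts the head and per-head-depth axes jointly, which is the usual "merge heads, then apply a dense kernel" step written as one einsum; a sketch under assumed sizes:

import tensorflow as tf

batch, seq, n_head, head_size, hidden = 2, 5, 4, 8, 16
x = tf.random.normal([batch, seq, n_head, head_size])
w = tf.random.normal([n_head, head_size, hidden])

out = tf.einsum('BFND,NDH->BFH', x, w)  # [batch, seq, hidden]

# equivalent with flattened heads and a 2-D kernel
out2 = tf.tensordot(tf.reshape(x, [batch, seq, n_head * head_size]),
                    tf.reshape(w, [n_head * head_size, hidden]),
                    axes=[[2], [0]])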