Python tensorflow.compat.v1.zeros_initializer() Examples

The following are 30 code examples of tensorflow.compat.v1.zeros_initializer(), collected from open-source projects. Each example notes the project and source file it comes from, so you can trace it back to the original code. You may also want to look at the other available functions and classes of the tensorflow.compat.v1 module.
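Before the project snippets, here is a minimal, self-contained sketch (not taken from any of the projects below; the scope name, shapes, and variable names are illustrative) of the pattern most examples share: a weight matrix with a random initializer alongside a bias created with tf.zeros_initializer().

import tensorflow.compat.v1 as tf

tf.disable_eager_execution()  # the examples on this page assume TF1 graph mode

with tf.variable_scope("demo", reuse=tf.AUTO_REUSE):
  # Weights get a random initializer; the bias starts at zero, which is the
  # recurring pattern in the examples below.
  weights = tf.get_variable(
      "weights", shape=[128, 10],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  bias = tf.get_variable(
      "bias", shape=[10], initializer=tf.zeros_initializer())

x = tf.placeholder(tf.float32, shape=[None, 128])
logits = tf.nn.bias_add(tf.matmul(x, weights), bias)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(bias))  # ten zeros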
Example #1
Source File: neural_stack.py    From tensor2tensor with Apache License 2.0
def add_vector_projection(self, name, size):
    """A helper function for mapping embedding controller outputs.

    Args:
      name: A prefix for the variable names.
      size: The desired number of embedding outputs.

    Returns:
      A tuple of (weights, bias) where weights has shape
      [num_units, size * embedding_size] and bias has shape
      [size * embedding_size].
    """
    weights = self.add_variable(
        name + "_projection_weights",
        shape=[self._num_units, size * self._embedding_size],
        dtype=self.dtype)
    bias = self.add_variable(
        name + "_projection_bias",
        shape=[size * self._embedding_size],
        initializer=tf.zeros_initializer(dtype=self.dtype))
    return weights, bias 
Example #2
Source File: export_checkpoints.py    From albert with Apache License 2.0
def get_mlm_logits(input_tensor, albert_config, mlm_positions, output_weights):
  """From run_pretraining.py."""
  input_tensor = gather_indexes(input_tensor, mlm_positions)
  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=albert_config.embedding_size,
          activation=modeling.get_activation(albert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              albert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[albert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(
        input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
  return logits 
Example #3
Source File: transformer_glow_layers_ops.py    From tensor2tensor with Apache License 2.0
def dense_weightnorm(
    name, x, n_out, x_mask, init_scale, init, dtype=tf.float32):
  """Dense layer with weight normalization."""
  n_in = common_layers.shape_list(x)[2]
  eps = tf.keras.backend.epsilon()
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    v = tf.get_variable(
        "v", [n_in, n_out], dtype,
        initializer=tf.random_normal_initializer(0, 0.05), trainable=True)
    v = v / tf.norm(v, axis=0, keepdims=True)
    t = tf.matmul(x, v)  # [B, L, n_out]
    mean, var = moments_over_bl(t, x_mask)
    g_init = init_scale / (tf.sqrt(var) + eps)
    g = get_variable_ddi(
        "g", [n_out], g_init, init,
        initializer=tf.zeros_initializer, dtype=dtype, trainable=True)
    b = get_variable_ddi(
        "b", [n_out], -mean*g_init, init,
        initializer=tf.zeros_initializer, dtype=dtype, trainable=True)
    w = g * v
    y = tf.matmul(x, w) + b
    tf.summary.histogram("_g", g)
    return y 
Example #4
Source File: common_layers.py    From tensor2tensor with Apache License 2.0
def zero_add(previous_value, x, name=None, reuse=None):
  """Resnet connection with zero initialization.

  Another type of resnet connection which returns previous_value + gamma * x.
  gamma is a trainable scalar and initialized with zero. It is useful when a
  module is plugged into a trained model and we want to make sure it matches the
  original model's performance.

  Args:
    previous_value:  A tensor.
    x: A tensor.
    name: name of variable scope; defaults to zero_add.
    reuse: reuse scope.

  Returns:
    previous_value + gamma * x.
  """
  with tf.variable_scope(name, default_name="zero_add", reuse=reuse):
    gamma = tf.get_variable("gamma", (), initializer=tf.zeros_initializer())
    return previous_value + gamma * x 
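A hypothetical usage sketch of zero_add (not part of common_layers.py; the placeholder shape and the tf.layers.dense stand-in are assumptions): wrapping a newly added block so the combined model initially reproduces the pre-trained outputs.

# Hypothetical usage; hidden_states shape and the dense stand-in are assumed.
hidden_states = tf.placeholder(tf.float32, shape=[None, 32, 512])
new_branch = tf.layers.dense(hidden_states, 512, activation=tf.nn.relu)
# gamma is created at zero, so at initialization the sum equals hidden_states;
# gamma is then learned during fine-tuning.
hidden_states = zero_add(hidden_states, new_branch, name="adapter_residual")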
Example #5
Source File: export_checkpoints.py    From albert with Apache License 2.0
def get_sentence_order_logits(input_tensor, albert_config):
  """Get loss and log probs for the next sentence prediction."""

  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, albert_config.hidden_size],
        initializer=modeling.create_initializer(
            albert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    return logits 
Example #6
Source File: common_layers.py    From tensor2tensor with Apache License 2.0
def group_norm(x, filters=None, num_groups=8, epsilon=1e-5):
  """Group normalization as in https://arxiv.org/abs/1803.08494."""
  x_shape = shape_list(x)
  if filters is None:
    filters = x_shape[-1]
  assert len(x_shape) == 4
  assert filters % num_groups == 0
  # Prepare variables.
  scale = tf.get_variable(
      "group_norm_scale", [filters], initializer=tf.ones_initializer())
  bias = tf.get_variable(
      "group_norm_bias", [filters], initializer=tf.zeros_initializer())
  epsilon, scale, bias = [cast_like(t, x) for t in [epsilon, scale, bias]]
  # Reshape and compute group norm.
  x = tf.reshape(x, x_shape[:-1] + [num_groups, filters // num_groups])
  # Calculate mean and variance on heights, width, channels (not groups).
  mean, variance = tf.nn.moments(x, [1, 2, 4], keep_dims=True)
  norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
  return tf.reshape(norm_x, x_shape) * scale + bias 
Example #7
Source File: ops_test.py    From mesh with Apache License 2.0
def testVariableOperations(self):
    var = mtf.Variable(self.mesh,
                       "test_variable",
                       self.ab_shape,
                       mtf.VariableDType(tf.int32, tf.int32, tf.int32),
                       initializer=tf.zeros_initializer(),
                       trainable=True)

    self.assertEqual(var.splittable_dims, frozenset(["a", "b"]))
    self.assertEqual(var.unsplittable_dims, frozenset())

    read_variable = mtf.ReadVariable(var)
    self.assertEqual(read_variable.splittable_dims, frozenset(["a", "b"]))
    self.assertEqual(read_variable.unsplittable_dims, frozenset())

    assign = mtf.Assign([var], [self.x])
    self.assertEqual(assign.splittable_dims, frozenset(["a", "b"]))
    self.assertEqual(assign.unsplittable_dims, frozenset())

    depend = mtf.Depend(read_variable.outputs[0], [assign])
    self.assertEqual(depend.splittable_dims, frozenset(["a", "b"]))
    self.assertEqual(depend.unsplittable_dims, frozenset()) 
Example #8
Source File: transformer_glow_layers.py    From tensor2tensor with Apache License 2.0
def actnorm(name, x, x_mask, inverse, init, logscale_factor=3.0):
  """Activation normalization, returns logabsdet of shape [B]."""
  eps = tf.keras.backend.epsilon()
  n_channels = common_layers.shape_list(x)[2]

  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    x_mean, x_var = gops.moments_over_bl(x, x_mask)
    b = gops.get_variable_ddi(
        "b", (n_channels), -x_mean, init, tf.zeros_initializer)
    log_w_init = -0.5 * tf.log(x_var + eps) / logscale_factor
    log_w = gops.get_variable_ddi(
        "log_w", (n_channels), log_w_init, init,
        tf.zeros_initializer) * logscale_factor

    if not inverse:
      x = (x + b) * tf.exp(log_w)
    else:
      x = x * tf.exp(-log_w) - b

    x_length = tf.reduce_sum(x_mask, -1)
    logabsdet = x_length * tf.reduce_sum(log_w)
    if inverse:
      logabsdet *= -1
    return x, logabsdet 
Example #9
Source File: common_layers.py    From language with Apache License 2.0
def linear_transform(x, output_size, scope, bias=False, input_size=None):
  """Simple linear transform of x.

  Args:
    x: <float>[batch_size, length, input_size]
    output_size: Integer specifying output size.
    scope: String name for variable scope.
    bias: If True, adds a learned bias term.
    input_size: Explicitly specify input_size if not set as static shape.

  Returns:
    <float>[batch_size, length, output_size]
  """
  input_size = input_size or x.get_shape()[-1]
  with tf.variable_scope(scope):
    batch_size = tf.shape(x)[0]
    weights = tf.get_variable("weights", shape=(input_size, output_size))
    weights = tf.expand_dims(weights, 0)
    weights = tf.tile(weights, [batch_size, 1, 1])
    x = tf.matmul(x, weights)
    if bias:
      bias = tf.get_variable(
          "bias", shape=(output_size), initializer=tf.zeros_initializer())
      x += bias
    return x 
Example #10
Source File: common_layers.py    From language with Apache License 2.0
def apply_norm(x, epsilon=1e-6):
  """Applies layer normalization to x.

  Based on "Layer Normalization":
  https://arxiv.org/abs/1607.06450

  Args:
    x: <float>[..., input_size]
    epsilon: Used to avoid division by 0.

  Returns:
    <float>[..., input_size]
  """
  input_size = x.get_shape()[-1]
  with tf.variable_scope("layer_norm", values=[x]):
    scale = tf.get_variable(
        "layer_norm_scale", [input_size], initializer=tf.ones_initializer())
    bias = tf.get_variable(
        "layer_norm_bias", [input_size], initializer=tf.zeros_initializer())
    mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
    variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
    norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
    result = norm_x * scale + bias
    return result 
Example #11
Source File: glow_ops.py    From tensor2tensor with Apache License 2.0
def scale_gaussian_prior(name, z, logscale_factor=3.0, trainable=True):
  """Returns N(s^i * z^i, std^i) where s^i and std^i are pre-component.

  s^i is a learnable parameter with identity initialization.
  std^i is optionally learnable with identity initialization.

  Args:
    name: variable scope.
    z: input_tensor
    logscale_factor: equivalent to scaling up the learning_rate by a factor
                     of logscale_factor.
    trainable: Whether or not std^i is learnt.
  """
  with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
    z_shape = common_layers.shape_list(z)
    latent_multiplier = tf.get_variable(
        "latent_multiplier", shape=z_shape, dtype=tf.float32,
        initializer=tf.ones_initializer())
    log_scale = tf.get_variable(
        "log_scale_latent", shape=z_shape, dtype=tf.float32,
        initializer=tf.zeros_initializer(), trainable=trainable)
    log_scale = log_scale * logscale_factor
    return tfp.distributions.Normal(
        loc=latent_multiplier * z, scale=tf.exp(log_scale)) 
Example #12
Source File: neural_stack.py    From tensor2tensor with Apache License 2.0
def add_scalar_projection(self, name, size):
    """A helper function for mapping scalar controller outputs.

    Args:
      name: A prefix for the variable names.
      size: The desired number of scalar outputs.

    Returns:
      A tuple of (weights, bias) where weights has shape [num_units, size] and
      bias has shape [size].
    """
    weights = self.add_variable(
        name + "_projection_weights",
        shape=[self._num_units, size],
        dtype=self.dtype)
    bias = self.add_variable(
        name + "_projection_bias",
        shape=[size],
        initializer=tf.zeros_initializer(dtype=self.dtype))
    return weights, bias 
Example #13
Source File: export_to_tfhub.py    From albert with Apache License 2.0
def get_mlm_logits(model, albert_config, mlm_positions):
  """From run_pretraining.py."""
  input_tensor = gather_indexes(model.get_sequence_output(), mlm_positions)
  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=albert_config.embedding_size,
          activation=modeling.get_activation(albert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              albert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[albert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(
        input_tensor, model.get_embedding_table(), transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
  return logits 
Example #14
Source File: mnist_benchmark.py    From autograph with Apache License 2.0
def get_data_and_params():
  """Set up input dataset and variables."""
  (train_x, train_y), _ = tf.keras.datasets.mnist.load_data()
  tf.set_random_seed(0)
  hparams = contrib_training.HParams(
      batch_size=200,
      learning_rate=0.1,
      train_steps=101,
  )
  dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))
  dataset = dataset.repeat()
  dataset = dataset.shuffle(hparams.batch_size * 10)
  dataset = dataset.batch(hparams.batch_size)

  def reshape_ex(x, y):
    return (tf.to_float(tf.reshape(x, (-1, 28 * 28))) / 256.0,
            tf.one_hot(tf.squeeze(y), 10))

  dataset = dataset.map(reshape_ex)
  w = tf.get_variable('w0', (28 * 28, 10))
  b = tf.get_variable('b0', (10,), initializer=tf.zeros_initializer())
  opt = tf.train.GradientDescentOptimizer(hparams.learning_rate)
  return dataset, opt, hparams, w, b 
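A hypothetical training step built on the returned objects (not part of mnist_benchmark.py; the one-shot iterator and softmax cross-entropy loss are assumptions):

# Hypothetical training loop; the loss and iterator choices are assumptions.
dataset, opt, hparams, w, b = get_data_and_params()
images, labels = tf.data.make_one_shot_iterator(dataset).get_next()
logits = tf.matmul(images, w) + b          # b was zero-initialized above
loss = tf.losses.softmax_cross_entropy(onehot_labels=labels, logits=logits)
train_op = opt.minimize(loss)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(hparams.train_steps):
    sess.run(train_op)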
Example #15
Source File: slate_decomp_q_agent.py    From recsim with Apache License 2.0
def _build_select_slate_op(self):
    p_no_click = self._prob_no_click_ph
    p = self._doc_affinity_scores_ph
    q = self._net_outputs.q_values[0]
    with tf.name_scope('select_slate'):
      self._output_slate = self._select_slate_fn(self._slate_size, p_no_click,
                                                 p, q)

    self._output_slate = tf.Print(
        self._output_slate, [tf.constant('cp 1'), self._output_slate, p, q],
        summarize=10000)
    self._output_slate = tf.reshape(self._output_slate, (self._slate_size,))

    self._action_counts = tf.get_variable(
        'action_counts',
        shape=[self._num_candidates],
        initializer=tf.zeros_initializer())
    output_slate = tf.reshape(self._output_slate, [-1])
    output_one_hot = tf.one_hot(output_slate, self._num_candidates)
    update_ops = []
    for i in range(self._slate_size):
      update_ops.append(tf.assign_add(self._action_counts, output_one_hot[i]))
    self._select_action_update_op = tf.group(*update_ops) 
Example #16
Source File: bert_as_summarizer.py    From DeepPavlov with Apache License 2.0
def _init_graph(self):
        self._init_placeholders()

        self.bert = BertModel(config=self.bert_config,
                              is_training=self.is_train_ph,
                              input_ids=self.input_ids_ph,
                              input_mask=self.input_masks_ph,
                              token_type_ids=self.token_types_ph,
                              use_one_hot_embeddings=False,
                              )
        # next sentence prediction head
        with tf.variable_scope("cls/seq_relationship"):
            output_weights = tf.get_variable(
                "output_weights",
                shape=[2, self.bert_config.hidden_size],
                initializer=create_initializer(self.bert_config.initializer_range))
            output_bias = tf.get_variable(
                "output_bias", shape=[2], initializer=tf.zeros_initializer())

        nsp_logits = tf.matmul(self.bert.get_pooled_output(), output_weights, transpose_b=True)
        nsp_logits = tf.nn.bias_add(nsp_logits, output_bias)
        self.nsp_probs = tf.nn.softmax(nsp_logits, axis=-1) 
Example #17
Source File: tiled_linear.py    From lamb with Apache License 2.0
def _build_tiled_linear(self, inputs, input_name_and_sizes,
                          output_name_and_sizes, add_bias):
    results = []
    for output_name, output_size in output_name_and_sizes:
      r = 0.0
      for input_, (input_name, input_size) in zip(inputs, input_name_and_sizes):
        name = 'W_{}_{}'.format(input_name, output_name)
        weight = self._get_variable(
            name, shape=[output_size, input_size])
        r += tf.sparse_tensor_dense_matmul(weight, input_, adjoint_b=True)
      r = tf.transpose(r)
      if add_bias:
        # Biases are dense, hence we call _get_variable of the base
        # class.
        r += super(SparseTiledLinear, self)._get_variable(
            'B_{}'.format(output_name), shape=[output_size],
            default_initializer=tf.zeros_initializer())
      results.append(r)
    return results


# TODO(melisgl): Since computation is the same as in TiledLinear,
# perhaps this should be implemented as a custom getter (see
# tf.get_variable) instead of being tied to tiling. 
Example #18
Source File: run_pretraining.py    From albert with Apache License 2.0
def get_sentence_order_output(albert_config, input_tensor, labels):
  """Get loss and log probs for the next sentence prediction."""

  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, albert_config.hidden_size],
        initializer=modeling.create_initializer(
            albert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs) 
Example #19
Source File: utils.py    From lamb with Apache License 2.0
def layer_norm(x, reduction_indices, epsilon=1e-9, gain=None, bias=None,
               per_element=True, scope=None):
  """DOC."""
  reduction_indices = ensure_list(reduction_indices)
  mean = tf.reduce_mean(x, reduction_indices, keep_dims=True)
  variance = tf.reduce_mean(tf.squared_difference(x, mean),
                            reduction_indices, keep_dims=True)
  normalized = (x - mean) / tf.sqrt(variance + epsilon)
  dtype = x.dtype
  shape = x.get_shape().as_list()
  for i in six.moves.range(len(shape)):
    if i not in reduction_indices or not per_element:
      shape[i] = 1
  with tf.variable_scope(scope or 'layer_norm'):
    if gain is None:
      gain = tf.get_variable('gain', shape=shape, dtype=dtype,
                             initializer=tf.ones_initializer())
    if bias is None:
      bias = tf.get_variable('bias', shape=shape, dtype=dtype,
                             initializer=tf.zeros_initializer())
  return gain*normalized+bias 
Example #20
Source File: averaged.py    From lamb with Apache License 2.0
def __init__(self, tensors):
    tensors = list(tensors)
    with tf.variable_scope('averaged'):
      self._num_samples = tf.Variable(0, name='num_samples', trainable=False)
      with tf.variable_scope('avg'):
        self._averages = [
            tf.get_variable(
                tensor.name.replace('/', '-').replace(':', '-'),
                tensor.get_shape(), initializer=tf.zeros_initializer(),
                trainable=False)
            for tensor in tensors]
      with tf.variable_scope('save'):
        self._saves = [
            tf.get_variable(
                tensor.name.replace('/', '-').replace(':', '-'),
                tensor.get_shape(), initializer=tf.zeros_initializer(),
                trainable=False)
            for tensor in tensors]
    self._tensors = tensors
    self._take_sample = self._make_take_sample()
    self._switch = self._make_swith_to_average()
    self._restore = self._make_restore()
    self._reset = self._make_reset() 
Example #21
Source File: model_fns.py    From language with Apache License 2.0
def _get_bert_embeddings(model, layers_to_use, aggregation_fn, name="bert"):
  """Extract embeddings from BERT model."""
  all_hidden = model.get_all_encoder_layers()
  layers_hidden = [all_hidden[i] for i in layers_to_use]
  hidden_shapes = [
      modeling.get_shape_list(hid, expected_rank=3) for hid in all_hidden
  ]

  if len(layers_hidden) == 1:
    hidden_emb = layers_hidden[0]
    hidden_size = hidden_shapes[0][2]
  elif aggregation_fn == "concat":
    hidden_emb = tf.concat(layers_hidden, 2)
    hidden_size = sum([hidden_shapes[i][2] for i in layers_to_use])
  elif aggregation_fn == "average":
    hidden_size = hidden_shapes[0][2]
    assert all([shape[2] == hidden_size for shape in hidden_shapes
               ]), hidden_shapes
    hidden_emb = tf.add_n(layers_hidden) / len(layers_hidden)
  elif aggregation_fn == "attention":
    hidden_size = hidden_shapes[0][2]
    mixing_weights = tf.get_variable(
        name + "/mixing/weights", [len(layers_hidden)],
        initializer=tf.zeros_initializer())
    mixing_scores = tf.nn.softmax(mixing_weights)
    hidden_emb = tf.tensordot(
        tf.stack(layers_hidden, axis=-1), mixing_scores, [[-1], [0]])
  else:
    raise ValueError("Unrecognized aggregation function %s." % aggregation_fn)

  return hidden_emb, hidden_size 
Example #22
Source File: run_dualencoder_qa.py    From language with Apache License 2.0
def _get_bert_embeddings(model, layers_to_use, aggregation_fn, name="bert"):
  """Extract embeddings from BERT model."""
  all_hidden = model.get_all_encoder_layers()
  layers_hidden = [all_hidden[i] for i in layers_to_use]
  hidden_shapes = [
      modeling.get_shape_list(hid, expected_rank=3) for hid in all_hidden
  ]

  if len(layers_hidden) == 1:
    hidden_emb = layers_hidden[0]
    hidden_size = hidden_shapes[0][2]
  elif aggregation_fn == "concat":
    hidden_emb = tf.concat(layers_hidden, 2)
    hidden_size = sum([hidden_shapes[i][2] for i in layers_to_use])
  elif aggregation_fn == "average":
    hidden_size = hidden_shapes[0][2]
    assert all([shape[2] == hidden_size for shape in hidden_shapes
               ]), hidden_shapes
    hidden_emb = tf.add_n(layers_hidden) / len(layers_hidden)
  elif aggregation_fn == "attention":
    hidden_size = hidden_shapes[0][2]
    mixing_weights = tf.get_variable(
        name + "/mixing/weights", [len(layers_hidden)],
        initializer=tf.zeros_initializer())
    mixing_scores = tf.nn.softmax(mixing_weights)
    hidden_emb = tf.tensordot(
        tf.stack(layers_hidden, axis=-1), mixing_scores, [[-1], [0]])
  else:
    raise ValueError("Unrecognized aggregation function %s." % aggregation_fn)

  return hidden_emb, hidden_size 
Example #23
Source File: modeling.py    From training with Apache License 2.0
def dense_layer_3d(input_tensor,
                   num_attention_heads,
                   size_per_head,
                   initializer,
                   activation,
                   name=None):
  """A dense layer with 3D kernel.

  Args:
    input_tensor: float Tensor of shape [batch, seq_length, hidden_size].
    num_attention_heads: Number of attention heads.
    size_per_head: The size per attention head.
    initializer: Kernel initializer.
    activation: Activation function.
    name: The name scope of this layer.

  Returns:
    float logits Tensor.
  """

  last_dim = get_shape_list(input_tensor)[-1]

  with tf.variable_scope(name):
    w = tf.get_variable(
        name="kernel",
        shape=[last_dim, num_attention_heads * size_per_head],
        initializer=initializer)
    w = tf.reshape(w, [last_dim, num_attention_heads, size_per_head])
    b = tf.get_variable(
        name="bias",
        shape=[num_attention_heads * size_per_head],
        initializer=tf.zeros_initializer)
    b = tf.reshape(b, [num_attention_heads, size_per_head])
    ret = tf.einsum("abc,cde->abde", input_tensor, w)
    ret += b
    if activation is not None:
      return activation(ret)
    else:
      return ret 
Example #24
Source File: ops.py    From language with Apache License 2.0
def affine(x, output_size, weight_name, bias_name=None, weight_init=None):
  """Affine transformation of the input `x`.

  Args:
    x: <float32>[..., x_dim]
    output_size: size of the last output dimension
    weight_name: Name of the weight variable to use
    bias_name: Name for the bias variable, if one should be used
    weight_init: Initializer of the weight variable

  Returns:
    transformed <float32>[..., `output_size`]
  """
  dim = x.shape.as_list()[-1]
  w = tf.get_variable(
      weight_name, (dim, output_size), tf.float32, initializer=weight_init)
  out = tf.tensordot(x, w, [[len(x.shape) - 1], [0]])
  if bias_name:
    b = tf.get_variable(
        bias_name, (output_size,),
        tf.float32,
        initializer=tf.zeros_initializer())
    for _ in range(len(out.shape) - 1):
      b = tf.expand_dims(b, 0)
    out += b
  return out 
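A hypothetical call to affine (not part of ops.py; the placeholder shape, scope name, and Glorot initializer are assumptions):

# Hypothetical usage; shapes, names, and the weight initializer are illustrative.
x = tf.placeholder(tf.float32, shape=[None, 16, 64])
with tf.variable_scope("example_affine"):
  h = affine(x, output_size=128, weight_name="w", bias_name="b",
             weight_init=tf.glorot_uniform_initializer())
# h has shape [None, 16, 128]; the bias "b" was created with
# tf.zeros_initializer().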
Example #25
Source File: run_squad.py    From language with Apache License 2.0
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 use_one_hot_embeddings):
  """Creates a classification model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  final_hidden = model.get_sequence_output()

  final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
  batch_size = final_hidden_shape[0]
  seq_length = final_hidden_shape[1]
  hidden_size = final_hidden_shape[2]

  output_weights = tf.get_variable(
      "cls/squad/output_weights", [2, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "cls/squad/output_bias", [2], initializer=tf.zeros_initializer())

  final_hidden_matrix = tf.reshape(final_hidden,
                                   [batch_size * seq_length, hidden_size])
  logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True)
  logits = tf.nn.bias_add(logits, output_bias)

  logits = tf.reshape(logits, [batch_size, seq_length, 2])
  logits = tf.transpose(logits, [2, 0, 1])

  unstacked_logits = tf.unstack(logits, axis=0)

  (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])

  return (start_logits, end_logits) 
Example #26
Source File: bert.py    From mesh with Apache License 2.0
def get_masked_lm_output(self, positions, label_ids, label_weights):
    """Get loss and logits for the masked LM."""
    input_tensor = self.get_sequence_output()
    output_weights = self.get_embedding_table()

    # [batch_size, num_position, hidden]
    input_tensor = mtf.gather(input_tensor, positions, self.seq_dim)
    with tf.variable_scope("cls/predictions"):
      # We apply one more non-linear transformation before the output layer.
      # This matrix is not used after pre-training.
      with tf.variable_scope("transform"):
        input_tensor = mtf.layers.dense(
            input_tensor,
            reduced_dims=[self.model_dim],
            new_dims=[self.model_dim],
            activation=get_activation(self.config.feedforward_intermediate_act),
            kernel_initializer=self.dense_initializer,
            use_bias=self.config.use_bias)
        input_tensor = self.normalize(input_tensor)
      # The output weights are the same as the input embeddings, but there is
      # an output-only bias for each token.
      output_bias = mtf.get_variable(
          input_tensor.mesh,
          name="output_bias",
          shape=[self.vocab_dim],
          initializer=tf.zeros_initializer())
      logits = mtf.einsum([input_tensor, output_weights],
                          reduced_dims=[self.model_dim]) + output_bias
      per_example_loss = mtf.layers.softmax_cross_entropy_with_logits(
          logits, label_ids, self.vocab_dim, z_loss=1e-4)
      # The `positions` tensor might be zero-padded (if the sequence is too
      # short to have the maximum number of predictions). The `label_weights`
      # tensor has a value of 1.0 for every real prediction and 0.0 for the
      # padding predictions.
      numerator = mtf.reduce_sum(label_weights * per_example_loss)
      denominator = mtf.reduce_sum(label_weights) + mtf.constant(
          input_tensor.mesh, 1e-5, dtype=tf.float32)
      loss = numerator / denominator
    return (loss, per_example_loss, logits) 
Example #27
Source File: layers.py    From mesh with Apache License 2.0
def layer_norm(x, dim, epsilon=1e-6, name="layer_prepostprocess"):
  """Layer normalization over dimension dim.

  Args:
    x: a mtf.Tensor whose shape contains dim.
    dim: a mtf.Dimension
    epsilon: a floating point number
    name: a string used for tf.variable_scope.

  Returns:
    a mtf.Tensor with same shape as x.
  """
  with tf.variable_scope(name + "/layer_norm"):
    scale = mtf.get_variable(
        x.mesh,
        "layer_norm_scale",
        mtf.Shape([dim]),
        initializer=tf.ones_initializer(),
        activation_dtype=x.dtype)
    bias = mtf.get_variable(
        x.mesh,
        "layer_norm_bias",
        mtf.Shape([dim]),
        initializer=tf.zeros_initializer(),
        activation_dtype=x.dtype)
    reduced_shape = x.shape - dim
    mean = mtf.reduce_mean(x, output_shape=reduced_shape)
    variance = mtf.reduce_mean(mtf.square(x - mean), output_shape=reduced_shape)
    norm_x = (x - mean) * mtf.rsqrt(variance + epsilon)
    return norm_x * scale + bias 
Example #28
Source File: optimize.py    From mesh with Apache License 2.0
def apply_grad(self, grad, var):
    if grad is None:
      tf.logging.warning("Gradient is None for variable %s" % var.name)
      return []

    updates = []
    v = mtf.get_variable(
        var.mesh, var.name + "_momentum_v", var.shape,
        dtype=var.dtype, initializer=tf.zeros_initializer(), trainable=False)

    with tf.variable_scope(var.name + "/sgd_momentum"):
      updates.append(mtf.assign(v, grad * self.lr + v * self.momentum))
      updates.append(mtf.assign_sub(var, v))

    return updates 
Example #29
Source File: model.py    From interval-bound-propagation with Apache License 2.0
def _create_linear_initializer(input_size, output_size, dtype=tf.float32):  # pylint: disable=unused-argument
  """Returns a default initializer for the weights of a linear module."""
  return {
      'w': tf.orthogonal_initializer(),
      'b': tf.zeros_initializer(dtype=dtype),
  } 
Example #30
Source File: convnet_builder.py    From benchmarks with Apache License 2.0
def _batch_norm_without_layers(self, input_layer, decay, use_scale, epsilon):
    """Batch normalization on `input_layer` without tf.layers."""
    # We make this function as similar as possible to the
    # tf.contrib.layers.batch_norm, to minimize the differences between using
    # layers and not using layers.
    shape = input_layer.shape
    num_channels = shape[3] if self.data_format == 'NHWC' else shape[1]
    beta = self.get_variable('beta', [num_channels], tf.float32, tf.float32,
                             initializer=tf.zeros_initializer())
    if use_scale:
      gamma = self.get_variable('gamma', [num_channels], tf.float32,
                                tf.float32, initializer=tf.ones_initializer())
    else:
      gamma = tf.constant(1.0, tf.float32, [num_channels])
    # For moving variables, we use tf.get_variable instead of self.get_variable,
    # since self.get_variable returns the result of tf.cast which we cannot
    # assign to.
    moving_mean = tf.get_variable('moving_mean', [num_channels],
                                  tf.float32,
                                  initializer=tf.zeros_initializer(),
                                  trainable=False)
    moving_variance = tf.get_variable('moving_variance', [num_channels],
                                      tf.float32,
                                      initializer=tf.ones_initializer(),
                                      trainable=False)
    if self.phase_train:
      bn, batch_mean, batch_variance = tf.nn.fused_batch_norm(
          input_layer, gamma, beta, epsilon=epsilon,
          data_format=self.data_format, is_training=True)
      mean_update = moving_averages.assign_moving_average(
          moving_mean, batch_mean, decay=decay, zero_debias=False)
      variance_update = moving_averages.assign_moving_average(
          moving_variance, batch_variance, decay=decay, zero_debias=False)
      tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, mean_update)
      tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, variance_update)
    else:
      bn, _, _ = tf.nn.fused_batch_norm(
          input_layer, gamma, beta, mean=moving_mean,
          variance=moving_variance, epsilon=epsilon,
          data_format=self.data_format, is_training=False)
    return bn