Python tensorflow.python.ops.linalg_ops.norm() Examples

The following are 10 code examples of tensorflow.python.ops.linalg_ops.norm().
Example #1
Source File:    From s-vae-tf with MIT License 5 votes vote down vote up
def __init__(self, loc, scale, validate_args=False, allow_nan_stats=True, name="von-Mises-Fisher"):
    """Construct von-Mises-Fisher distributions with mean and concentration `loc` and `scale`.

    Args:
      loc: Floating point tensor; the mean of the distribution(s).
      scale: Floating point tensor; the concentration of the distribution(s).
        Must contain only non-negative values.
      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
        outputs.
      allow_nan_stats: Python `bool`, default `True`. When `True`,
        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
        indicate the result is undefined. When `False`, an exception is raised
        if one or more of the statistic's batch members are undefined.
      name: Python `str` name prefixed to Ops created by this class.

    Raises:
      TypeError: if `loc` and `scale` have different `dtype`.
    """
    # Must run before any other local is bound so that only the constructor
    # arguments (and `self`) are captured.
    parameters = locals()
    with ops.name_scope(name, values=[loc, scale]):
        # With validation enabled, the identity ops below are gated on
        # `scale` being positive and `loc` having unit norm along the last
        # axis (absolute tolerance 1e-7).
        with ops.control_dependencies([check_ops.assert_positive(scale),
                                       check_ops.assert_near(linalg_ops.norm(loc, axis=-1), 1, atol=1e-7)]
                                      if validate_args else []):
            self._loc = array_ops.identity(loc, name="loc")
            self._scale = array_ops.identity(scale, name="scale")
            check_ops.assert_same_float_dtype([self._loc, self._scale])

    # NOTE(review): only `graph_parents` survived in this listing; the other
    # keyword arguments follow the TF 1.x `Distribution` constructor and the
    # values available in this scope. `reparameterization_type` and the
    # import below are presumed from upstream -- confirm before relying on it.
    from tensorflow.python.ops.distributions import distribution

    super(VonMisesFisher, self).__init__(
        dtype=self._scale.dtype,
        reparameterization_type=distribution.FULLY_REPARAMETERIZED,
        validate_args=validate_args,
        allow_nan_stats=allow_nan_stats,
        parameters=parameters,
        graph_parents=[self._loc, self._scale],
        name=name)

    # Size of the last axis of `loc`, kept as int32 and in the distribution's
    # own dtype, plus a one-hot vector of that length with the 1 at index 0.
    self.__m = math_ops.cast(self._loc.shape[-1], dtypes.int32)
    self.__mf = math_ops.cast(self.__m, dtype=self.dtype)
    self.__e1 = array_ops.one_hot([0], self.__m, dtype=self.dtype)
Example #2
Source File:    From s-vae-tf with MIT License 5 votes vote down vote up
def _log_unnormalized_prob(self, x):
    """Return `scale * sum(loc * x, axis=-1)` with the reduced axis dropped.

    When `self.validate_args` is truthy, the computation is gated on an
    assertion that `x` has norm 1 along its last axis (absolute tolerance
    1e-3).
    """
    if self.validate_args:
        unit_norm_checks = [check_ops.assert_near(linalg_ops.norm(x, axis=-1), 1, atol=1e-3)]
    else:
        unit_norm_checks = []

    with ops.control_dependencies(unit_norm_checks):
        concentration = self.scale
        projection = math_ops.reduce_sum(self._loc * x, axis=-1, keepdims=True)
        output = concentration * projection

    # The reduction kept its last axis (keepdims=True); reshape to the shape
    # without that trailing axis.
    leading_shape = ops.convert_to_tensor(array_ops.shape(output)[:-1])
    return array_ops.reshape(output, leading_shape)
Example #3
Source File:    From nlp-architect with Apache License 2.0 5 votes vote down vote up
def _compute_weights(self):
    """Generate weights by combining the direction of the weight vector with its norm.

    Sets `self.layer.kernel` to `self.layer.v`, L2-normalized along
    `self.norm_axes`, multiplied by `self.layer.g`.
    """
    with variable_scope.variable_scope("compute_weights"):
        self.layer.kernel = (
            nn_impl.l2_normalize(self.layer.v, axis=self.norm_axes) * self.layer.g
        )
Example #4
Source File:    From nlp-architect with Apache License 2.0 5 votes vote down vote up
def _init_norm(self, weights):
    """Return the per-column norm of `weights` as a `(layer_depth,)` tensor.

    `weights` is flattened to a matrix with `self.layer_depth` columns, and
    `norm` is taken over axis 0 using its default `ord`.
    """
    from tensorflow.python.ops.linalg_ops import norm

    with variable_scope.variable_scope("init_norm"):
        # pylint: disable=no-member
        as_matrix = array_ops.reshape(weights, [-1, self.layer_depth])
        column_norms = norm(as_matrix, axis=0)
        # pylint: disable=no-member
        target_shape = (self.layer_depth,)
        return array_ops.reshape(column_norms, target_shape)
Example #5
Source File:    From lingvo with Apache License 2.0 4 votes vote down vote up
def _apply_dense(self, grad, var):
    """Apply one dense update to `var` and its optimizer slots.

    Updates, in order: the step `counter`, the scalar `lr_scale`, the
    per-element `gain`, then `momentum` and the gradient average `gbar`, and
    finally subtracts `lr_scale * momentum` from `var`.

    Args:
      grad: gradient tensor for `var`.
      var: the variable being optimized; slots are looked up by it.

    Returns:
      The result of `state_ops.assign_sub` on `var`.
    """
    lr_scale = self.get_slot(var, "lr_scale")
    momentum = self.get_slot(var, "momentum")
    gbar = self.get_slot(var, "gbar")
    gain = self.get_slot(var, "gain")
    counter = self.get_slot(var, "counter")
    counter_updated = state_ops.assign(counter, counter + 1)

    # lr_scale update uses normalized grad and momentum to be independent of dim
    # (the 1e-10 keeps the division finite when a norm is zero).
    normalized_grad = grad / (linalg_ops.norm(grad) + 1e-10)
    normalized_momentum = momentum / (linalg_ops.norm(momentum) + 1e-10)
    # Apply EG updates on lr_scale:
    # grad_lr_scale = -inner_product(current_grad, old_momentum)
    # lr_scale <- lr_scale * exp(-scale_learning_rate * grad_lr_scale)
    lr_scale_unnormalized_updated = clip_ops.clip_by_value(
        lr_scale * math_ops.exp(
            self._scale_learning_rate * math_ops.reduce_sum(grad * momentum)),
        self._min_scale, self._max_scale)
    lr_scale_normalized_updated = clip_ops.clip_by_value(
        lr_scale * math_ops.exp(self._scale_learning_rate * math_ops.reduce_sum(
            normalized_grad * normalized_momentum)), self._min_scale,
        self._max_scale)
    # `_use_directions` selects the normalized variant; the chosen value is
    # written back into the `lr_scale` slot.
    lr_scale_updated = state_ops.assign(
        lr_scale,
        array_ops.where(self._use_directions, lr_scale_normalized_updated,
                        lr_scale_unnormalized_updated))
    # remove the bias of zero initialization in gbar
    corrected_gbar = gbar / (
        1.0 - self._beta**math_ops.maximum(counter_updated - 1, 1))
    # Apply EG updates on gain:
    # grad_gain = - current_grad * old_gbar
    # gain <- gain * exp(-gain_learning_rate * grad_gain)
    gain_unnormalized_updated = clip_ops.clip_by_value(
        gain * math_ops.exp(self._gain_learning_rate * grad * corrected_gbar),
        self._min_gain, self._max_gain)
    # Normalized update uses sign(grad) * sign(gbar) as a proxy for grad_gain.
    gain_normalized_updated = clip_ops.clip_by_value(
        gain * math_ops.exp(self._gain_learning_rate * math_ops.sign(grad) *
                            math_ops.sign(gbar)), self._min_gain,
        self._max_gain)
    # `_use_signs` selects the sign-based variant; the chosen value is written
    # back into the `gain` slot.
    gain_updated = state_ops.assign(
        gain,
        array_ops.where(self._use_signs, gain_normalized_updated,
                        gain_unnormalized_updated))
    scaled_g = self._learning_rate_tensor * gain_updated * grad
    # momentum and gbar are overwritten only after lr_scale and the scaled
    # gradient, which are computed from their old values, are ready.
    with ops.control_dependencies([lr_scale_updated, scaled_g]):
        momentum_updated = state_ops.assign(
            momentum, self._momentum_tensor * momentum + scaled_g)
        gbar_updated = state_ops.assign(
            gbar, self._beta * gbar + (1.0 - self._beta) * grad)
    with ops.control_dependencies([gbar_updated]):
        return state_ops.assign_sub(var, lr_scale_updated * momentum_updated)
Example #6
Source File:    From albert with Apache License 2.0 4 votes vote down vote up
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """See base class.

    For every `(grad, param)` pair with both entries present, computes an
    Adam-style update, optionally adds decoupled weight decay, optionally
    rescales it by the layer-wise ratio `||param|| / ||update||`, and collects
    the assignments for the parameter and its two moment variables.

    Args:
      grads_and_vars: iterable of `(gradient, variable)` pairs.
      global_step: accepted for interface compatibility; not used here.
      name: optional name for the returned grouped op.

    Returns:
      A single op grouping all collected assignments.
    """
    assignments = []
    for (grad, param) in grads_and_vars:
        # Pairs with a missing gradient or variable contribute no update.
        if grad is None or param is None:
            continue

        param_name = self._get_variable_name(param.name)

        # NOTE(review): only the `name=` argument of the two moment variables
        # survived in this listing; shape/dtype/trainable/initializer follow
        # the usual zero-initialized, non-trainable Adam slots -- confirm
        # against upstream.
        m = tf.get_variable(
            name=six.ensure_str(param_name) + "/adam_m",
            shape=param.shape.as_list(),
            dtype=tf.float32,
            trainable=False,
            initializer=tf.zeros_initializer())
        v = tf.get_variable(
            name=six.ensure_str(param_name) + "/adam_v",
            shape=param.shape.as_list(),
            dtype=tf.float32,
            trainable=False,
            initializer=tf.zeros_initializer())

        # Standard Adam update.
        next_m = (
            tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
        next_v = (
            tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
                                                      tf.square(grad)))

        update = next_m / (tf.sqrt(next_v) + self.epsilon)

        # Just adding the square of the weights to the loss function is *not*
        # the correct way of using L2 regularization/weight decay with Adam,
        # since that will interact with the m and v parameters in strange ways.
        # Instead we want to decay the weights in a manner that doesn't interact
        # with the m/v parameters. This is equivalent to adding the square
        # of the weights to the loss with plain (non-momentum) SGD.
        if self._do_use_weight_decay(param_name):
            update += self.weight_decay_rate * param

        # Layer-wise ratio: falls back to 1.0 whenever either norm is zero, so
        # the division can never select an inf/NaN value.
        ratio = 1.0
        if self._do_layer_adaptation(param_name):
            w_norm = linalg_ops.norm(param, ord=2)
            g_norm = linalg_ops.norm(update, ord=2)
            ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where(
                math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0)

        update_with_lr = ratio * self.learning_rate * update

        next_param = param - update_with_lr

        assignments.extend(
            [param.assign(next_param),
             m.assign(next_m),
             v.assign(next_v)])
    return tf.group(*assignments, name=name)
Example #7
Source File:    From training with Apache License 2.0 4 votes vote down vote up
def _resource_apply_dense(self, grad, var):
    """Apply one dense update to `var`.

    Updates the `m` and `v` slots with exponential moving averages of the
    gradient and squared gradient, bias-corrects both, optionally adds weight
    decay, optionally rescales the step by `||var|| / ||update||`, and assigns
    the new value to `var`.

    Args:
      grad: dense gradient tensor for `var`.
      var: the variable being optimized.

    Returns:
      The op of the final assignment to `var`.
    """
    beta1_power, beta2_power = self._get_beta_accumulators()
    # Cast every hyperparameter to the variable's base dtype so the arithmetic
    # below does not mix dtypes.
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    weight_decay_rate_t = math_ops.cast(self._weight_decay_rate_t,
                                        var.dtype.base_dtype)
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad * (1 - beta1_t)
    m_t = m * beta1_t + m_scaled_g_values
    m_t = state_ops.assign(m, m_t, use_locking=self._use_locking)
    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
    v_t = v * beta2_t + v_scaled_g_values
    v_t = state_ops.assign(v, v_t, use_locking=self._use_locking)

    # ==== The following is with m_t_hat and v_t_hat
    m_t_hat = m_t / (1. - beta1_power)
    v_t_hat = v_t / (1. - beta2_power)

    v_sqrt = math_ops.sqrt(v_t_hat)
    update = m_t_hat / (v_sqrt + epsilon_t)

    # ==== The following is the original LAMBOptimizer implementation
    # v_sqrt = math_ops.sqrt(v_t_hat)
    # update = m_t / (v_sqrt + epsilon_t)

    var_name = self._get_variable_name(var.name)
    if self._do_use_weight_decay(var_name):
        update += weight_decay_rate_t * var

    # Ratio of norms: falls back to 1.0 whenever either norm is zero, so the
    # division can never select an inf/NaN value.
    ratio = 1.0
    if self._do_layer_adaptation(var_name):
        w_norm = linalg_ops.norm(var, ord=2)
        g_norm = linalg_ops.norm(update, ord=2)
        ratio = array_ops.where(
            math_ops.greater(w_norm, 0),
            array_ops.where(math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0),
            1.0)

    var_update = var - ratio * lr_t * update
    return state_ops.assign(var, var_update, use_locking=self._use_locking).op
Example #8
Source File:    From training with Apache License 2.0 4 votes vote down vote up
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    """Apply one sparse update to `var`.

    Decays the whole `m` and `v` slots, scatter-adds the scaled gradient
    terms at `indices`, bias-corrects both moments, optionally adds weight
    decay, optionally rescales the step by `||var|| / ||update||`, and
    subtracts the result from `var`.

    Args:
      grad: gradient values for the rows selected by `indices`.
      var: the variable being optimized.
      indices: indices into the slots that `grad` applies to.
      scatter_add: callable `(slot, indices, values)` performing the sparse add.

    Returns:
      An op grouping the variable update and both moment updates.
    """
    # NOTE(review): the call target of the final `return` was lost in this
    # listing; `control_flow_ops.group` matches the surviving `*[...]`
    # argument form -- confirm against upstream.
    from tensorflow.python.ops import control_flow_ops

    beta1_power, beta2_power = self._get_beta_accumulators()
    # Cast every hyperparameter to the variable's base dtype so the arithmetic
    # below does not mix dtypes.
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    weight_decay_rate_t = math_ops.cast(self._weight_decay_rate_t,
                                        var.dtype.base_dtype)
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad * (1 - beta1_t)
    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
    # The scatter-add must observe the decayed slot, hence the dependency.
    with ops.control_dependencies([m_t]):
        m_t = scatter_add(m, indices, m_scaled_g_values)
    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
        v_t = scatter_add(v, indices, v_scaled_g_values)

    # ==== The following is with m_t_hat and v_t_hat
    m_t_hat = m_t / (1. - beta1_power)
    v_t_hat = v_t / (1. - beta2_power)

    v_sqrt = math_ops.sqrt(v_t_hat)
    update = m_t_hat / (v_sqrt + epsilon_t)

    # ==== The following is the original LAMBOptimizer implementation
    # v_sqrt = math_ops.sqrt(v_t_hat)
    # update = m_t / (v_sqrt + epsilon_t)

    var_name = self._get_variable_name(var.name)
    if self._do_use_weight_decay(var_name):
        update += weight_decay_rate_t * var

    # Ratio of norms: falls back to 1.0 whenever either norm is zero, so the
    # division can never select an inf/NaN value.
    ratio = 1.0
    if self._do_layer_adaptation(var_name):
        w_norm = linalg_ops.norm(var, ord=2)
        g_norm = linalg_ops.norm(update, ord=2)
        ratio = array_ops.where(
            math_ops.greater(w_norm, 0),
            array_ops.where(math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0),
            1.0)
    var_update = state_ops.assign_sub(
        var, ratio * lr_t * update, use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #9
Source File:    From training with Apache License 2.0 4 votes vote down vote up
def _resource_apply_dense(self, grad, var):
    """Apply one dense update to `var`.

    Updates the `m` and `v` slots with exponential moving averages of the
    gradient and squared gradient, bias-corrects both, optionally adds weight
    decay, optionally rescales the step by `||var|| / ||update||`, and assigns
    the new value to `var`.

    Args:
      grad: dense gradient tensor for `var`.
      var: the variable being optimized.

    Returns:
      The op of the final assignment to `var`.
    """
    beta1_power, beta2_power = self._get_beta_accumulators()
    # Cast every hyperparameter to the variable's base dtype so the arithmetic
    # below does not mix dtypes.
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    weight_decay_rate_t = math_ops.cast(self._weight_decay_rate_t,
                                        var.dtype.base_dtype)
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad * (1 - beta1_t)
    m_t = m * beta1_t + m_scaled_g_values
    m_t = state_ops.assign(m, m_t, use_locking=self._use_locking)
    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
    v_t = v * beta2_t + v_scaled_g_values
    v_t = state_ops.assign(v, v_t, use_locking=self._use_locking)

    # ==== The following is with m_t_hat and v_t_hat
    m_t_hat = m_t / (1. - beta1_power)
    v_t_hat = v_t / (1. - beta2_power)

    v_sqrt = math_ops.sqrt(v_t_hat)
    update = m_t_hat / (v_sqrt + epsilon_t)

    # ==== The following is the original LAMBOptimizer implementation
    # v_sqrt = math_ops.sqrt(v_t_hat)
    # update = m_t / (v_sqrt + epsilon_t)

    var_name = self._get_variable_name(var.name)
    if self._do_use_weight_decay(var_name):
        update += weight_decay_rate_t * var

    # Ratio of norms: falls back to 1.0 whenever either norm is zero, so the
    # division can never select an inf/NaN value.
    ratio = 1.0
    if self._do_layer_adaptation(var_name):
        w_norm = linalg_ops.norm(var, ord=2)
        g_norm = linalg_ops.norm(update, ord=2)
        ratio = array_ops.where(
            math_ops.greater(w_norm, 0),
            array_ops.where(math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0),
            1.0)

    var_update = var - ratio * lr_t * update
    return state_ops.assign(var, var_update, use_locking=self._use_locking).op
Example #10
Source File:    From training with Apache License 2.0 4 votes vote down vote up
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    """Apply one sparse update to `var`.

    Decays the whole `m` and `v` slots, scatter-adds the scaled gradient
    terms at `indices`, bias-corrects both moments, optionally adds weight
    decay, optionally rescales the step by `||var|| / ||update||`, and
    subtracts the result from `var`.

    Args:
      grad: gradient values for the rows selected by `indices`.
      var: the variable being optimized.
      indices: indices into the slots that `grad` applies to.
      scatter_add: callable `(slot, indices, values)` performing the sparse add.

    Returns:
      An op grouping the variable update and both moment updates.
    """
    # NOTE(review): the call target of the final `return` was lost in this
    # listing; `control_flow_ops.group` matches the surviving `*[...]`
    # argument form -- confirm against upstream.
    from tensorflow.python.ops import control_flow_ops

    beta1_power, beta2_power = self._get_beta_accumulators()
    # Cast every hyperparameter to the variable's base dtype so the arithmetic
    # below does not mix dtypes.
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    weight_decay_rate_t = math_ops.cast(self._weight_decay_rate_t,
                                        var.dtype.base_dtype)
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad * (1 - beta1_t)
    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
    # The scatter-add must observe the decayed slot, hence the dependency.
    with ops.control_dependencies([m_t]):
        m_t = scatter_add(m, indices, m_scaled_g_values)
    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
        v_t = scatter_add(v, indices, v_scaled_g_values)

    # ==== The following is with m_t_hat and v_t_hat
    m_t_hat = m_t / (1. - beta1_power)
    v_t_hat = v_t / (1. - beta2_power)

    v_sqrt = math_ops.sqrt(v_t_hat)
    update = m_t_hat / (v_sqrt + epsilon_t)

    # ==== The following is the original LAMBOptimizer implementation
    # v_sqrt = math_ops.sqrt(v_t_hat)
    # update = m_t / (v_sqrt + epsilon_t)

    var_name = self._get_variable_name(var.name)
    if self._do_use_weight_decay(var_name):
        update += weight_decay_rate_t * var

    # Ratio of norms: falls back to 1.0 whenever either norm is zero, so the
    # division can never select an inf/NaN value.
    ratio = 1.0
    if self._do_layer_adaptation(var_name):
        w_norm = linalg_ops.norm(var, ord=2)
        g_norm = linalg_ops.norm(update, ord=2)
        ratio = array_ops.where(
            math_ops.greater(w_norm, 0),
            array_ops.where(math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0),
            1.0)
    var_update = state_ops.assign_sub(
        var, ratio * lr_t * update, use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])