Python tensorflow.python.ops.linalg_ops.norm() Examples

The following are 10 code examples of tensorflow.python.ops.linalg_ops.norm(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tensorflow.python.ops.linalg_ops, or try the search function.
Example #1
Source File: von_mises_fisher.py    From s-vae-tf with MIT License 5 votes vote down vote up
def __init__(self, loc, scale, validate_args=False, allow_nan_stats=True, name="von-Mises-Fisher"):
        """Construct von-Mises-Fisher distributions with mean and concentration `loc` and `scale`.

        Args:
          loc: Floating point tensor; the mean of the distribution(s). When
            `validate_args` is `True`, its norm along the last axis is asserted
            to be near 1 (`atol=1e-7`).
          scale: Floating point tensor; the concentration of the distribution(s).
            When `validate_args` is `True`, it is asserted to be strictly
            positive.
          validate_args: Python `bool`, default `False`. When `True` distribution
            parameters are checked for validity despite possibly degrading runtime
            performance. When `False` invalid inputs may silently render incorrect
            outputs.
          allow_nan_stats: Python `bool`, default `True`. When `True`,
            statistics (e.g., mean, mode, variance) use the value "`NaN`" to
            indicate the result is undefined. When `False`, an exception is raised
            if one or more of the statistic's batch members are undefined.
          name: Python `str` name prefixed to Ops created by this class.

        Raises:
          TypeError: if `loc` and `scale` have different `dtype`.
        """
        # Capture the constructor arguments before any other local is bound;
        # the snapshot is forwarded to the base class as `parameters`.
        parameters = locals()
        with ops.name_scope(name, values=[loc, scale]):
            # The validation ops are only created (and attached as control
            # dependencies of the identity ops below) when `validate_args` is set.
            with ops.control_dependencies([check_ops.assert_positive(scale),
                                           check_ops.assert_near(linalg_ops.norm(loc, axis=-1), 1, atol=1e-7)]
                                          if validate_args else []):
                self._loc = array_ops.identity(loc, name="loc")
                self._scale = array_ops.identity(scale, name="scale")
                # Always checked, independently of `validate_args`.
                check_ops.assert_same_float_dtype([self._loc, self._scale])

        super(VonMisesFisher, self).__init__(
            dtype=self._scale.dtype,
            reparameterization_type=distribution.FULLY_REPARAMETERIZED,
            validate_args=validate_args,
            allow_nan_stats=allow_nan_stats,
            parameters=parameters,
            graph_parents=[self._loc, self._scale],
            name=name)

        # Size of the last dimension of `loc`, as an int32 value ...
        self.__m = math_ops.cast(self._loc.shape[-1], dtypes.int32)
        # ... the same size cast to the distribution's dtype ...
        self.__mf = math_ops.cast(self.__m, dtype=self.dtype)
        # ... and a one-hot encoding of index 0 with depth `__m`.
        self.__e1 = array_ops.one_hot([0], self.__m, dtype=self.dtype)
Example #2
Source File: von_mises_fisher.py    From s-vae-tf with MIT License 5 votes vote down vote up
def _log_unnormalized_prob(self, x):
        """Return `scale * sum(loc * x)` over the last axis, with that axis removed.

        When `self.validate_args` is set, the computation is placed under a
        control dependency asserting that the norm of `x` along its last axis is
        near 1 (`atol=1e-3`).
        """
        if self.validate_args:
            unit_norm_checks = [check_ops.assert_near(linalg_ops.norm(x, axis=-1), 1, atol=1e-3)]
        else:
            unit_norm_checks = []

        with ops.control_dependencies(unit_norm_checks):
            # Keep the reduced axis for now; it is stripped by the reshape below.
            inner_product = math_ops.reduce_sum(self._loc * x, axis=-1, keepdims=True)
            output = self.scale * inner_product

        shape_without_last_axis = ops.convert_to_tensor(array_ops.shape(output)[:-1])
        return array_ops.reshape(output, shape_without_last_axis)
Example #3
Source File: temporal_convolutional_network.py    From nlp-architect with Apache License 2.0 5 votes vote down vote up
def _compute_weights(self):
        """Rebuild the wrapped layer's kernel from its direction and magnitude.

        The kernel is set to `layer.v`, L2-normalized over `self.norm_axes`,
        multiplied by `layer.g`.
        """
        with variable_scope.variable_scope("compute_weights"):
            direction = nn_impl.l2_normalize(self.layer.v, axis=self.norm_axes)
            self.layer.kernel = direction * self.layer.g
Example #4
Source File: temporal_convolutional_network.py    From nlp-architect with Apache License 2.0 5 votes vote down vote up
def _init_norm(self, weights):
        """Return the norms of `weights` as a vector of length `self.layer_depth`.

        `weights` is first flattened to shape `[-1, layer_depth]`; the norm is
        then taken along axis 0 of that matrix.
        """
        from tensorflow.python.ops.linalg_ops import norm

        with variable_scope.variable_scope("init_norm"):
            # pylint: disable=no-member
            columns = array_ops.reshape(weights, [-1, self.layer_depth])
            column_norms = norm(columns, axis=0)
            # pylint: disable=no-member
            return array_ops.reshape(column_norms, (self.layer_depth,))
Example #5
Source File: egdd.py    From lingvo with Apache License 2.0 4 votes vote down vote up
def _apply_dense(self, grad, var):
    """Build the dense update for `var` from gradient `grad`.

    Reads the `lr_scale`, `momentum`, `gbar`, `gain` and `counter` slots of
    `var`, assigns new values to all of them, and finally subtracts
    `lr_scale * momentum` (both freshly assigned) from `var`.

    Args:
      grad: gradient with respect to `var`.
      var: the variable being optimized.

    Returns:
      The result of `state_ops.assign_sub` on `var`, created under a control
      dependency on the `gbar` update.
    """
    lr_scale = self.get_slot(var, "lr_scale")
    momentum = self.get_slot(var, "momentum")
    gbar = self.get_slot(var, "gbar")
    gain = self.get_slot(var, "gain")
    counter = self.get_slot(var, "counter")
    counter_updated = state_ops.assign(counter, counter + 1)

    # lr_scale update uses normalized grad and momentum to be independent of dim
    # (the 1e-10 keeps the division finite when a norm is zero).
    normalized_grad = grad / (linalg_ops.norm(grad) + 1e-10)
    normalized_momentum = momentum / (linalg_ops.norm(momentum) + 1e-10)
    # Apply EG updates on lr_scale:
    # grad_lr_scale = -inner_product(current_grad, old_momentum)
    # lr_scale <- lr_scale * exp(-scale_learning_rate * grad_lr_scale)
    # Both variants are clipped to [self._min_scale, self._max_scale].
    lr_scale_unnormalized_updated = clip_ops.clip_by_value(
        lr_scale * math_ops.exp(
            self._scale_learning_rate * math_ops.reduce_sum(grad * momentum)),
        self._min_scale, self._max_scale)
    lr_scale_normalized_updated = clip_ops.clip_by_value(
        lr_scale * math_ops.exp(self._scale_learning_rate * math_ops.reduce_sum(
            normalized_grad * normalized_momentum)), self._min_scale,
        self._max_scale)
    # `self._use_directions` selects the normalized variant over the
    # unnormalized one.
    lr_scale_updated = state_ops.assign(
        lr_scale,
        array_ops.where(self._use_directions, lr_scale_normalized_updated,
                        lr_scale_unnormalized_updated))
    # remove the bias of zero initialization in gbar
    corrected_gbar = gbar / (
        1.0 - self._beta**math_ops.maximum(counter_updated - 1, 1))
    # Apply EG updates on gain:
    # grad_gain = - current_grad * old_gbar
    # gain <- gain * exp(-gain_learning_rate * grad_gain)
    # Both variants are clipped to [self._min_gain, self._max_gain].
    gain_unnormalized_updated = clip_ops.clip_by_value(
        gain * math_ops.exp(self._gain_learning_rate * grad * corrected_gbar),
        self._min_gain, self._max_gain)
    # Normalized update uses sign(grad) * sign(gbar) as a proxy for grad_gain.
    gain_normalized_updated = clip_ops.clip_by_value(
        gain * math_ops.exp(self._gain_learning_rate * math_ops.sign(grad) *
                            math_ops.sign(gbar)), self._min_gain,
        self._max_gain)
    # `self._use_signs` selects the sign-based variant over the unnormalized one.
    gain_updated = state_ops.assign(
        gain,
        array_ops.where(self._use_signs, gain_normalized_updated,
                        gain_unnormalized_updated))
    scaled_g = self._learning_rate_tensor * gain_updated * grad
    # `lr_scale_updated` and `scaled_g` were built from the current `momentum`
    # and `gbar`; make them run before those two slots are overwritten.
    with ops.control_dependencies([lr_scale_updated, scaled_g]):
      momentum_updated = state_ops.assign(
          momentum, self._momentum_tensor * momentum + scaled_g)
      gbar_updated = state_ops.assign(
          gbar, self._beta * gbar + (1.0 - self._beta) * grad)
    # The variable update itself waits for the gbar assignment.
    with ops.control_dependencies([gbar_updated]):
      return state_ops.assign_sub(var, lr_scale_updated * momentum_updated)
Example #6
Source File: lamb_optimizer.py    From albert with Apache License 2.0 4 votes vote down vote up
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """See base class."""
    update_ops = []
    for (grad, param) in grads_and_vars:
      if grad is None or param is None:
        continue

      param_name = self._get_variable_name(param.name)

      # Non-trainable float32 accumulators for the first and second moments,
      # zero-initialized and shaped like the parameter.
      m = tf.get_variable(
          name=six.ensure_str(param_name) + "/adam_m",
          shape=param.shape.as_list(),
          dtype=tf.float32,
          trainable=False,
          initializer=tf.zeros_initializer())
      v = tf.get_variable(
          name=six.ensure_str(param_name) + "/adam_v",
          shape=param.shape.as_list(),
          dtype=tf.float32,
          trainable=False,
          initializer=tf.zeros_initializer())

      # Exponential moving averages of the gradient and of its square
      # (the standard Adam moment estimates).
      m_decayed = tf.multiply(self.beta_1, m)
      m_fresh = tf.multiply(1.0 - self.beta_1, grad)
      next_m = m_decayed + m_fresh

      v_decayed = tf.multiply(self.beta_2, v)
      v_fresh = tf.multiply(1.0 - self.beta_2, tf.square(grad))
      next_v = v_decayed + v_fresh

      update = next_m / (tf.sqrt(next_v) + self.epsilon)

      # Weight decay is applied directly to the update rather than through an
      # L2 term in the loss: a loss term would flow through m and v and
      # interact with them in strange ways. Decaying here matches what an L2
      # penalty would do under plain (non-momentum) SGD.
      if self._do_use_weight_decay(param_name):
        update += self.weight_decay_rate * param

      # Layer-wise trust ratio ||w|| / ||update||; falls back to 1.0 whenever
      # either norm is not strictly positive or adaptation is disabled.
      ratio = 1.0
      if self._do_layer_adaptation(param_name):
        w_norm = linalg_ops.norm(param, ord=2)
        g_norm = linalg_ops.norm(update, ord=2)
        w_is_positive = math_ops.greater(w_norm, 0)
        g_is_positive = math_ops.greater(g_norm, 0)
        ratio_if_w_positive = array_ops.where(
            g_is_positive, (w_norm / g_norm), 1.0)
        ratio = array_ops.where(w_is_positive, ratio_if_w_positive, 1.0)

      next_param = param - ratio * self.learning_rate * update

      update_ops.extend(
          [param.assign(next_param),
           m.assign(next_m),
           v.assign(next_v)])
    return tf.group(*update_ops, name=name)
Example #7
Source File: lamb_optimizer_v1.py    From training with Apache License 2.0 4 votes vote down vote up
def _resource_apply_dense(self, grad, var):
    """Apply one dense LAMB-style step to `var` and return the assign op.

    Updates the "m" and "v" slots of `var`, forms the update from the
    bias-corrected moments, optionally adds weight decay and a layer-wise
    ratio, and assigns the new value to `var`.

    Args:
      grad: gradient with respect to `var`.
      var: the variable being optimized.

    Returns:
      The `.op` of the assignment that writes the new value into `var`.
    """
    beta1_power, beta2_power = self._get_beta_accumulators()
    var_dtype = var.dtype.base_dtype
    # Bring every hyper-parameter to the variable's dtype.
    beta1_power = math_ops.cast(beta1_power, var_dtype)
    beta2_power = math_ops.cast(beta2_power, var_dtype)
    lr_t = math_ops.cast(self._lr_t, var_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var_dtype)
    weight_decay_rate_t = math_ops.cast(self._weight_decay_rate_t, var_dtype)

    # First moment: m <- beta1 * m + (1 - beta1) * g
    m_slot = self.get_slot(var, "m")
    m_grad_term = grad * (1 - beta1_t)
    m_new = state_ops.assign(
        m_slot, m_slot * beta1_t + m_grad_term, use_locking=self._use_locking)

    # Second moment: v <- beta2 * v + (1 - beta2) * g^2
    v_slot = self.get_slot(var, "v")
    v_grad_term = (grad * grad) * (1 - beta2_t)
    v_new = state_ops.assign(
        v_slot, v_slot * beta2_t + v_grad_term, use_locking=self._use_locking)

    # Bias-corrected moments. NOTE: the original LAMBOptimizer formulation
    # used the uncorrected m in the numerator of the update instead.
    m_hat = m_new / (1. - beta1_power)
    v_hat = v_new / (1. - beta2_power)

    v_hat_sqrt = math_ops.sqrt(v_hat)
    update = m_hat / (v_hat_sqrt + epsilon_t)

    var_name = self._get_variable_name(var.name)
    if self._do_use_weight_decay(var_name):
      update += weight_decay_rate_t * var

    # Layer-wise ratio ||w|| / ||update||; 1.0 when either norm is not
    # strictly positive or when adaptation is disabled for this variable.
    ratio = 1.0
    if self._do_layer_adaptation(var_name):
      w_norm = linalg_ops.norm(var, ord=2)
      g_norm = linalg_ops.norm(update, ord=2)
      w_is_positive = math_ops.greater(w_norm, 0)
      g_is_positive = math_ops.greater(g_norm, 0)
      ratio_if_w_positive = array_ops.where(
          g_is_positive, (w_norm / g_norm), 1.0)
      ratio = array_ops.where(w_is_positive, ratio_if_w_positive, 1.0)

    new_value = var - ratio * lr_t * update
    assign_result = state_ops.assign(
        var, new_value, use_locking=self._use_locking)
    return assign_result.op
Example #8
Source File: lamb_optimizer_v1.py    From training with Apache License 2.0 4 votes vote down vote up
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    """Apply one sparse LAMB-style step to `var` and return the grouped op.

    The "m" and "v" slots are first decayed in full, then `scatter_add` adds
    the gradient contribution at `indices`.

    Args:
      grad: gradient values for the rows selected by `indices`.
      var: the variable being optimized.
      indices: positions passed through to `scatter_add`.
      scatter_add: callable invoked as `scatter_add(slot, indices, values)`.

    Returns:
      A group of the variable update and the two slot updates.
    """
    beta1_power, beta2_power = self._get_beta_accumulators()
    var_dtype = var.dtype.base_dtype
    # Bring every hyper-parameter to the variable's dtype.
    beta1_power = math_ops.cast(beta1_power, var_dtype)
    beta2_power = math_ops.cast(beta2_power, var_dtype)
    lr_t = math_ops.cast(self._lr_t, var_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var_dtype)
    weight_decay_rate_t = math_ops.cast(self._weight_decay_rate_t, var_dtype)

    # First moment: m <- beta1 * m + (1 - beta1) * g, done as a dense decay
    # followed by a scatter-add that must run after the decay.
    m_slot = self.get_slot(var, "m")
    m_grad_term = grad * (1 - beta1_t)
    m_decayed = state_ops.assign(
        m_slot, m_slot * beta1_t, use_locking=self._use_locking)
    with ops.control_dependencies([m_decayed]):
      m_new = scatter_add(m_slot, indices, m_grad_term)

    # Second moment: v <- beta2 * v + (1 - beta2) * g^2, same two-step scheme.
    v_slot = self.get_slot(var, "v")
    v_grad_term = (grad * grad) * (1 - beta2_t)
    v_decayed = state_ops.assign(
        v_slot, v_slot * beta2_t, use_locking=self._use_locking)
    with ops.control_dependencies([v_decayed]):
      v_new = scatter_add(v_slot, indices, v_grad_term)

    # Bias-corrected moments. NOTE: the original LAMBOptimizer formulation
    # used the uncorrected m in the numerator of the update instead.
    m_hat = m_new / (1. - beta1_power)
    v_hat = v_new / (1. - beta2_power)

    v_hat_sqrt = math_ops.sqrt(v_hat)
    update = m_hat / (v_hat_sqrt + epsilon_t)

    var_name = self._get_variable_name(var.name)
    if self._do_use_weight_decay(var_name):
      update += weight_decay_rate_t * var

    # Layer-wise ratio ||w|| / ||update||; 1.0 when either norm is not
    # strictly positive or when adaptation is disabled for this variable.
    ratio = 1.0
    if self._do_layer_adaptation(var_name):
      w_norm = linalg_ops.norm(var, ord=2)
      g_norm = linalg_ops.norm(update, ord=2)
      w_is_positive = math_ops.greater(w_norm, 0)
      g_is_positive = math_ops.greater(g_norm, 0)
      ratio_if_w_positive = array_ops.where(
          g_is_positive, (w_norm / g_norm), 1.0)
      ratio = array_ops.where(w_is_positive, ratio_if_w_positive, 1.0)

    var_update = state_ops.assign_sub(
        var, ratio * lr_t * update, use_locking=self._use_locking)
    return control_flow_ops.group(var_update, m_new, v_new)
Example #9
Source File: lamb_optimizer_v1.py    From training with Apache License 2.0 4 votes vote down vote up
def _resource_apply_dense(self, grad, var):
    """Apply one dense LAMB-style step to `var` and return the assign op.

    Updates the "m" and "v" slots of `var`, forms the update from the
    bias-corrected moments, optionally adds weight decay and a layer-wise
    ratio, and assigns the new value to `var`.

    Args:
      grad: gradient with respect to `var`.
      var: the variable being optimized.

    Returns:
      The `.op` of the assignment that writes the new value into `var`.
    """
    beta1_power, beta2_power = self._get_beta_accumulators()
    var_dtype = var.dtype.base_dtype
    # Bring every hyper-parameter to the variable's dtype.
    beta1_power = math_ops.cast(beta1_power, var_dtype)
    beta2_power = math_ops.cast(beta2_power, var_dtype)
    lr_t = math_ops.cast(self._lr_t, var_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var_dtype)
    weight_decay_rate_t = math_ops.cast(self._weight_decay_rate_t, var_dtype)

    # First moment: m <- beta1 * m + (1 - beta1) * g
    m_slot = self.get_slot(var, "m")
    m_grad_term = grad * (1 - beta1_t)
    m_new = state_ops.assign(
        m_slot, m_slot * beta1_t + m_grad_term, use_locking=self._use_locking)

    # Second moment: v <- beta2 * v + (1 - beta2) * g^2
    v_slot = self.get_slot(var, "v")
    v_grad_term = (grad * grad) * (1 - beta2_t)
    v_new = state_ops.assign(
        v_slot, v_slot * beta2_t + v_grad_term, use_locking=self._use_locking)

    # Bias-corrected moments. NOTE: the original LAMBOptimizer formulation
    # used the uncorrected m in the numerator of the update instead.
    m_hat = m_new / (1. - beta1_power)
    v_hat = v_new / (1. - beta2_power)

    v_hat_sqrt = math_ops.sqrt(v_hat)
    update = m_hat / (v_hat_sqrt + epsilon_t)

    var_name = self._get_variable_name(var.name)
    if self._do_use_weight_decay(var_name):
      update += weight_decay_rate_t * var

    # Layer-wise ratio ||w|| / ||update||; 1.0 when either norm is not
    # strictly positive or when adaptation is disabled for this variable.
    ratio = 1.0
    if self._do_layer_adaptation(var_name):
      w_norm = linalg_ops.norm(var, ord=2)
      g_norm = linalg_ops.norm(update, ord=2)
      w_is_positive = math_ops.greater(w_norm, 0)
      g_is_positive = math_ops.greater(g_norm, 0)
      ratio_if_w_positive = array_ops.where(
          g_is_positive, (w_norm / g_norm), 1.0)
      ratio = array_ops.where(w_is_positive, ratio_if_w_positive, 1.0)

    new_value = var - ratio * lr_t * update
    assign_result = state_ops.assign(
        var, new_value, use_locking=self._use_locking)
    return assign_result.op
Example #10
Source File: lamb_optimizer_v1.py    From training with Apache License 2.0 4 votes vote down vote up
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    """Apply one sparse LAMB-style step to `var` and return the grouped op.

    The "m" and "v" slots are first decayed in full, then `scatter_add` adds
    the gradient contribution at `indices`.

    Args:
      grad: gradient values for the rows selected by `indices`.
      var: the variable being optimized.
      indices: positions passed through to `scatter_add`.
      scatter_add: callable invoked as `scatter_add(slot, indices, values)`.

    Returns:
      A group of the variable update and the two slot updates.
    """
    beta1_power, beta2_power = self._get_beta_accumulators()
    var_dtype = var.dtype.base_dtype
    # Bring every hyper-parameter to the variable's dtype.
    beta1_power = math_ops.cast(beta1_power, var_dtype)
    beta2_power = math_ops.cast(beta2_power, var_dtype)
    lr_t = math_ops.cast(self._lr_t, var_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var_dtype)
    weight_decay_rate_t = math_ops.cast(self._weight_decay_rate_t, var_dtype)

    # First moment: m <- beta1 * m + (1 - beta1) * g, done as a dense decay
    # followed by a scatter-add that must run after the decay.
    m_slot = self.get_slot(var, "m")
    m_grad_term = grad * (1 - beta1_t)
    m_decayed = state_ops.assign(
        m_slot, m_slot * beta1_t, use_locking=self._use_locking)
    with ops.control_dependencies([m_decayed]):
      m_new = scatter_add(m_slot, indices, m_grad_term)

    # Second moment: v <- beta2 * v + (1 - beta2) * g^2, same two-step scheme.
    v_slot = self.get_slot(var, "v")
    v_grad_term = (grad * grad) * (1 - beta2_t)
    v_decayed = state_ops.assign(
        v_slot, v_slot * beta2_t, use_locking=self._use_locking)
    with ops.control_dependencies([v_decayed]):
      v_new = scatter_add(v_slot, indices, v_grad_term)

    # Bias-corrected moments. NOTE: the original LAMBOptimizer formulation
    # used the uncorrected m in the numerator of the update instead.
    m_hat = m_new / (1. - beta1_power)
    v_hat = v_new / (1. - beta2_power)

    v_hat_sqrt = math_ops.sqrt(v_hat)
    update = m_hat / (v_hat_sqrt + epsilon_t)

    var_name = self._get_variable_name(var.name)
    if self._do_use_weight_decay(var_name):
      update += weight_decay_rate_t * var

    # Layer-wise ratio ||w|| / ||update||; 1.0 when either norm is not
    # strictly positive or when adaptation is disabled for this variable.
    ratio = 1.0
    if self._do_layer_adaptation(var_name):
      w_norm = linalg_ops.norm(var, ord=2)
      g_norm = linalg_ops.norm(update, ord=2)
      w_is_positive = math_ops.greater(w_norm, 0)
      g_is_positive = math_ops.greater(g_norm, 0)
      ratio_if_w_positive = array_ops.where(
          g_is_positive, (w_norm / g_norm), 1.0)
      ratio = array_ops.where(w_is_positive, ratio_if_w_positive, 1.0)

    var_update = state_ops.assign_sub(
        var, ratio * lr_t * update, use_locking=self._use_locking)
    return control_flow_ops.group(var_update, m_new, v_new)