Python tensorflow.python.ops.linalg_ops.norm() Examples
The following are 10 code examples of tensorflow.python.ops.linalg_ops.norm().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module tensorflow.python.ops.linalg_ops, or try the search function.
Example #1
Source File: von_mises_fisher.py From s-vae-tf with MIT License | 5 votes |
def __init__(self, loc, scale, validate_args=False, allow_nan_stats=True, name="von-Mises-Fisher"):
    """Build von Mises-Fisher distribution(s) from a mean and a concentration.

    Args:
      loc: Floating point tensor; the mean of the distribution(s). When
        `validate_args` is `True`, its norm along the last axis is asserted
        to be 1 (absolute tolerance 1e-7).
      scale: Floating point tensor; the concentration of the
        distribution(s). When `validate_args` is `True`, it is asserted to
        be strictly positive.
      validate_args: Python `bool`, default `False`. When `True` the
        assertions described above are attached as control dependencies of
        the stored parameters; when `False` no assertion is created.
      allow_nan_stats: Python `bool`, default `True`. Forwarded unchanged to
        the base class constructor.
      name: Python `str` used as the name scope for the ops created here and
        forwarded to the base class constructor.

    Raises:
      TypeError: if `loc` and `scale` have different `dtype`.
    """
    # Has to run before any other local is bound so that only the
    # constructor arguments are captured.
    ctor_args = locals()

    with ops.name_scope(name, values=[loc, scale]):
        if validate_args:
            checks = [
                check_ops.assert_positive(scale),
                check_ops.assert_near(linalg_ops.norm(loc, axis=-1), 1, atol=1e-7),
            ]
        else:
            checks = []

        with ops.control_dependencies(checks):
            self._loc = array_ops.identity(loc, name="loc")
            self._scale = array_ops.identity(scale, name="scale")
            check_ops.assert_same_float_dtype([self._loc, self._scale])

    super(VonMisesFisher, self).__init__(
        dtype=self._scale.dtype,
        reparameterization_type=distribution.FULLY_REPARAMETERIZED,
        validate_args=validate_args,
        allow_nan_stats=allow_nan_stats,
        parameters=ctor_args,
        graph_parents=[self._loc, self._scale],
        name=name,
    )

    # Size of the last axis of `loc`, as int32 and in the distribution dtype.
    self.__m = math_ops.cast(self._loc.shape[-1], dtypes.int32)
    self.__mf = math_ops.cast(self.__m, dtype=self.dtype)
    # One-hot row selecting index 0 among `__m` entries.
    self.__e1 = array_ops.one_hot([0], self.__m, dtype=self.dtype)
Example #2
Source File: von_mises_fisher.py From s-vae-tf with MIT License | 5 votes |
def _log_unnormalized_prob(self, x):
    """Return `scale * sum(loc * x, axis=-1)` with the reduced axis dropped.

    When `self.validate_args` is true, the norm of `x` along its last axis
    is asserted to be 1 (absolute tolerance 1e-3) as a control dependency
    of the product.

    Args:
      x: Tensor multiplied elementwise with `self._loc`.

    Returns:
      The scaled inner product, reshaped to its own shape minus the last
      (kept, size-one) axis.
    """
    if self.validate_args:
        unit_norm_checks = [
            check_ops.assert_near(linalg_ops.norm(x, axis=-1), 1, atol=1e-3)
        ]
    else:
        unit_norm_checks = []

    with ops.control_dependencies(unit_norm_checks):
        output = self.scale * math_ops.reduce_sum(self._loc * x, axis=-1, keepdims=True)

    # `keepdims=True` left a trailing axis; reshape to everything before it.
    leading_shape = ops.convert_to_tensor(array_ops.shape(output)[:-1])
    return array_ops.reshape(output, leading_shape)
Example #3
Source File: temporal_convolutional_network.py From nlp-architect with Apache License 2.0 | 5 votes |
def _compute_weights(self):
    """Rebuild the layer kernel from a direction and a magnitude.

    The direction is `self.layer.v` L2-normalized over `self.norm_axes`;
    it is multiplied by `self.layer.g` and stored as `self.layer.kernel`.
    """
    with variable_scope.variable_scope("compute_weights"):
        layer = self.layer
        direction = nn_impl.l2_normalize(layer.v, axis=self.norm_axes)
        layer.kernel = direction * layer.g
Example #4
Source File: temporal_convolutional_network.py From nlp-architect with Apache License 2.0 | 5 votes |
def _init_norm(self, weights):
    """Compute the norm of `weights` for each of the `layer_depth` columns.

    `weights` is flattened to `[-1, self.layer_depth]`, the norm is taken
    over axis 0, and the result is returned with shape
    `(self.layer_depth,)`.
    """
    from tensorflow.python.ops.linalg_ops import norm

    depth = self.layer_depth  # pylint: disable=no-member
    with variable_scope.variable_scope("init_norm"):
        columns = array_ops.reshape(weights, [-1, depth])
        column_norms = norm(columns, axis=0)
        return array_ops.reshape(column_norms, (depth,))
Example #5
Source File: egdd.py From lingvo with Apache License 2.0 | 4 votes |
def _apply_dense(self, grad, var):
    """Build the dense update ops for `var` and its optimizer slots.

    Updates the `counter`, `lr_scale`, `gain`, `momentum` and `gbar` slots
    of `var`, then subtracts `lr_scale * momentum` from `var`.

    Args:
      grad: Gradient tensor for `var`.
      var: Variable being updated.

    Returns:
      The result of `state_ops.assign_sub` on `var`, created under a control
      dependency on the `gbar` update.
    """
    lr_scale = self.get_slot(var, "lr_scale")
    momentum = self.get_slot(var, "momentum")
    gbar = self.get_slot(var, "gbar")
    gain = self.get_slot(var, "gain")
    counter = self.get_slot(var, "counter")
    step_count = state_ops.assign(counter, counter + 1)

    # The directional lr_scale update works on normalized grad and momentum
    # so that it is independent of the dimension.
    unit_grad = grad / (linalg_ops.norm(grad) + 1e-10)
    unit_momentum = momentum / (linalg_ops.norm(momentum) + 1e-10)

    # EG update on lr_scale:
    #   grad_lr_scale = -inner_product(current_grad, old_momentum)
    #   lr_scale <- lr_scale * exp(-scale_learning_rate * grad_lr_scale)
    raw_alignment = math_ops.reduce_sum(grad * momentum)
    lr_scale_raw = clip_ops.clip_by_value(
        lr_scale * math_ops.exp(self._scale_learning_rate * raw_alignment),
        self._min_scale,
        self._max_scale,
    )
    unit_alignment = math_ops.reduce_sum(unit_grad * unit_momentum)
    lr_scale_directional = clip_ops.clip_by_value(
        lr_scale * math_ops.exp(self._scale_learning_rate * unit_alignment),
        self._min_scale,
        self._max_scale,
    )
    new_lr_scale = state_ops.assign(
        lr_scale,
        array_ops.where(self._use_directions, lr_scale_directional, lr_scale_raw),
    )

    # Remove the bias that the zero initialization introduces in gbar.
    elapsed = math_ops.maximum(step_count - 1, 1)
    debiased_gbar = gbar / (1.0 - self._beta**elapsed)

    # EG update on gain:
    #   grad_gain = -current_grad * old_gbar
    #   gain <- gain * exp(-gain_learning_rate * grad_gain)
    gain_raw = clip_ops.clip_by_value(
        gain * math_ops.exp(self._gain_learning_rate * grad * debiased_gbar),
        self._min_gain,
        self._max_gain,
    )
    # The sign-based variant uses sign(grad) * sign(gbar) as a proxy for
    # grad_gain.
    gain_signed = clip_ops.clip_by_value(
        gain * math_ops.exp(
            self._gain_learning_rate * math_ops.sign(grad) * math_ops.sign(gbar)
        ),
        self._min_gain,
        self._max_gain,
    )
    new_gain = state_ops.assign(
        gain, array_ops.where(self._use_signs, gain_signed, gain_raw)
    )

    scaled_g = self._learning_rate_tensor * new_gain * grad
    # momentum and gbar are only overwritten after lr_scale and the scaled
    # gradient have been computed.
    with ops.control_dependencies([new_lr_scale, scaled_g]):
        new_momentum = state_ops.assign(
            momentum, self._momentum_tensor * momentum + scaled_g
        )
        new_gbar = state_ops.assign(
            gbar, self._beta * gbar + (1.0 - self._beta) * grad
        )
    with ops.control_dependencies([new_gbar]):
        return state_ops.assign_sub(var, new_lr_scale * new_momentum)
Example #6
Source File: lamb_optimizer.py From albert with Apache License 2.0 | 4 votes |
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """See base class.

    For every `(grad, param)` pair where neither element is `None`, creates
    the `<param_name>/adam_m` and `<param_name>/adam_v` variables, computes
    the Adam-style update, optionally adds weight decay and a layer-wise
    norm ratio, and collects the assignments to the parameter and both
    moment variables.

    Args:
      grads_and_vars: Iterable of `(gradient, variable)` pairs.
      global_step: Accepted for interface compatibility; this method neither
        reads nor increments it.
      name: Name given to the returned grouped op.

    Returns:
      A `tf.group` of all collected assignments.
    """
    update_ops = []
    for grad, param in grads_and_vars:
        if grad is None or param is None:
            continue

        param_name = self._get_variable_name(param.name)
        slot_prefix = six.ensure_str(param_name)
        slot_shape = param.shape.as_list()

        m = tf.get_variable(
            name=slot_prefix + "/adam_m",
            shape=slot_shape,
            dtype=tf.float32,
            trainable=False,
            initializer=tf.zeros_initializer(),
        )
        v = tf.get_variable(
            name=slot_prefix + "/adam_v",
            shape=slot_shape,
            dtype=tf.float32,
            trainable=False,
            initializer=tf.zeros_initializer(),
        )

        # Standard Adam moment estimates.
        next_m = tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)
        next_v = tf.multiply(self.beta_2, v) + tf.multiply(
            1.0 - self.beta_2, tf.square(grad)
        )
        update = next_m / (tf.sqrt(next_v) + self.epsilon)

        # Adding the square of the weights to the loss is *not* the correct
        # way to use L2 regularization/weight decay with Adam, because it
        # interacts with the m and v parameters in strange ways. Instead
        # the weights are decayed in a manner that does not interact with
        # m/v, which is equivalent to adding the square of the weights to
        # the loss with plain (non-momentum) SGD.
        if self._do_use_weight_decay(param_name):
            update += self.weight_decay_rate * param

        if self._do_layer_adaptation(param_name):
            w_norm = linalg_ops.norm(param, ord=2)
            g_norm = linalg_ops.norm(update, ord=2)
            # w_norm / g_norm when both norms are positive, otherwise 1.0.
            w_positive = math_ops.greater(w_norm, 0)
            g_positive = math_ops.greater(g_norm, 0)
            norm_ratio = w_norm / g_norm
            ratio = array_ops.where(
                w_positive, array_ops.where(g_positive, norm_ratio, 1.0), 1.0
            )
        else:
            ratio = 1.0

        next_param = param - ratio * self.learning_rate * update

        update_ops.append(param.assign(next_param))
        update_ops.append(m.assign(next_m))
        update_ops.append(v.assign(next_v))

    return tf.group(*update_ops, name=name)
Example #7
Source File: lamb_optimizer_v1.py From training with Apache License 2.0 | 4 votes |
def _resource_apply_dense(self, grad, var):
    """Build the dense update for `var` and return the op that assigns it.

    Updates the `m` and `v` slots of `var`, forms the update from their
    bias-corrected values, optionally adds weight decay and a layer-wise
    norm ratio, and assigns `var - ratio * lr * update` to `var`.

    Args:
      grad: Dense gradient tensor for `var`.
      var: Variable being updated.

    Returns:
      The `.op` of the `state_ops.assign` that writes the new value of `var`.
    """
    beta1_power, beta2_power = self._get_beta_accumulators()

    # Bring every hyper-parameter to the variable's base dtype.
    compute_dtype = var.dtype.base_dtype
    beta1_power = math_ops.cast(beta1_power, compute_dtype)
    beta2_power = math_ops.cast(beta2_power, compute_dtype)
    lr_t = math_ops.cast(self._lr_t, compute_dtype)
    beta1_t = math_ops.cast(self._beta1_t, compute_dtype)
    beta2_t = math_ops.cast(self._beta2_t, compute_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, compute_dtype)
    weight_decay_rate_t = math_ops.cast(self._weight_decay_rate_t, compute_dtype)

    # First moment: m_t = beta1 * m + (1 - beta1) * g_t
    m_slot = self.get_slot(var, "m")
    m_grad_term = grad * (1 - beta1_t)
    new_m = state_ops.assign(
        m_slot, m_slot * beta1_t + m_grad_term, use_locking=self._use_locking
    )

    # Second moment: v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v_slot = self.get_slot(var, "v")
    v_grad_term = (grad * grad) * (1 - beta2_t)
    new_v = state_ops.assign(
        v_slot, v_slot * beta2_t + v_grad_term, use_locking=self._use_locking
    )

    # This variant divides both moments by (1 - beta_power). The source
    # comment notes that the original LAMBOptimizer implementation used the
    # uncorrected m_t in the numerator instead of m_t_hat.
    m_hat = new_m / (1. - beta1_power)
    v_hat = new_v / (1. - beta2_power)
    update = m_hat / (math_ops.sqrt(v_hat) + epsilon_t)

    var_name = self._get_variable_name(var.name)
    if self._do_use_weight_decay(var_name):
        update += weight_decay_rate_t * var

    if self._do_layer_adaptation(var_name):
        w_norm = linalg_ops.norm(var, ord=2)
        g_norm = linalg_ops.norm(update, ord=2)
        # w_norm / g_norm when both norms are positive, otherwise 1.0.
        w_positive = math_ops.greater(w_norm, 0)
        g_positive = math_ops.greater(g_norm, 0)
        norm_ratio = w_norm / g_norm
        ratio = array_ops.where(
            w_positive, array_ops.where(g_positive, norm_ratio, 1.0), 1.0
        )
    else:
        ratio = 1.0

    new_var = var - ratio * lr_t * update
    return state_ops.assign(var, new_var, use_locking=self._use_locking).op
Example #8
Source File: lamb_optimizer_v1.py From training with Apache License 2.0 | 4 votes |
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    """Build the sparse update for `var` shared by the sparse apply paths.

    The `m` and `v` slots are first decayed in full, then the gradient
    contributions are added at `indices` through `scatter_add`. The update
    is formed from the bias-corrected slots, optionally with weight decay
    and a layer-wise norm ratio, and subtracted from `var`.

    Args:
      grad: Gradient values for the rows selected by `indices`.
      var: Variable being updated.
      indices: Indices passed through to `scatter_add`.
      scatter_add: Callable invoked as `scatter_add(slot, indices, values)`.

    Returns:
      A grouped op covering the variable update and both slot updates.
    """
    beta1_power, beta2_power = self._get_beta_accumulators()

    # Bring every hyper-parameter to the variable's base dtype.
    compute_dtype = var.dtype.base_dtype
    beta1_power = math_ops.cast(beta1_power, compute_dtype)
    beta2_power = math_ops.cast(beta2_power, compute_dtype)
    lr_t = math_ops.cast(self._lr_t, compute_dtype)
    beta1_t = math_ops.cast(self._beta1_t, compute_dtype)
    beta2_t = math_ops.cast(self._beta2_t, compute_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, compute_dtype)
    weight_decay_rate_t = math_ops.cast(self._weight_decay_rate_t, compute_dtype)

    # First moment: m_t = beta1 * m + (1 - beta1) * g_t
    m_slot = self.get_slot(var, "m")
    m_grad_term = grad * (1 - beta1_t)
    m_decayed = state_ops.assign(
        m_slot, m_slot * beta1_t, use_locking=self._use_locking
    )
    # The scatter must observe the decayed slot.
    with ops.control_dependencies([m_decayed]):
        new_m = scatter_add(m_slot, indices, m_grad_term)

    # Second moment: v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v_slot = self.get_slot(var, "v")
    v_grad_term = (grad * grad) * (1 - beta2_t)
    v_decayed = state_ops.assign(
        v_slot, v_slot * beta2_t, use_locking=self._use_locking
    )
    with ops.control_dependencies([v_decayed]):
        new_v = scatter_add(v_slot, indices, v_grad_term)

    # This variant divides both moments by (1 - beta_power). The source
    # comment notes that the original LAMBOptimizer implementation used the
    # uncorrected m_t in the numerator instead of m_t_hat.
    m_hat = new_m / (1. - beta1_power)
    v_hat = new_v / (1. - beta2_power)
    update = m_hat / (math_ops.sqrt(v_hat) + epsilon_t)

    var_name = self._get_variable_name(var.name)
    if self._do_use_weight_decay(var_name):
        update += weight_decay_rate_t * var

    if self._do_layer_adaptation(var_name):
        w_norm = linalg_ops.norm(var, ord=2)
        g_norm = linalg_ops.norm(update, ord=2)
        # w_norm / g_norm when both norms are positive, otherwise 1.0.
        w_positive = math_ops.greater(w_norm, 0)
        g_positive = math_ops.greater(g_norm, 0)
        norm_ratio = w_norm / g_norm
        ratio = array_ops.where(
            w_positive, array_ops.where(g_positive, norm_ratio, 1.0), 1.0
        )
    else:
        ratio = 1.0

    var_update = state_ops.assign_sub(
        var, ratio * lr_t * update, use_locking=self._use_locking
    )
    return control_flow_ops.group(var_update, new_m, new_v)
Example #9
Source File: lamb_optimizer_v1.py From training with Apache License 2.0 | 4 votes |
def _resource_apply_dense(self, grad, var):
    """Build the dense update for `var` and return the op that assigns it.

    Updates the `m` and `v` slots of `var`, forms the update from their
    bias-corrected values, optionally adds weight decay and a layer-wise
    norm ratio, and assigns `var - ratio * lr * update` to `var`.

    Args:
      grad: Dense gradient tensor for `var`.
      var: Variable being updated.

    Returns:
      The `.op` of the `state_ops.assign` that writes the new value of `var`.
    """
    beta1_power, beta2_power = self._get_beta_accumulators()

    # Bring every hyper-parameter to the variable's base dtype.
    compute_dtype = var.dtype.base_dtype
    beta1_power = math_ops.cast(beta1_power, compute_dtype)
    beta2_power = math_ops.cast(beta2_power, compute_dtype)
    lr_t = math_ops.cast(self._lr_t, compute_dtype)
    beta1_t = math_ops.cast(self._beta1_t, compute_dtype)
    beta2_t = math_ops.cast(self._beta2_t, compute_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, compute_dtype)
    weight_decay_rate_t = math_ops.cast(self._weight_decay_rate_t, compute_dtype)

    # First moment: m_t = beta1 * m + (1 - beta1) * g_t
    m_slot = self.get_slot(var, "m")
    m_grad_term = grad * (1 - beta1_t)
    new_m = state_ops.assign(
        m_slot, m_slot * beta1_t + m_grad_term, use_locking=self._use_locking
    )

    # Second moment: v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v_slot = self.get_slot(var, "v")
    v_grad_term = (grad * grad) * (1 - beta2_t)
    new_v = state_ops.assign(
        v_slot, v_slot * beta2_t + v_grad_term, use_locking=self._use_locking
    )

    # This variant divides both moments by (1 - beta_power). The source
    # comment notes that the original LAMBOptimizer implementation used the
    # uncorrected m_t in the numerator instead of m_t_hat.
    m_hat = new_m / (1. - beta1_power)
    v_hat = new_v / (1. - beta2_power)
    update = m_hat / (math_ops.sqrt(v_hat) + epsilon_t)

    var_name = self._get_variable_name(var.name)
    if self._do_use_weight_decay(var_name):
        update += weight_decay_rate_t * var

    if self._do_layer_adaptation(var_name):
        w_norm = linalg_ops.norm(var, ord=2)
        g_norm = linalg_ops.norm(update, ord=2)
        # w_norm / g_norm when both norms are positive, otherwise 1.0.
        w_positive = math_ops.greater(w_norm, 0)
        g_positive = math_ops.greater(g_norm, 0)
        norm_ratio = w_norm / g_norm
        ratio = array_ops.where(
            w_positive, array_ops.where(g_positive, norm_ratio, 1.0), 1.0
        )
    else:
        ratio = 1.0

    new_var = var - ratio * lr_t * update
    return state_ops.assign(var, new_var, use_locking=self._use_locking).op
Example #10
Source File: lamb_optimizer_v1.py From training with Apache License 2.0 | 4 votes |
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    """Build the sparse update for `var` shared by the sparse apply paths.

    The `m` and `v` slots are first decayed in full, then the gradient
    contributions are added at `indices` through `scatter_add`. The update
    is formed from the bias-corrected slots, optionally with weight decay
    and a layer-wise norm ratio, and subtracted from `var`.

    Args:
      grad: Gradient values for the rows selected by `indices`.
      var: Variable being updated.
      indices: Indices passed through to `scatter_add`.
      scatter_add: Callable invoked as `scatter_add(slot, indices, values)`.

    Returns:
      A grouped op covering the variable update and both slot updates.
    """
    beta1_power, beta2_power = self._get_beta_accumulators()

    # Bring every hyper-parameter to the variable's base dtype.
    compute_dtype = var.dtype.base_dtype
    beta1_power = math_ops.cast(beta1_power, compute_dtype)
    beta2_power = math_ops.cast(beta2_power, compute_dtype)
    lr_t = math_ops.cast(self._lr_t, compute_dtype)
    beta1_t = math_ops.cast(self._beta1_t, compute_dtype)
    beta2_t = math_ops.cast(self._beta2_t, compute_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, compute_dtype)
    weight_decay_rate_t = math_ops.cast(self._weight_decay_rate_t, compute_dtype)

    # First moment: m_t = beta1 * m + (1 - beta1) * g_t
    m_slot = self.get_slot(var, "m")
    m_grad_term = grad * (1 - beta1_t)
    m_decayed = state_ops.assign(
        m_slot, m_slot * beta1_t, use_locking=self._use_locking
    )
    # The scatter must observe the decayed slot.
    with ops.control_dependencies([m_decayed]):
        new_m = scatter_add(m_slot, indices, m_grad_term)

    # Second moment: v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v_slot = self.get_slot(var, "v")
    v_grad_term = (grad * grad) * (1 - beta2_t)
    v_decayed = state_ops.assign(
        v_slot, v_slot * beta2_t, use_locking=self._use_locking
    )
    with ops.control_dependencies([v_decayed]):
        new_v = scatter_add(v_slot, indices, v_grad_term)

    # This variant divides both moments by (1 - beta_power). The source
    # comment notes that the original LAMBOptimizer implementation used the
    # uncorrected m_t in the numerator instead of m_t_hat.
    m_hat = new_m / (1. - beta1_power)
    v_hat = new_v / (1. - beta2_power)
    update = m_hat / (math_ops.sqrt(v_hat) + epsilon_t)

    var_name = self._get_variable_name(var.name)
    if self._do_use_weight_decay(var_name):
        update += weight_decay_rate_t * var

    if self._do_layer_adaptation(var_name):
        w_norm = linalg_ops.norm(var, ord=2)
        g_norm = linalg_ops.norm(update, ord=2)
        # w_norm / g_norm when both norms are positive, otherwise 1.0.
        w_positive = math_ops.greater(w_norm, 0)
        g_positive = math_ops.greater(g_norm, 0)
        norm_ratio = w_norm / g_norm
        ratio = array_ops.where(
            w_positive, array_ops.where(g_positive, norm_ratio, 1.0), 1.0
        )
    else:
        ratio = 1.0

    var_update = state_ops.assign_sub(
        var, ratio * lr_t * update, use_locking=self._use_locking
    )
    return control_flow_ops.group(var_update, new_m, new_v)