Python tensorflow.pow() Examples

The following are 28 code examples of tensorflow.pow(), drawn from open-source projects. The source file, project, and license are noted above each example.
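For orientation, tf.pow computes element-wise powers with standard broadcasting. A minimal, hypothetical TF 1.x graph-mode sketch (matching the style of the examples below):

import tensorflow as tf

x = tf.constant([2.0, 3.0, 4.0])
squared = tf.pow(x, 2.0)            # element-wise square -> [4., 9., 16.]
mixed = tf.pow(x, [1.0, 2.0, 3.0])  # per-element exponents -> [2., 9., 64.]

with tf.Session() as sess:
    print(sess.run([squared, mixed]))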
Example #1
Source File: losses.py From R2CNN_Faster-RCNN_Tensorflow with MIT License
def _smooth_l1_loss_base(bbox_pred, bbox_targets, sigma=1.0):
    '''
    :param bbox_pred: [-1, 4] in RPN. [-1, cls_num+1, 4] or [-1, cls_num+1, 5] in Fast-rcnn
    :param bbox_targets: shape is same as bbox_pred
    :param sigma:
    :return:
    '''
    sigma_2 = sigma ** 2
    box_diff = bbox_pred - bbox_targets
    abs_box_diff = tf.abs(box_diff)
    smoothL1_sign = tf.stop_gradient(
        tf.to_float(tf.less(abs_box_diff, 1. / sigma_2)))
    loss_box = tf.pow(box_diff, 2) * (sigma_2 / 2.0) * smoothL1_sign \
               + (abs_box_diff - (0.5 / sigma_2)) * (1.0 - smoothL1_sign)
    return loss_box
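To see the piecewise behaviour (quadratic for |diff| < 1/sigma^2, linear beyond), here is a hypothetical usage sketch, assuming the _smooth_l1_loss_base above is in scope:

import tensorflow as tf

bbox_pred = tf.constant([[0.1, 0.2, 0.3, 2.0]])
bbox_targets = tf.zeros_like(bbox_pred)
loss = _smooth_l1_loss_base(bbox_pred, bbox_targets, sigma=1.0)

with tf.Session() as sess:
    # Small diffs hit the quadratic branch (0.5 * d^2);
    # the 2.0 diff hits the linear branch (|d| - 0.5).
    print(sess.run(loss))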
Example #2
Source File: attacks_tf.py From neural-fingerprinting with BSD 3-Clause "New" or "Revised" License
def _apply_gradients(self, grads, x, optim_state):
    """Refer to parent class documentation."""
    new_x = [None] * len(x)
    new_optim_state = {
        "t": optim_state["t"] + 1.,
        "m": [None] * len(x),
        "u": [None] * len(x)
    }
    t = new_optim_state["t"]
    for i in xrange(len(x)):
        g = grads[i]
        m_old = optim_state["m"][i]
        u_old = optim_state["u"][i]
        new_optim_state["m"][i] = (
            self._beta1 * m_old + (1. - self._beta1) * g)
        new_optim_state["u"][i] = (
            self._beta2 * u_old + (1. - self._beta2) * g * g)
        m_hat = new_optim_state["m"][i] / (1. - tf.pow(self._beta1, t))
        u_hat = new_optim_state["u"][i] / (1. - tf.pow(self._beta2, t))
        new_x[i] = (
            x[i] - self._lr * m_hat / (tf.sqrt(u_hat) + self._epsilon))
    return new_x, new_optim_state
Example #3
Source File: ops.py From mac-network with Apache License 2.0
def locationPE(h, w, dim, outDim = -1, addBias = True):
    x = tf.expand_dims(tf.to_float(tf.linspace(-config.locationBias, config.locationBias, w)), axis = -1)
    y = tf.expand_dims(tf.to_float(tf.linspace(-config.locationBias, config.locationBias, h)), axis = -1)
    i = tf.expand_dims(tf.to_float(tf.range(dim)), axis = 0)

    peSinX = tf.sin(x / (tf.pow(10000.0, i / dim)))
    peCosX = tf.cos(x / (tf.pow(10000.0, i / dim)))
    peSinY = tf.sin(y / (tf.pow(10000.0, i / dim)))
    peCosY = tf.cos(y / (tf.pow(10000.0, i / dim)))

    peSinX = tf.tile(tf.expand_dims(peSinX, axis = 0), [h, 1, 1])
    peCosX = tf.tile(tf.expand_dims(peCosX, axis = 0), [h, 1, 1])
    peSinY = tf.tile(tf.expand_dims(peSinY, axis = 1), [1, w, 1])
    peCosY = tf.tile(tf.expand_dims(peCosY, axis = 1), [1, w, 1])

    grid = tf.concat([peSinX, peCosX, peSinY, peCosY], axis = -1)
    dim *= 4

    if outDim > 0:
        grid = linear(grid, dim, outDim, addBias = addBias, name = "locationPE")
        dim = outDim

    return grid, dim
Example #4
Source File: common_attention.py From fine-lm with MIT License
def scaled_dot_product_attention_simple(q, k, v, bias, name=None):
    """Scaled dot-product attention. One head. One spatial dimension.

    Args:
        q: a Tensor with shape [batch, length_q, depth_k]
        k: a Tensor with shape [batch, length_kv, depth_k]
        v: a Tensor with shape [batch, length_kv, depth_v]
        bias: optional Tensor broadcastable to [batch, length_q, length_kv]
        name: an optional string

    Returns:
        A Tensor.
    """
    with tf.variable_scope(
            name, default_name="scaled_dot_product_attention_simple"):
        scalar = tf.rsqrt(tf.to_float(common_layers.shape_list(q)[2]))
        logits = tf.matmul(q * scalar, k, transpose_b=True)
        if bias is not None:
            logits += bias
        weights = tf.nn.softmax(logits, name="attention_weights")
        if common_layers.should_generate_summaries():
            tf.summary.image(
                "attention", tf.expand_dims(tf.pow(weights, 0.2), 3),
                max_outputs=1)
        return tf.matmul(weights, v)
Example #5
Source File: tacotron.py From vae_tacotron with MIT License
def add_loss(self, global_step):
    '''Adds loss to the model. Sets "loss" field. initialize must have been called.'''
    with tf.variable_scope('loss') as scope:
        hp = self._hparams
        self.mel_loss = tf.reduce_mean(tf.abs(self.mel_targets - self.mel_outputs))
        l1 = tf.abs(self.linear_targets - self.linear_outputs)
        # Prioritize loss for frequencies under 3000 Hz.
        n_priority_freq = int(3000 / (hp.sample_rate * 0.5) * hp.num_freq)
        self.linear_loss = 0.5 * tf.reduce_mean(l1) + 0.5 * tf.reduce_mean(l1[:, :, 0:n_priority_freq])
        self.loss = self.mel_loss + self.linear_loss
        if hp.use_vae:
            # KL term: -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
            self.kl_loss = -0.5 * tf.reduce_sum(1 + self.log_var - tf.pow(self.mu, 2) - tf.exp(self.log_var))
            vae_loss_weight = vae_weight(global_step)
            self.loss += self.kl_loss * vae_loss_weight
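The tf.pow(self.mu, 2) term comes from the closed-form KL divergence between N(mu, sigma^2) and N(0, 1). A standalone sketch of just that term, with hypothetical values:

import tensorflow as tf

mu = tf.constant([0.5, -0.3])
log_var = tf.constant([0.0, -1.0])
# KL(N(mu, sigma^2) || N(0, 1)) = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
kl = -0.5 * tf.reduce_sum(1 + log_var - tf.pow(mu, 2) - tf.exp(log_var))

with tf.Session() as sess:
    print(sess.run(kl))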
Example #6
Source File: layers.py From PADME with MIT License
def create_tensor(self, in_layers=None, set_tensors=True, **kwargs):
    inputs = self._get_input_tensors(in_layers)
    temp = []
    subspaces = []
    # Creates subspaces the same way it was done in AlphaShare.
    for input_tensor in inputs:
        subspace_size = int(input_tensor.get_shape()[-1].value / 2)
        subspaces.append(input_tensor[:, :subspace_size])
        subspaces.append(input_tensor[:, subspace_size:])
        product = tf.matmul(tf.transpose(subspaces[0]), subspaces[1])
        subspaces = []
        # Calculate the squared Frobenius norm.
        temp.append(tf.reduce_sum(tf.pow(product, 2)))
    out_tensor = tf.reduce_sum(temp)
    self.out_tensor = out_tensor
    return out_tensor
Example #7
Source File: losses.py From CapsLayer with Apache License 2.0
def spread_loss(labels, logits, margin, regularizer=None):
    """
    Args:
        labels: [batch_size, num_label].
        logits: [batch_size, num_label].
        margin: scalar or 1-D Tensor margin.
        regularizer: optional regularization term added to the loss.
    Returns:
        loss: spread loss.
    """
    a_target = cl.reduce_sum(labels * logits, axis=1, keepdims=True)
    dist = (1 - labels) * margin - (a_target - logits)
    dist = tf.pow(tf.maximum(0., dist), 2)
    loss = tf.reduce_mean(tf.reduce_sum(dist, axis=-1))
    if regularizer is not None:
        regularizer = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        loss += tf.reduce_mean(regularizer)
    return loss
Example #8
Source File: losses.py From CapsLayer with Apache License 2.0
def margin_loss(labels, logits, upper_margin=0.9, bottom_margin=0.1, downweight=0.5):
    """
    Args:
        labels: [batch_size, num_label].
        logits: [batch_size, num_label].
    """
    positive_selector = tf.cast(tf.less(logits, upper_margin), tf.float32)
    positive_cost = positive_selector * labels * tf.pow(logits - upper_margin, 2)

    negative_selector = tf.cast(tf.greater(logits, bottom_margin), tf.float32)
    negative_cost = negative_selector * (1 - labels) * tf.pow(logits - bottom_margin, 2)

    loss = 0.5 * positive_cost + 0.5 * downweight * negative_cost
    return tf.reduce_mean(tf.reduce_sum(loss, axis=-1))
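A hypothetical usage sketch, assuming the margin_loss above is in scope; tf.pow squares the distance of each activation from its margin:

import tensorflow as tf

labels = tf.constant([[0., 1.]])     # one-hot target
logits = tf.constant([[0.2, 0.95]])  # e.g. capsule activations

loss = margin_loss(labels, logits)

with tf.Session() as sess:
    # The target activation (0.95 > 0.9) incurs no cost; only the
    # non-target activation (0.2 > 0.1) is penalized, quadratically.
    print(sess.run(loss))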
Example #9
Source File: loss.py From centernet_tensorflow_wilderface_voc with MIT License
def focal_loss(pred, gt):
    '''
    Modified focal loss. Exactly the same as CornerNet.
    Runs faster and costs a little bit more memory.
    Arguments:
        pred (batch, h, w, c)
        gt (batch, h, w, c)
    '''
    pos_inds = tf.cast(tf.equal(gt, 1.0), dtype=tf.float32)
    neg_inds = 1.0 - pos_inds
    neg_weights = tf.pow(1.0 - gt, 4.0)
    pred = tf.clip_by_value(pred, 1e-6, 1.0 - 1e-6)
    pos_loss = tf.log(pred) * tf.pow(1.0 - pred, 2.0) * pos_inds
    neg_loss = tf.log(1.0 - pred) * tf.pow(pred, 2.0) * neg_weights * neg_inds
    num_pos = tf.reduce_sum(pos_inds)
    pos_loss = tf.reduce_sum(pos_loss)
    neg_loss = tf.reduce_sum(neg_loss)
    loss = -(pos_loss + neg_loss) / num_pos
    return loss
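A hypothetical usage sketch, assuming the focal_loss above is in scope, with a (batch, h, w, c) heatmap containing a single exact positive:

import tensorflow as tf

gt = tf.constant([[[[1.0]], [[0.5]], [[0.0]]]])    # shape (1, 3, 1, 1)
pred = tf.constant([[[[0.9]], [[0.2]], [[0.1]]]])

loss = focal_loss(pred, gt)

with tf.Session() as sess:
    # Only the gt == 1.0 cell counts as positive; the 0.5 cell is a
    # down-weighted negative via the (1 - gt)^4 term.
    print(sess.run(loss))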
Example #10
Source File: hmc.py From zhusuan with MIT License
def tune(self, acceptance_rate, fresh_start):
    def adapt_stepsize():
        new_step = tf.assign(self.step, (1 - fresh_start) * self.step + 1)
        rate1 = 1.0 / (new_step + self.t0)
        new_h_bar = tf.assign(
            self.h_bar, (1 - fresh_start) * (1 - rate1) * self.h_bar +
            rate1 * (self.delta - acceptance_rate))
        log_epsilon = self.mu - tf.sqrt(new_step) / self.gamma * new_h_bar
        rate = tf.pow(new_step, -self.kappa)
        new_log_epsilon_bar = tf.assign(
            self.log_epsilon_bar,
            rate * log_epsilon + (1 - fresh_start) * (1 - rate) *
            self.log_epsilon_bar)
        with tf.control_dependencies([new_log_epsilon_bar]):
            new_log_epsilon = tf.identity(log_epsilon)
        return tf.exp(new_log_epsilon)

    c = tf.cond(self.adapt_step_size,
                adapt_stepsize,
                lambda: tf.exp(self.log_epsilon_bar))
    return c
Example #11
Source File: hmc.py From zhusuan with MIT License
def update(self, x):
    # x: (chain_dims data_dims)
    new_t = tf.assign(self.t, self.t + 1)
    weight = (1 - self.decay) / (1 - tf.pow(self.decay, new_t))
    # incr: (chain_dims data_dims)
    incr = [weight * (q - mean) for q, mean in zip(x, self.mean)]
    # mean: (1,...,1 data_dims)
    update_mean = [mean.assign_add(
        tf.reduce_mean(i, axis=self.chain_axes, keepdims=True))
        for mean, i in zip(self.mean, incr)]
    # var: (1,...,1 data_dims)
    new_var = [
        (1 - weight) * var +
        tf.reduce_mean(i * (q - mean), axis=self.chain_axes, keepdims=True)
        for var, i, q, mean in zip(self.var, incr, x, update_mean)]
    update_var = [tf.assign(var, n_var)
                  for var, n_var in zip(self.var, new_var)]
    return update_var
Example #12
Source File: layers.py From aboleth with Apache License 2.0
def __init__(self, n_features, lenscale=None, p=1, variational=False,
             learn_lenscale=False):
    """Create an instance of an arc cosine kernel layer."""
    # Setup random weights
    if variational:
        kern = RBFVariational(lenscale=lenscale,
                              learn_lenscale=learn_lenscale)
    else:
        kern = RBF(lenscale=lenscale, learn_lenscale=learn_lenscale)
    super().__init__(n_features=n_features, kernel=kern)

    # Kernel order
    assert isinstance(p, int) and p >= 0
    if p == 0:
        self.pfunc = tf.sign
    elif p == 1:
        self.pfunc = lambda x: x
    else:
        self.pfunc = lambda x: tf.pow(x, p)
Example #13
Source File: layers.py From basenji with Apache License 2.0
def call(self, inputs):
    input_shape = tf.shape(inputs)
    batch_size, seq_len = input_shape[0], input_shape[1]

    pos_range = tf.range(-seq_len // 2, seq_len // 2)
    if self.transform is None:
        pos_feature = pos_range
    elif self.transform == 'abs':
        pos_feature = tf.math.abs(pos_range)
    elif self.transform == 'reversed':
        pos_feature = pos_range[::-1]
    else:
        raise ValueError('Unknown ConcatPosition transform.')

    if self.power != 1:
        pos_feature = tf.pow(pos_feature, self.power)

    pos_feature = tf.expand_dims(pos_feature, axis=0)
    pos_feature = tf.expand_dims(pos_feature, axis=-1)
    pos_feature = tf.tile(pos_feature, [batch_size, 1, 1])
    pos_feature = tf.dtypes.cast(pos_feature, dtype=tf.float32)

    return tf.concat([pos_feature, inputs], axis=-1)
Example #14
Source File: losses.py From ros_people_object_detection_tensorflow with Apache License 2.0
def _compute_loss(self, prediction_tensor, target_tensor, weights,
                  class_indices=None):
    """Compute loss function.

    Args:
        prediction_tensor: A float tensor of shape [batch_size, num_anchors,
            num_classes] representing the predicted logits for each class
        target_tensor: A float tensor of shape [batch_size, num_anchors,
            num_classes] representing one-hot encoded classification targets
        weights: a float tensor of shape [batch_size, num_anchors]
        class_indices: (Optional) A 1-D integer tensor of class indices.
            If provided, computes loss only for the specified class indices.

    Returns:
        loss: a float tensor of shape [batch_size, num_anchors, num_classes]
            representing the value of the loss function.
    """
    weights = tf.expand_dims(weights, 2)
    if class_indices is not None:
        weights *= tf.reshape(
            ops.indices_to_dense_vector(class_indices,
                                        tf.shape(prediction_tensor)[2]),
            [1, 1, -1])
    per_entry_cross_ent = (tf.nn.sigmoid_cross_entropy_with_logits(
        labels=target_tensor, logits=prediction_tensor))
    prediction_probabilities = tf.sigmoid(prediction_tensor)
    p_t = ((target_tensor * prediction_probabilities) +
           ((1 - target_tensor) * (1 - prediction_probabilities)))
    modulating_factor = 1.0
    if self._gamma:
        modulating_factor = tf.pow(1.0 - p_t, self._gamma)
    alpha_weight_factor = 1.0
    if self._alpha is not None:
        alpha_weight_factor = (target_tensor * self._alpha +
                               (1 - target_tensor) * (1 - self._alpha))
    focal_cross_entropy_loss = (modulating_factor * alpha_weight_factor *
                                per_entry_cross_ent)
    return focal_cross_entropy_loss * weights
Example #15
Source File: network.py From SSH-TensorFlow with MIT License
def _smooth_l1_loss(self, bbox_pred, bbox_targets, bbox_inside_weights,
                    bbox_outside_weights, sigma=1.0, dim=[1]):
    sigma_2 = sigma ** 2
    box_diff = bbox_pred - bbox_targets
    in_box_diff = bbox_inside_weights * box_diff
    abs_in_box_diff = tf.abs(in_box_diff)
    smoothL1_sign = tf.stop_gradient(tf.to_float(tf.less(abs_in_box_diff, 1. / sigma_2)))
    in_loss_box = tf.pow(in_box_diff, 2) * (sigma_2 / 2.) * smoothL1_sign \
                  + (abs_in_box_diff - (0.5 / sigma_2)) * (1. - smoothL1_sign)
    out_loss_box = bbox_outside_weights * in_loss_box
    loss_box = tf.reduce_mean(tf.reduce_sum(
        out_loss_box,
        axis=dim
    ))
    return loss_box
Example #16
Source File: prioritized_replay.py From rlgraph with Apache License 2.0
def _graph_fn_update_records(self, indices, update):
    num_records = get_batch_size(indices)
    max_priority = 0.0

    # Update has to be sequential.
    def insert_body(i, max_priority_):
        priority = tf.pow(x=update[i], y=self.alpha)
        sum_insert = self.sum_segment_tree.insert(
            index=indices[i],
            element=priority,
            insert_op=tf.add
        )
        min_insert = self.min_segment_tree.insert(
            index=indices[i],
            element=priority,
            insert_op=tf.minimum
        )
        # Keep track of current max priority element.
        max_priority_ = tf.maximum(x=max_priority_, y=priority)

        with tf.control_dependencies(control_inputs=[tf.group(sum_insert, min_insert)]):
            # TODO: This confuses the auto-return value detector.
            return i + 1, max_priority_

    def cond(i, max_priority_):
        return i < num_records - 1

    _, max_priority = tf.while_loop(
        cond=cond,
        body=insert_body,
        loop_vars=(0, max_priority)
    )

    assignment = self.assign_variable(ref=self.max_priority, value=max_priority)
    with tf.control_dependencies(control_inputs=[assignment]):
        return tf.no_op()
Example #17
Source File: inputs.py From cloudml-samples with Apache License 2.0
def process_features(features):
    """ Use to implement custom feature engineering logic.
    Default behaviour is to return the original feature tensors dictionary as-is.

    Args:
        features: {string: tensors} - dictionary of feature tensors
    Returns:
        {string: tensors}: extended feature tensors dictionary
    """
    # Examples - given:
    #   'x' and 'y' are two numeric features;
    #   'alpha' and 'beta' are two categorical features.
    #
    # Create new features using custom logic:
    # features['x_2'] = tf.pow(features['x'], 2)
    # features['y_2'] = tf.pow(features['y'], 2)
    # features['xy'] = features['x'] * features['y']
    # features['sin_x'] = tf.sin(features['x'])
    # features['cos_y'] = tf.cos(features['y'])
    # features['log_xy'] = tf.log(features['xy'])
    # features['sqrt_xy'] = tf.sqrt(features['xy'])
    #
    # Add created features to the metadata (if not already defined in metadata.py):
    # NUMERIC_FEATURE_NAMES_WITH_STATS['x_2'] = None
    # NUMERIC_FEATURE_NAMES_WITH_STATS['y_2'] = None
    # ...

    return features
Example #18
Source File: optim.py From glow with MIT License
def adam(params, cost_or_grads, alpha=3e-4, hps=None, epsilon=1e-8):
    updates = []
    if type(cost_or_grads) is not list:
        gs = tf.gradients(cost_or_grads, params)
    else:
        gs = cost_or_grads

    beta2 = 1 - 1. / (hps.train_its * hps.polyak_epochs)

    # all-reduce
    grads = [Z.allreduce_mean(g) for g in gs]

    t = tf.Variable(1., 'adam_t')
    alpha_t = alpha * tf.sqrt((1. - tf.pow(beta2, t))) / \
        (1. - tf.pow(hps.beta1, t))
    updates.append(t.assign_add(1))

    for w, g in zip(params, grads):
        mom2 = tf.Variable(tf.zeros(w.get_shape()), w.name + '_adam_m2')
        if hps.beta1 > 0:
            mom1 = tf.Variable(tf.zeros(w.get_shape()), w.name + '_adam_m1')
            mom1_new = hps.beta1 * mom1 + (1. - hps.beta1) * g
            updates.append(mom1.assign(mom1_new))
        else:
            mom1_new = g
        m2_new = beta2 * mom2 + (1. - beta2) * tf.square(g)
        delta_t = mom1_new / (tf.sqrt(m2_new) + epsilon)
        w_new = hps.weight_decay * w - alpha_t * delta_t
        updates.append(mom2.assign(m2_new))
        updates.append(w.assign(w_new))

    # Polyak averaging
    polyak_avg_op, polyak_swap_op, ema = polyak(params, beta2)
    train_op = tf.group(polyak_avg_op, *updates)
    return train_op, polyak_swap_op, ema
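The alpha_t expression folds both of Adam's bias-correction factors into the step size. A standalone sketch of just that correction, with hypothetical hyperparameters:

import tensorflow as tf

alpha, beta1, beta2 = 3e-4, 0.9, 0.999
t = tf.placeholder(tf.float32, [])
# Bias-corrected step size; approaches alpha as t grows.
alpha_t = alpha * tf.sqrt(1. - tf.pow(beta2, t)) / (1. - tf.pow(beta1, t))

with tf.Session() as sess:
    for step in [1., 10., 1000.]:
        print(step, sess.run(alpha_t, feed_dict={t: step}))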
Example #19
Source File: siamese_net.py From atec-nlp with MIT License
def contrastive_loss(self, y, e):
    # margin and pos_weight can directly influence P and R metrics.
    l_1 = self._contrastive_loss_pos_weight * tf.pow(1 - e, 2)
    l_0 = tf.square(tf.maximum(e - self._margin, 0))
    loss = tf.reduce_mean(y * l_1 + (1 - y) * l_0)
    return loss
Example #20
Source File: ops.py From HyperGAN with MIT License
def gelu(self, x):
    return 0.5 * x * (1 + tf.nn.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))
Example #21
Source File: model.py From rgn with MIT License
def _curriculum(config, step, loss_history, dependency_ops):
    """ Creates TF ops for maintaining and advancing the curriculum. """

    # assign appropriate curriculum increment value
    for case in switch(config['behavior']):
        if case('fixed_rate'):
            # fixed rate, always return same number
            increment = tf.constant(config['rate'], name='curriculum_increment')
        elif case('loss_threshold'):
            # return fixed increment if last loss is below threshold, zero otherwise
            increment_pred = tf.less(loss_history[-1], config['threshold'],
                                     name='curriculum_predicate')
            full_increment_func = lambda: tf.constant(config['rate'],
                                                      name='full_curriculum_increment')
            zero_increment_func = lambda: tf.constant(0.0,
                                                      name='zero_curriculum_increment')
            increment = tf.cond(increment_pred, full_increment_func, zero_increment_func)
        elif case('loss_change'):
            # predicate for increment type
            increment_pred = tf.not_equal(loss_history[0], DUMMY_LOSS,
                                          name='curriculum_predicate')

            # increment function for when loss history is still
            def full_increment_func():
                lin_seq = tf.expand_dims(tf.linspace(0., 1., config['change_num_iterations']), 1)
                ls_matrix = tf.concat([tf.ones_like(lin_seq), lin_seq], 1)
                ls_rhs = tf.expand_dims(loss_history, 1)
                ls_slope = tf.matrix_solve_ls(ls_matrix, ls_rhs)[1, 0]

                full_increment = tf.div(config['rate'],
                                        tf.pow(tf.abs(ls_slope) + 1, config['sharpness']),
                                        name='full_curriculum_increment')
                return full_increment

            # dummy increment function for when loss history is changing rapidly
            zero_increment_func = lambda: tf.constant(0.0, name='zero_curriculum_increment')

            # final conditional increment
            increment = tf.cond(increment_pred, full_increment_func, zero_increment_func)

    # create updating op. the semantics are such that training / gradient update
    # is first performed before the curriculum is incremented.
    with tf.control_dependencies(dependency_ops):
        update_op = tf.assign_add(step, increment, name='update_curriculum_op')

    return update_op
Example #22
Source File: modeling.py From bert-for-tf2 with MIT License
def gelu(x):
    """Gaussian Error Linear Unit.

    This is a smoother version of the RELU.
    Original paper: https://arxiv.org/abs/1606.08415
    Args:
        x: float Tensor to perform activation.

    Returns:
        `x` with the GELU activation applied.
    """
    cdf = 0.5 * (1.0 + tf.tanh(
        (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))))
    return x * cdf
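The tanh expression is an approximation; a hypothetical sanity check against the exact erf-based GELU, assuming the gelu above is in scope:

import numpy as np
import tensorflow as tf

def gelu_exact(x):
    # Exact GELU: x * Phi(x), with Phi the standard normal CDF.
    return 0.5 * x * (1.0 + tf.erf(x / tf.sqrt(2.0)))

x = tf.constant(np.linspace(-3.0, 3.0, 7), dtype=tf.float32)
max_diff = tf.reduce_max(tf.abs(gelu(x) - gelu_exact(x)))

with tf.Session() as sess:
    print(sess.run(max_diff))  # the two curves agree closely on this range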
Example #23
Source File: dop853.py From astroNN with MIT License
def hinit(func, x, t, pos_neg, f0, iord, hmax, rtol, atol, args):
    """
    Estimate initial step size
    """
    sk = atol + rtol * tf.abs(x)
    dnf = tf.reduce_sum(tf.square(f0 / sk), axis=0)
    dny = tf.reduce_sum(tf.square(x / sk), axis=0)
    h = tf.sqrt(dny / dnf) * 0.01
    h = tf.reduce_min([h, tf.abs(hmax)])
    h = custom_sign(h, pos_neg)

    # perform an explicit Euler step
    xx1 = x + h * f0
    f1 = func(xx1, t[0] + h, *args)

    # estimate the second derivative of the solution
    der2 = tf.reduce_sum(tf.square((f1 - f0) / sk), axis=0)
    der2 = tf.sqrt(der2) / h

    # step size is computed such that h ** iord * max_d(norm(f0), norm(der2)) = 0.01
    der12 = tf.reduce_max([tf.abs(der2), tf.sqrt(dnf)])
    h1 = tf.pow(0.01 / der12, 1.0 / iord)
    h = tf.reduce_min([100.0 * tf.abs(h), tf.reduce_min([tf.abs(h1), tf.abs(hmax)])])

    return custom_sign(h, pos_neg), f0, f1, xx1
Example #24
Source File: optim.py From glow with MIT License
def adam2(params, cost_or_grads, alpha=3e-4, hps=None, epsilon=1e-8):
    updates = []
    if type(cost_or_grads) is not list:
        gs = tf.gradients(cost_or_grads, params)
    else:
        gs = cost_or_grads

    beta2 = 1 - 1. / (hps.train_its * hps.polyak_epochs)

    # all-reduce
    grads1 = [Z.allreduce_mean(g) for g in gs]
    grads2 = [Z.allreduce_mean(g ** 2) for g in gs]

    t = tf.Variable(1., 'adam_t')
    alpha_t = alpha * tf.sqrt((1. - tf.pow(beta2, t))) / \
        (1. - tf.pow(hps.beta1, t))
    updates.append(t.assign_add(1))

    for w, g1, g2 in zip(params, grads1, grads2):
        mom2 = tf.Variable(tf.zeros(w.get_shape()), w.name + '_adam_m2')
        if hps.beta1 > 0:
            mom1 = tf.Variable(tf.zeros(w.get_shape()), w.name + '_adam_m1')
            mom1_new = hps.beta1 * mom1 + (1. - hps.beta1) * g1
            updates.append(mom1.assign(mom1_new))
        else:
            mom1_new = g1
        m2_new = beta2 * mom2 + (1. - beta2) * g2
        delta_t = mom1_new / (tf.sqrt(m2_new) + epsilon)
        w_new = hps.weight_decay * w - alpha_t * delta_t
        updates.append(mom2.assign(m2_new))
        updates.append(w.assign(w_new))

    # Polyak averaging
    polyak_avg_op, polyak_swap_op, ema = polyak(params, beta2)
    train_op = tf.group(polyak_avg_op, *updates)
    return train_op, polyak_swap_op, ema
Example #25
Source File: optim.py From glow with MIT License
def adam2_old(params, cost_or_grads, lr=3e-4, mom1=0.9, mom2=0.999,
              epsilon=1e-8):
    updates = []
    if type(cost_or_grads) is not list:
        gs = tf.gradients(cost_or_grads, params)
    else:
        gs = cost_or_grads

    # all-reduce
    grads1 = [Z.allreduce_mean(g) for g in gs]
    grads2 = [Z.allreduce_mean(tf.square(g)) for g in gs]
    mom2 = tf.maximum(0., 1. - (hvd.size() * (1 - mom2)))

    t = tf.Variable(1., 'adam_t')
    lr_t = lr * tf.sqrt((1. - tf.pow(mom2, t))) / (1. - tf.pow(mom1, t))
    updates.append(t.assign_add(1))

    for p, g1, g2 in zip(params, grads1, grads2):
        mg = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_mg')
        if mom1 > 0:
            v = tf.Variable(tf.zeros(p.get_shape()), p.name + '_adam_v')
            v_t = mom1 * v + (1. - mom1) * g1
            updates.append(v.assign(v_t))
        else:
            v_t = g1
        mg_t = mom2 * mg + (1. - mom2) * g2
        delta_t = v_t / (tf.sqrt(mg_t) + epsilon)
        p_t = p - lr_t * delta_t
        updates.append(mg.assign(mg_t))
        updates.append(p.assign(p_t))

    return tf.group(*updates)
Example #26
Source File: model.py From lm-human-preferences with MIT License
def gelu(x):
    with tf.name_scope('gelu'):
        return 0.5 * x * (1 + tf.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))
Example #27
Source File: optim.py From glow with MIT License
def adamax(params, cost_or_grads, alpha=3e-4, hps=None, epsilon=1e-8):
    updates = []
    if type(cost_or_grads) is not list:
        gs = tf.gradients(cost_or_grads, params)
    else:
        gs = cost_or_grads

    beta2 = 1 - 1. / (hps.train_its * hps.polyak_epochs)

    # all-reduce
    grads = [Z.allreduce_mean(g) for g in gs]

    t = tf.Variable(1., 'adam_t')
    alpha_t = alpha * tf.sqrt((1. - tf.pow(beta2, t))) / \
        (1. - tf.pow(hps.beta1, t))
    updates.append(t.assign_add(1))

    for w, g in zip(params, grads):
        mom2 = tf.Variable(tf.zeros(w.get_shape()), w.name + '_adam_m2')
        if hps.beta1 > 0:
            mom1 = tf.Variable(tf.zeros(w.get_shape()), w.name + '_adam_m1')
            mom1_new = hps.beta1 * mom1 + (1. - hps.beta1) * g
            updates.append(mom1.assign(mom1_new))
        else:
            mom1_new = g
        m2_new = tf.maximum(beta2 * mom2, abs(g))
        delta_t = mom1_new / (m2_new + epsilon)
        w_new = hps.weight_decay * w - alpha_t * delta_t
        updates.append(mom2.assign(m2_new))
        updates.append(w.assign(w_new))

    # Polyak averaging
    polyak_avg_op, polyak_swap_op, ema = polyak(params, beta2)
    train_op = tf.group(polyak_avg_op, *updates)
    return train_op, polyak_swap_op, ema
Example #28
Source File: losses.py From MultiKE with MIT License
def orthogonal_loss(mapping, eye):
    loss = tf.reduce_sum(tf.reduce_sum(
        tf.pow(tf.matmul(mapping, mapping, transpose_b=True) - eye, 2), 1))
    return loss
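A hypothetical usage sketch, assuming the orthogonal_loss above is in scope. The loss is the squared Frobenius norm of M M^T - I, so it is zero exactly when the mapping is orthogonal:

import tensorflow as tf

dim = 4
mapping = tf.Variable(tf.random_normal([dim, dim]))
eye = tf.eye(dim)
loss = orthogonal_loss(mapping, eye)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(loss))  # near zero only for (approximately) orthogonal mappings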