Python tensorflow.clip_by_global_norm() Examples

The following are 30 code examples of tensorflow.clip_by_global_norm(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the tensorflow module.
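All of the examples follow the same basic pattern: compute the gradients, rescale them jointly so that their global norm does not exceed a threshold, and apply them. A minimal sketch of that pattern (assuming a TF 1.x graph in which a scalar loss tensor is already defined; clip_norm=5.0 is an arbitrary illustrative value):

import tensorflow as tf

tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
# Each gradient is scaled by clip_norm / max(global_norm, clip_norm), where
# global_norm = sqrt(sum of the squared L2 norms of all gradients); the
# gradients are returned unchanged when global_norm <= clip_norm.
clipped, global_norm = tf.clip_by_global_norm(grads, clip_norm=5.0)
train_op = tf.train.AdamOptimizer(1e-3).apply_gradients(zip(clipped, tvars))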
Example #1
Source File: optimizer.py    From BERT with Apache License 2.0
def grad_clip_fn(self, loss, tvars, **kargs):
		grads = tf.gradients(loss, tvars)
		grad_clip = self.config.get("grad_clip", "global_norm")
		tf.logging.info(" gradient clip method {}".format(grad_clip))
		if grad_clip == "global_norm":
			clip_norm = self.config.get("clip_norm", 1.0)
			[grads, _] = tf.clip_by_global_norm(grads, 
								clip_norm=clip_norm)
		elif grad_clip == "norm":
			clip_norm = self.config.get("clip_norm", 1.0)
			grads = [tf.clip_by_norm(grad, clip_norm) for grad in grads]
		elif grad_clip == "value":
			clip_min_value = self.config.get("clip_min_value", -1.0)
			clip_max_value = self.config.get("clip_max_value", 1.0)
			grads = [tf.clip_by_value(grad, clip_min_value, clip_max_value) for grad in grads]
		else:
			grads = grads
		return grads 
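The clipping mode is selected by the "grad_clip" key of the optimizer config. A sketch of the three settings this function recognizes (key names taken from the code above; the values shown are the defaults it falls back to):

config = {"grad_clip": "global_norm", "clip_norm": 1.0}    # joint rescaling of all gradients
config = {"grad_clip": "norm", "clip_norm": 1.0}           # per-tensor L2 clipping
config = {"grad_clip": "value",
          "clip_min_value": -1.0, "clip_max_value": 1.0}   # element-wise clamping
# Any other value leaves the gradients unclipped.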
Example #2
Source File: dqn.py    From TransferRL with MIT License
def _add_train_op(self):
        # In regression, the objective loss is Mean Squared Error (MSE).
        self.loss = tf.losses.mean_squared_error(labels = self._y, predictions = self.output)

        tvars = tf.trainable_variables()
        gradients = tf.gradients(self.loss, tvars, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

        # Clip the gradients
        with tf.device("/gpu:{}".format(self._hps.dqn_gpu_num)):
            grads, global_norm = tf.clip_by_global_norm(gradients, self._hps.max_grad_norm)

        # Add a summary
        tf.summary.scalar('global_norm', global_norm)

        # Apply the Adam optimizer
        optimizer = tf.train.AdamOptimizer(self._hps.lr)
        with tf.device("/gpu:{}".format(self._hps.dqn_gpu_num)):
            self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step, name='train_step')

        self.variable_summaries('dqn_loss',self.loss) 
Example #3
Source File: seq2seq_attention_model.py    From DOTA_models with Apache License 2.0
def _add_train_op(self):
    """Sets self._train_op, op to run for training."""
    hps = self._hps

    self._lr_rate = tf.maximum(
        hps.min_lr,  # min_lr_rate.
        tf.train.exponential_decay(hps.lr, self.global_step, 30000, 0.98))

    tvars = tf.trainable_variables()
    with tf.device(self._get_gpu(self._num_gpus-1)):
      grads, global_norm = tf.clip_by_global_norm(
          tf.gradients(self._loss, tvars), hps.max_grad_norm)
    tf.summary.scalar('global_norm', global_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr_rate)
    tf.summary.scalar('learning rate', self._lr_rate)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step') 
Example #4
Source File: agent.py    From ppo-lstm-parallel with MIT License
def get_train_op(self, loss, clip_factor, clip, step):
        import tensorflow as tf
        optimizer = tf.train.AdamOptimizer(learning_rate=step)
        gradients, variables = zip(*optimizer.compute_gradients(loss))
        filtered_grads = []
        filtered_vars = []
        for i in range(len(gradients)):
            if gradients[i] is not None:
                filtered_grads.append(gradients[i])
                filtered_vars.append(variables[i])
        gradients = filtered_grads
        variables = filtered_vars
        if clip:
            gradients, _ = tf.clip_by_global_norm(gradients, clip_factor)
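        # Note: the sum of per-tensor norms computed below is an upper bound on
        # the true global norm (cf. tf.global_norm), not the global norm itself.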
        grad_norm = tf.reduce_sum([tf.norm(grad) for grad in gradients])
        train_op = optimizer.apply_gradients(zip(gradients, variables))
        return optimizer, train_op, grad_norm 
Example #5
Source File: adem_graphs.py    From ADEM with MIT License
def adem(context_vector, model_response_vector, reference_response_vector,
         context_dim, model_response_dim, reference_response_dim,
         human_score_place, lr, max_grad_norm):
    model_score, M, N = tf_dynamic_adem_score(
        context=context_vector,
        model_response=model_response_vector,
        reference_response=reference_response_vector,
        shape_info={'batch_size': None,
                    'ct_dim': context_dim,
                    'mr_dim': model_response_dim,
                    'rr_dim': reference_response_dim})

    loss = compute_adem_l1_loss(human_score_place, model_score, M, N)

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(loss, tvars), max_grad_norm)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step()
    )
    return train_op, loss, model_score 
Example #6
Source File: train.py    From hierarchical-attention-networks with MIT License
def train_fn(loss):
  trained_vars = tf.trainable_variables()
  count_parameters(trained_vars)

  # Gradient clipping
  gradients = tf.gradients(loss, trained_vars)

  clipped_grads, global_norm = tf.clip_by_global_norm(gradients, FLAGS.max_grad_norm)
  tf.summary.scalar('global_grad_norm', global_norm)

  # Add gradients and vars to summary
  # for gradient, var in list(zip(clipped_grads, trained_vars)):
  #   if 'attention' in var.name:
  #     tf.summary.histogram(var.name + '/gradient', gradient)
  #     tf.summary.histogram(var.name, var)

  # Define optimizer
  global_step = tf.train.get_or_create_global_step()
  optimizer = tf.train.RMSPropOptimizer(FLAGS.learning_rate)
  train_op = optimizer.apply_gradients(zip(clipped_grads, trained_vars),
                                       name='train_op',
                                       global_step=global_step)
  return train_op, global_step 
Example #7
Source File: policy_gradient.py    From EasyRL with Apache License 2.0
def _build_train(self, loss, optimizer, vars=None, global_step=None):

        grads_and_vars = optimizer.compute_gradients(loss=loss, var_list=vars)
        grads_and_vars = [(grad, var) for grad, var in grads_and_vars
                          if grad is not None]

        # apply grad clipping
        grads, vars = zip(*grads_and_vars)
        clipped_grads, _ = tf.clip_by_global_norm(
            grads, clip_norm=self.config.get('global_norm_clip', 40))
        grads_and_vars = list(zip(clipped_grads, vars))

        train_op = optimizer.apply_gradients(
            grads_and_vars, global_step=global_step)

        return train_op 
Example #8
Source File: batch_dqn.py    From EasyRL with Apache License 2.0
def _build_train(self, loss, optimizer, vars, global_step=None):
        """
        construct the operation for optimization.

        Arguments:
            loss: the objective loss function to minimize
            optimizer: the optimizer used to perform the optimization
            vars: the variables available to optimize
            global_step: records the total number of optimization steps
        """

        # compute gradients
        grads_and_vars = optimizer.compute_gradients(loss=loss, var_list=vars)
        grads_and_vars = [(grad, var) for grad, var in grads_and_vars
                          if grad is not None]

        # apply grad clipping
        grads, vars = zip(*grads_and_vars)
        clipped_grads, _ = tf.clip_by_global_norm(
            grads, clip_norm=self.config.get('global_norm_clip', 40))
        grads_and_vars = list(zip(clipped_grads, vars))

        train_op = optimizer.apply_gradients(
            grads_and_vars, global_step=global_step)
        return train_op 
Example #9
Source File: tripletext2seq.py    From Zeroshot-QuestionGeneration with MIT License
def __create_optimizer(self):
        print('creating optimizer...')
        start = time.time()

        learning_rate = tf.train.exponential_decay(self.config.LR, self.global_step, 200, 0.97, staircase=True)
        self.opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
        # self.opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)

        # rescale the gradients of the parameter vector whenever its L2 norm exceeds a certain threshold
        trainable_params = tf.trainable_variables()

        # calculate gradients of the loss given all the trainable parameters
        gradients = tf.gradients(self.loss, trainable_params)

        # Gradient clipping: new_gradients = gradients * threshold / l2_norm(gradients), applied only when l2_norm(gradients) > threshold
        clip_gradients, _ = tf.clip_by_global_norm(gradients, self.config.MAX_GRAD_NORM)

        self.updates = self.opt.apply_gradients(zip(clip_gradients, trainable_params), global_step=self.global_step)

        print('Building optimizer in: ', time.time() - start, ' secs') 
Example #10
Source File: triples2seq.py    From Zeroshot-QuestionGeneration with MIT License
def __create_optimizer(self):
        print('creating optimizer...')
        start = time.time()

        learning_rate = tf.train.exponential_decay(self.config.LR, self.global_step, 200, 0.97, staircase=True)
        self.opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate)

        # learning_rate = tf.train.exponential_decay(self.config.LR, self.global_step, 100, 0.96, staircase=True)

        # self.opt = tf.train.RMSPropOptimizer(learning_rate=learning_rate)
        # self.opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)

        # rescale the gradients of the parameter vector whenever its L2 norm exceeds a certain threshold
        trainable_params = tf.trainable_variables()

        # calculate gradients of the loss given all the trainable parameters
        gradients = tf.gradients(self.loss, trainable_params)

        # Gradient clipping: new_gradients = gradients * threshold / l2_norm(gradients), applied only when l2_norm(gradients) > threshold
        clip_gradients, _ = tf.clip_by_global_norm(gradients, self.config.MAX_GRAD_NORM)

        self.updates = self.opt.apply_gradients(zip(clip_gradients, trainable_params), global_step=self.global_step)

        print('Building optimizer in: ', time.time() - start, ' secs') 
Example #11
Source File: model_updater.py    From nematus with BSD 3-Clause "New" or "Revised" License
def _define_apply_ops(self):
        """Defines the graph nodes for applying the accumulated gradients."""

        final_loss = self._accumulated_loss

        final_grad_vars = [(self._accumulated_gradients[key],
                            self._trainables[key])
                           for key in self._trainables.keys()]

        if self._config.clip_c > 0.0:
            grads, varss = list(zip(*final_grad_vars))
            clipped_grads, global_norm = tf.clip_by_global_norm(
                grads, clip_norm=self._config.clip_c)
            # Might be interesting to see how the global norm changes over
            # time, attach a summary?
            final_grad_vars = list(zip(clipped_grads, varss))

        apply_grads = self._optimizer.apply_gradients(
            final_grad_vars,
            global_step=self._global_step)

        self._apply_ops = [self._global_step, apply_grads, final_loss] 
Example #12
Source File: tacotron.py    From vae_tacotron with MIT License
def add_optimizer(self, global_step):
    '''Adds optimizer. Sets "gradients" and "optimize" fields. add_loss must have been called.

    Args:
      global_step: int32 scalar Tensor representing current global step in training
    '''
    with tf.variable_scope('optimizer') as scope:
      hp = self._hparams
      if hp.decay_learning_rate:
        self.learning_rate = _learning_rate_decay(hp.initial_learning_rate, global_step)
      else:
        self.learning_rate = tf.convert_to_tensor(hp.initial_learning_rate)
      optimizer = tf.train.AdamOptimizer(self.learning_rate, hp.adam_beta1, hp.adam_beta2)
      gradients, variables = zip(*optimizer.compute_gradients(self.loss))
      self.gradients = gradients
      clipped_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)

      # Add dependency on UPDATE_OPS; otherwise batchnorm won't work correctly. See:
      # https://github.com/tensorflow/tensorflow/issues/1122
      with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        self.optimize = optimizer.apply_gradients(zip(clipped_gradients, variables),
          global_step=global_step) 
Example #13
Source File: dqn.py    From RLSeq2Seq with MIT License
def _add_train_op(self):
        # In regression, the objective loss is Mean Squared Error (MSE).
        self.loss = tf.losses.mean_squared_error(labels = self._y, predictions = self.output)

        tvars = tf.trainable_variables()
        gradients = tf.gradients(self.loss, tvars, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

        # Clip the gradients
        with tf.device("/gpu:{}".format(self._hps.dqn_gpu_num)):
            grads, global_norm = tf.clip_by_global_norm(gradients, self._hps.max_grad_norm)

        # Add a summary
        tf.summary.scalar('global_norm', global_norm)

        # Apply the Adam optimizer
        optimizer = tf.train.AdamOptimizer(self._hps.lr)
        with tf.device("/gpu:{}".format(self._hps.dqn_gpu_num)):
            self.train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step, name='train_step')

        self.variable_summaries('dqn_loss',self.loss) 
Example #14
Source File: hvd_distributed_optimizer.py    From BERT with Apache License 2.0
def grad_clip_fn(self, opt, loss, tvars, **kargs):
		grads_and_vars = opt.compute_gradients(loss, tvars)
		grads = [grad for grad, _ in grads_and_vars]
		grad_clip = self.config.get("grad_clip", "global_norm")
		tf.logging.info(" gradient clip method {}".format(grad_clip))
		if grad_clip == "global_norm":
			clip_norm = self.config.get("clip_norm", 1.0)
			[grads, _] = tf.clip_by_global_norm(grads, 
								clip_norm=clip_norm)
		elif grad_clip == "norm":
			clip_norm = self.config.get("clip_norm", 1.0)
			grads = [tf.clip_by_norm(grad, clip_norm) for grad in grads]
		elif grad_clip == "value":
			clip_min_value = self.config.get("clip_min_value", -1.0)
			clip_max_value = self.config.get("clip_max_value", 1.0)
			grads = [tf.clip_by_value(grad, clip_min_value, clip_max_value) for grad in grads]
		else:
			grads = grads
		return grads 
Example #15
Source File: agent.py    From async-deeprl with MIT License
def __init__(self, session, action_size, h, w, channels, opt=tf.train.AdamOptimizer(1e-4)):
        """Creates Q-Learning agent
        :param session: tensorflow session
        :param action_size: (int) length of action space
        :param h: (int) input image height
        :param w: (int) input image width
        :param channels: (int) number of image channels
        :param opt: tensorflow optimizer (by default: Adam optimizer)"""
        self.action_size = action_size
        self.opt = opt
        self.global_step = tf.Variable(0, name='frame', trainable=False)
        self.frame_inc_op = self.global_step.assign_add(1, use_locking=True)
        K.set_session(session)
        self.sess = session
        with tf.variable_scope('network'):
            self.action = tf.placeholder('int32', [None], name='action')
            self.reward = tf.placeholder('float32', [None], name='reward')
            model, self.state, self.q_values = self._build_model(h, w, channels)
            self.weights = model.trainable_weights
        with tf.variable_scope('optimizer'):
            # Zero all actions, except one that was performed
            action_onehot = tf.one_hot(self.action, self.action_size, 1.0, 0.0)
            # Predict expected future reward for performed action
            q_value = tf.reduce_sum(tf.multiply(self.q_values, action_onehot), reduction_indices=1)
            # Define mean squared error loss: mean((y - y_)^2)
            self.loss = tf.reduce_mean(tf.square(self.reward - q_value))
            # Compute gradients w.r.t. weights
            grads = tf.gradients(self.loss, self.weights)
            # Apply gradient norm clipping
            grads, _ = tf.clip_by_global_norm(grads, 40.)
            grads_vars = list(zip(grads, self.weights))
            self.train_op = opt.apply_gradients(grads_vars)
        with tf.variable_scope('target_network'):
            target_m, self.target_state, self.target_q_values = self._build_model(h, w, channels)
            target_w = target_m.trainable_weights
        with tf.variable_scope('target_update'):
            self.target_update = [target_w[i].assign(self.weights[i])
                                  for i in range(len(target_w))] 
Example #16
Source File: kfac.py    From stable-baselines with MIT License
def apply_gradients(self, grads):
        """
        apply the gradient

        :param grads: ([TensorFlow Tensor]) the gradient
        :return: (function, QueueRunner) train operation, queue operation runner
        """
        cold_optim = tf.train.MomentumOptimizer(self._cold_lr, self._momentum)

        def _cold_sgd_start():
            sgd_grads, sgd_var = zip(*grads)

            if self.max_grad_norm is not None:
                sgd_grads, _ = tf.clip_by_global_norm(sgd_grads, self.max_grad_norm)

            sgd_grads = list(zip(sgd_grads, sgd_var))

            sgd_step_op = tf.assign_add(self.sgd_step, 1)
            cold_optim_op = cold_optim.apply_gradients(sgd_grads)
            if KFAC_DEBUG:
                with tf.control_dependencies([sgd_step_op, cold_optim_op]):
                    sgd_step_op = tf.Print(
                        sgd_step_op, [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')])
            return tf.group(*[sgd_step_op, cold_optim_op])

        # remove unused variables
        grads = [(grad, var) for (grad, var) in grads if grad is not None]

        kfac_optim_op, queue_runner = self.apply_gradients_kfac(grads)

        def _warm_kfac_start():
            return kfac_optim_op

        return tf.cond(tf.greater(self.sgd_step, self._cold_iter), _warm_kfac_start, _cold_sgd_start), queue_runner 
Example #17
Source File: kfac.py    From rl_graph_generation with BSD 3-Clause "New" or "Revised" License
def apply_gradients(self, grads):
        coldOptim = tf.train.MomentumOptimizer(
            self._cold_lr, self._momentum)

        def coldSGDstart():
            sgd_grads, sgd_var = zip(*grads)

            if self.max_grad_norm is not None:
                sgd_grads, sgd_grad_norm = tf.clip_by_global_norm(sgd_grads,self.max_grad_norm)

            sgd_grads = list(zip(sgd_grads,sgd_var))

            sgd_step_op = tf.assign_add(self.sgd_step, 1)
            coldOptim_op = coldOptim.apply_gradients(sgd_grads)
            if KFAC_DEBUG:
                with tf.control_dependencies([sgd_step_op, coldOptim_op]):
                    sgd_step_op = tf.Print(
                        sgd_step_op, [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')])
            return tf.group(*[sgd_step_op, coldOptim_op])

        kfacOptim_op, qr = self.apply_gradients_kfac(grads)

        def warmKFACstart():
            return kfacOptim_op

        return tf.cond(tf.greater(self.sgd_step, self._cold_iter), warmKFACstart, coldSGDstart), qr 
Example #18
Source File: optimization_utils.py    From nucleus7 with Mozilla Public License 2.0
def clip_grads_and_vars(grads_and_vars: _GRAD_AND_VARS_TYPE,
                        gradient_clip: float,
                        gradient_l2_norm: Optional[tf.Tensor] = None
                        ) -> _GRAD_AND_VARS_TYPE:
    """
    Clip all the gradients according to the global norm, using gradient_clip as the threshold

    Parameters
    ----------
    grads_and_vars
        list of (gradient, variable)
    gradient_clip
        value to clip
    gradient_l2_norm
        gradient l2 norm used for the gradient clipping

    Returns
    -------
    grads_and_vars
        list of (clipped gradient, variable)
    """
    grads, variables = zip(*grads_and_vars)
    if gradient_l2_norm is None:
        grads_clipped, gradient_l2_norm = tf.clip_by_global_norm(
            grads, gradient_clip)
    else:
        grads_clipped = [each_grad * gradient_clip
                         / tf.maximum(gradient_l2_norm, gradient_clip)
                         for each_grad in grads]
    grads_and_vars_clipped = list(zip(grads_clipped, variables))
    return grads_and_vars_clipped 
Example #19
Source File: policy_value_network_tf2.py    From cchess-zero with MIT License
def train_step(self, positions, pi, z, learning_rate=0):
        # Record the operations used to compute the loss, so that the gradient
        # of the loss with respect to the variables can be computed.
        #         metrics = 0

        with tf.GradientTape() as tape:
            policy_head, value_head = self.model(positions, training=True)
            loss = self.compute_loss(pi, z, policy_head, value_head)
            # self.ComputeMetrics(y, logits)
            metrics = self.compute_metrics(pi, policy_head)
        grads = tape.gradient(loss, self.model.trainable_variables)

        # grads = self.average_gradients(tower_grads)
        # grads = self.optimizer.compute_gradients(self.loss)
        # defensive step 2 to clip norm
        # grads0_lst = tf.map_fn(lambda x: x[0], grads)  # [g for g, _ in grads]
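        # Here self.global_norm is the clipping threshold passed to
        # tf.clip_by_global_norm; the computed norm is stored in self.norm.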
        clipped_grads, self.norm = tf.clip_by_global_norm(grads, self.global_norm)

        # defensive step 3 check NaN
        # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
        grad_check = [tf.debugging.check_numerics(g, message='NaN Found!') for g in clipped_grads]
        with tf.control_dependencies(grad_check):
            self.optimizer.apply_gradients(
                zip(clipped_grads, self.model.trainable_variables),  # [v for _, v in grads]
                global_step=self.global_step, name='train_step')

        if self.is_logging:
            for grad, var in zip(grads, self.model.trainable_variables):
                if grad is not None:
                    summary_ops_v2.histogram(var.name + '/gradients', grad)
            for var in self.model.trainable_variables:
                summary_ops_v2.histogram(var.name, var)

        return metrics, loss, self.global_step

    #@profile 
Example #20
Source File: ppo.py    From fine-lm with MIT License
def define_ppo_step(data_points, optimizer, hparams):
  """Define ppo step."""
  observation, action, discounted_reward, norm_advantage, old_pdf = data_points
  new_policy_dist, new_value, _ = get_policy(observation, hparams)
  new_pdf = new_policy_dist.prob(action)

  ratio = new_pdf / old_pdf
  clipped_ratio = tf.clip_by_value(ratio, 1 - hparams.clipping_coef,
                                   1 + hparams.clipping_coef)

  surrogate_objective = tf.minimum(clipped_ratio * norm_advantage,
                                   ratio * norm_advantage)
  policy_loss = -tf.reduce_mean(surrogate_objective)

  value_error = new_value - discounted_reward
  value_loss = hparams.value_loss_coef * tf.reduce_mean(value_error ** 2)

  entropy = new_policy_dist.entropy()
  entropy_loss = -hparams.entropy_loss_coef * tf.reduce_mean(entropy)

  losses = [policy_loss, value_loss, entropy_loss]

  gradients = [list(zip(*optimizer.compute_gradients(loss)))
               for loss in losses]

  gradients_norms = [tf.global_norm(gradient[0]) for gradient in gradients]

  gradients_flat = sum([gradient[0] for gradient in gradients], ())
  gradients_variables_flat = sum([gradient[1] for gradient in gradients], ())

  if hparams.max_gradients_norm:
    gradients_flat, _ = tf.clip_by_global_norm(gradients_flat,
                                               hparams.max_gradients_norm)

  optimize_op = optimizer.apply_gradients(zip(gradients_flat,
                                              gradients_variables_flat))

  with tf.control_dependencies([optimize_op]):
    return [tf.identity(x) for x in losses + gradients_norms] 
Example #21
Source File: model.py    From cs294-112_hws with MIT License
def __init__(self, FLAGS, algorithm, expert_returns=None, expert_policy_fn=None):
        print('Initializing the model...')
        if not algorithm.strip().lower() in ['behavioral_cloning', 'dagger']:
            raise NotImplementedError('Algorithm {} not implemented.'.format(algorithm))
        self.FLAGS = FLAGS
        self.algorithm = algorithm.strip().lower()
        self.expert_returns = expert_returns
        self.expert_policy_fn = expert_policy_fn
        if self.algorithm == 'dagger' and self.expert_policy_fn is None:
            raise ValueError('No expert policy found.')
        
        self.scope = self.algorithm + '_' + time.strftime('%Y-%m-%d-%H-%M-%S')
        
        with tf.variable_scope(
            self.scope, 
            initializer=tf.keras.initializers.he_normal(), 
            regularizer=tf.contrib.layers.l2_regularizer(scale=3e-7), 
            reuse=tf.AUTO_REUSE
        ):
            self.add_placeholders()
            self.build_graph()
            self.add_loss()
            
        params = tf.trainable_variables()
        gradients = tf.gradients(self.loss, params)
        self.gradient_norm = tf.global_norm(gradients)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, self.FLAGS['max_gradient_norm'])
        self.param_norm = tf.global_norm(params)
        
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        lr = self.FLAGS['learning_rate']
        opt = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.8, beta2=0.999, epsilon=1e-7)
        self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)
        
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
        self.bestmodel_saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)
        self.summaries = tf.summary.merge_all() 
Example #22
Source File: base_model.py    From PCNN with Apache License 2.0
def add_train_op(self, lr_method, lr, loss, clip=-1):
        """Defines self.train_op that performs an update on a batch

        Args:
            lr_method: (string) sgd method, for example "adam"
            lr: (tf.placeholder) tf.float32, learning rate
            loss: (tensor) tf.float32 loss to minimize
            clip: (python float) clipping of gradient. If < 0, no clipping

        """
        _lr_m = lr_method.lower() # lowercase for a case-insensitive match

        with tf.variable_scope("train_step"):
            if _lr_m == 'adam': # sgd method
                optimizer = tf.train.AdamOptimizer(lr)
            elif _lr_m == 'adagrad':
                optimizer = tf.train.AdagradOptimizer(lr)
            elif _lr_m == 'sgd':
                optimizer = tf.train.GradientDescentOptimizer(lr)
            elif _lr_m == 'rmsprop':
                optimizer = tf.train.RMSPropOptimizer(lr)
            elif _lr_m == 'adadelta':
                optimizer = tf.train.AdadeltaOptimizer(lr)
            else:
                raise NotImplementedError("Unknown method {}".format(_lr_m))

            if clip > 0: # gradient clipping if clip is positive
                grads, vs     = zip(*optimizer.compute_gradients(loss))
                grads, gnorm  = tf.clip_by_global_norm(grads, clip)
                self.train_op = optimizer.apply_gradients(zip(grads, vs))
            else:
                self.train_op = optimizer.minimize(loss) 
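A hypothetical call, assuming model is an instance of this class, lr_ph is its learning-rate placeholder, and model.loss is already built (all three names are illustrative):

model.add_train_op('adam', lr_ph, model.loss, clip=5.0)  # clip > 0 enables global-norm clipping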
Example #23
Source File: model.py    From Python-Deep-Learning-SE with MIT License
def init_train_op(self, optimizer):
        # Flatten the targets to be compatible with the flattened logits
        targets_flat = tf.reshape(self.targets, (-1,))
        # Get the loss over all outputs
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits_flat, labels=targets_flat, name='x_entropy')
        self.loss = tf.reduce_mean(loss)
        trainable_variables = tf.trainable_variables()
        gradients = tf.gradients(self.loss, trainable_variables)
        gradients, _ = tf.clip_by_global_norm(gradients, 5)
        self.train_op = optimizer.apply_gradients(zip(gradients, trainable_variables)) 
Example #24
Source File: bert_cnn_model.py    From BERT with Apache License 2.0
def train_lm(self):
        """based on the loss, use SGD to update parameter"""
        learning_rate = tf.train.exponential_decay(self.learning_rate, self.global_step, self.decay_steps, self.decay_rate, staircase=True)
        self.learning_rate_=learning_rate
        #noise_std_dev = tf.constant(0.3) / (tf.sqrt(tf.cast(tf.constant(1) + self.global_step, tf.float32))) #gradient_noise_scale=noise_std_dev

        optimizer = tf.train.AdamOptimizer(learning_rate)
        gradients, variables = zip(*optimizer.compute_gradients(self.loss_val_lm))
        gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) #ADD 2018.06.01
        with tf.control_dependencies(update_ops):  #ADD 2018.06.01
            train_op = optimizer.apply_gradients(zip(gradients, variables))
        #train_op = tf_contrib.layers.optimize_loss(self.loss_val, global_step=self.global_step,learning_rate=learning_rate, optimizer="Adam",clip_gradients=self.clip_gradients)
        return train_op 
Example #25
Source File: kfac.py    From lirpg with MIT License
def apply_gradients(self, grads):
        coldOptim = tf.train.MomentumOptimizer(
            self._cold_lr, self._momentum)

        def coldSGDstart():
            sgd_grads, sgd_var = zip(*grads)

            if self.max_grad_norm is not None:
                sgd_grads, sgd_grad_norm = tf.clip_by_global_norm(sgd_grads,self.max_grad_norm)

            sgd_grads = list(zip(sgd_grads,sgd_var))

            sgd_step_op = tf.assign_add(self.sgd_step, 1)
            coldOptim_op = coldOptim.apply_gradients(sgd_grads)
            if KFAC_DEBUG:
                with tf.control_dependencies([sgd_step_op, coldOptim_op]):
                    sgd_step_op = tf.Print(
                        sgd_step_op, [self.sgd_step, tf.convert_to_tensor('doing cold sgd step')])
            return tf.group(*[sgd_step_op, coldOptim_op])

        kfacOptim_op, qr = self.apply_gradients_kfac(grads)

        def warmKFACstart():
            return kfacOptim_op

        return tf.cond(tf.greater(self.sgd_step, self._cold_iter), warmKFACstart, coldSGDstart), qr 
Example #26
Source File: objective.py    From DOTA_models with Apache License 2.0
def training_ops(self, loss, learning_rate=None):
    """Gradient ops."""
    opt = self.get_optimizer(learning_rate)
    params = tf.trainable_variables()
    grads = tf.gradients(loss, params)

    if self.clip_norm:
      grads, global_norm = tf.clip_by_global_norm(grads, self.clip_norm)
      tf.summary.scalar('grad_global_norm', global_norm)

    return opt.apply_gradients(zip(grads, params)) 
Example #27
Source File: model.py    From RLSeq2Seq with MIT License
def _add_shared_train_op(self):
    """Sets self._train_op, the op to run for training."""
    # Take gradients of the trainable variables w.r.t. the loss function to minimize
    if self._hps.rl_training or self._hps.ac_training:
      loss_to_minimize = self._reinforce_shared_loss
      if self._hps.coverage:
        loss_to_minimize = self._reinforce_cov_total_loss
    else:
      loss_to_minimize = self._pgen_loss
      if self._hps.coverage:
        loss_to_minimize = self._pointer_cov_total_loss

    tvars = tf.trainable_variables()
    gradients = tf.gradients(loss_to_minimize, tvars, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)

    # Clip the gradients
    with tf.device("/gpu:{}".format(self._hps.gpu_num)):
      grads, global_norm = tf.clip_by_global_norm(gradients, self._hps.max_grad_norm)

    # Add a summary
    tf.summary.scalar('global_norm', global_norm)

    # Apply adagrad optimizer
    optimizer = tf.train.AdagradOptimizer(self._hps.lr, initial_accumulator_value=self._hps.adagrad_init_acc)
    with tf.device("/gpu:{}".format(self._hps.gpu_num)):
      self._shared_train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step, name='train_step') 
Example #28
Source File: language_model.py    From lm with MIT License
def _backward(self, loss, summaries=False):
        hps = self.hps

        loss = loss * hps.num_steps

        emb_vars = find_trainable_variables("emb")
        lstm_vars = find_trainable_variables("LSTM")
        softmax_vars = find_trainable_variables("softmax")

        all_vars = emb_vars + lstm_vars + softmax_vars
        grads = tf.gradients(loss, all_vars)
        orig_grads = grads[:]
        emb_grads = grads[:len(emb_vars)]
        grads = grads[len(emb_vars):]
        for i in range(len(emb_grads)):
            assert isinstance(emb_grads[i], tf.IndexedSlices)
            emb_grads[i] = tf.IndexedSlices(emb_grads[i].values * hps.batch_size, emb_grads[i].indices,
                                            emb_grads[i].dense_shape)

        lstm_grads = grads[:len(lstm_vars)]
        softmax_grads = grads[len(lstm_vars):]

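        # Only the LSTM gradients are clipped; the embedding and softmax
        # gradients are passed through unscaled.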
        lstm_grads, lstm_norm = tf.clip_by_global_norm(lstm_grads, hps.max_grad_norm)
        clipped_grads = emb_grads + lstm_grads + softmax_grads
        assert len(clipped_grads) == len(orig_grads)

        if summaries:
            tf.scalar_summary("model/lstm_grad_norm", lstm_norm)
            tf.scalar_summary("model/lstm_grad_scale", tf.minimum(hps.max_grad_norm / lstm_norm, 1.0))
            tf.scalar_summary("model/lstm_weight_norm", tf.global_norm(lstm_vars))
            # for v, g, cg in zip(all_vars, orig_grads, clipped_grads):
            #     name = v.name.lstrip("model/")
            #     tf.histogram_summary(name + "/var", v)
            #     tf.histogram_summary(name + "/grad", g)
            #     tf.histogram_summary(name + "/clipped_grad", cg)

        return list(zip(clipped_grads, all_vars)) 
Example #29
Source File: marwil.py    From EasyRL with Apache License 2.0
def _build_train(self, loss, optimizer, vars=None, global_step=None):
        grads_and_vars = optimizer.compute_gradients(loss=loss, var_list=vars)
        grads_and_vars = [(grad, var) for grad, var in grads_and_vars
                          if grad is not None]

        # apply grad clipping
        grads, vars = zip(*grads_and_vars)
        clipped_grads, _ = tf.clip_by_global_norm(
            grads, clip_norm=self.config.get('global_norm_clip', 40))
        grads_and_vars = list(zip(clipped_grads, vars))

        train_op = optimizer.apply_gradients(
            grads_and_vars, global_step=global_step)

        return train_op 
Example #30
Source File: model.py    From rgn with MIT License
def _training(config, loss):
    """ Creates loss optimizer and returns minimization op. """

    # helper function
    optimizer_args = lambda o: o.__init__.__code__.co_varnames[:o.__init__.__code__.co_argcount]

    # select appropriate optimization function and construct arg list based on config
    optimizer_func = {'steepest': tf.train.GradientDescentOptimizer, # doesn't support momentum, unlike autograd
                      'rmsprop': tf.train.RMSPropOptimizer, 
                      'adam': tf.train.AdamOptimizer, 
                      'momentum': tf.train.MomentumOptimizer,
                      'adagrad': tf.train.AdagradOptimizer,
                      'adadelta': tf.train.AdadeltaOptimizer}[config['optimizer']]
    optimizer_params = config.keys() & set(optimizer_args(optimizer_func))
    optimizer_params_and_values = {param: config[param] for param in optimizer_params}
    optimizer = optimizer_func(**optimizer_params_and_values)

    # obtain and process gradients
    grads_and_vars = optimizer.compute_gradients(loss)
    threshold = config['gradient_threshold']

    if threshold != float('inf'):
        for case in switch(config['rescale_behavior']):
            if case('norm_rescaling'):
                grads, _ = tf.clip_by_global_norm([g for g, _ in grads_and_vars], threshold)
                vars_ = [v for _, v in grads_and_vars]
                grads_and_vars = list(zip(grads, vars_))
            elif case('hard_clipping'):
                grads_and_vars = [(tf.clip_by_value(g, -threshold, threshold), v) for g, v in grads_and_vars]

    # apply gradients and return stepping op
    global_step = tf.get_variable(initializer=tf.constant_initializer(0), shape=[], trainable=False, dtype=tf.int32, name='global_step')
    minimize_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

    # dict useful for diagnostics
    grads_and_vars_dict = {}
    grads_and_vars_dict.update({('g' + str(i)): g for i, (g, _) in enumerate(grads_and_vars)})
    grads_and_vars_dict.update({('v' + str(i)): v for i, (_, v) in enumerate(grads_and_vars)})

    return global_step, minimize_op, grads_and_vars_dict