python source code of yellowfin

"""
YellowFin optimizer.

YellowFin and the Art of Momentum Tuning
https://arxiv.org/abs/1706.03471
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

import tensorflow as tf
from tensorflow.python.framework import ops

# EPS for numerical stability
EPS = 1e-6
LARGE_FLOAT_VAL = 1e15

class YFOptimizer(object):
  """
  Optimizer that implements the YellowFin algorithm.

  Implemented as a wrapper around tf.train.MomentumOptimizer
  """
  # Available gate_gradients values
  GATE_NONE = tf.train.Optimizer.GATE_NONE
  GATE_OP = tf.train.Optimizer.GATE_OP
  GATE_GRAPH = tf.train.Optimizer.GATE_GRAPH

  def __init__(self, learning_rate=0.0001, momentum=0.0, clip_thresh=None,
               beta=0.999, curv_win_width=20, zero_debias=True, delta_mu=0.0,
               sparsity_debias=False, use_locking=False, name="YellowFin",
               use_nesterov=False, use_unsmoothed_lr_mu=True,
               h_max_log_smooth=True, h_min_log_smooth=True,
               use_adapt_grad_clip=True, stat_protect_fac=100.0):
    """
    Construct a new YellowFin optimizer.

    Args:
      learning rate: Python scalar. The initial value of learning rate,
        we use 1.0 in our paper.
      momentum: Python scalar. The initial value of momentum, we use
        0.0 in our paper.
      clip_thresh: Python scalar. The cliping threshold for
        `tf.clip_by_global_norm`. If None, no clipping will be used.
      beta: Python scalar. The smoothing parameter for estimations.
      curv_win_width: TODO
      zero_debias: TODO
      delta_mu: for extensions. Not necessary in the basic use.
      sparsity_debias: Python boolean. Gradient norm and curvature are
        biased to larger values when calculated with sparse gradient.
        This is useful when the model is very sparse, e.g. LSTM with
        word embedding. For non-sparse CNN, turning it off could
        slightly accelerate the speed.
      use_locking: If True, use locks for update operations.
      name: Optional name prefix for the operations created when
        applying gradients. Defaults to "YellowFin".
      use_nesterov: If True, the underlying MomentumOptimizer uses Nesterov
        Momentum. Set to False in the default YellowFin algorithm.

    Notes:
      `clip_thresh` is the threshold value on ||lr * gradient||
      `delta_mu` can be a placeholder/variable/python scalar. Used for
      additional momentum in situations such as asynchronous-parallel
      training. The default is 0.0 for basic usage of the optimizer.

    Other features:
      If you want to manually control the learning rates,
      `self.lr_factor` is an interface to the outside. It is a
      multiplier for the internal learning rate in YellowFin. It is
      helpful when you want to do additional hand tuning or some
      decaying scheme for the internal learning rate. Example on using
      `lr_factor` can be found here:
      https://github.com/JianGoForIt/YellowFin/blob/master/char-rnn-tensorflow/train_YF.py#L140
    """
    self._lr = learning_rate
    self._mu = momentum

    self._lr_var = tf.Variable(
      learning_rate, dtype=tf.float32, name="YF_lr", trainable=False)
    self._mu_var = tf.Variable(
      momentum, dtype=tf.float32, name="YF_mu", trainable=False)
    # for step scheme or decaying scheme for the learning rates
    self.lr_factor = tf.Variable(
      1.0, dtype=tf.float32, name="YF_lr_factor", trainable=False)
    if clip_thresh is not None:
      self._clip_thresh_var = tf.Variable(
        clip_thresh, dtype=tf.float32, name="YF_clip_thresh",
        trainable=False)
    else:
      self._clip_thresh_var = None

    # the underlying momentum optimizer
    self._optimizer = tf.train.MomentumOptimizer(
      self._lr_var * self.lr_factor, self._mu_var + delta_mu,
      use_locking, name, use_nesterov)

    # moving average for statistics
    self._beta = beta
    self._moving_averager = None

    # for global step counting
    self._global_step = tf.Variable(0, trainable=False)

    self._do_tune = tf.greater(self._global_step, tf.constant(0) )

    self._zero_debias = zero_debias
    self._sparsity_debias = sparsity_debias

    self._tvars = None

    # for curvature range
    self._curv_win_width = curv_win_width
    self._curv_win = None

    # option for using smoothed or unsmoothed lr and mu
    self._use_unsmoothed_lr_mu = use_unsmoothed_lr_mu

    # options for curvature envelop smoothing
    self._h_max_log_smooth = h_max_log_smooth
    self._h_min_log_smooth = h_min_log_smooth

    # for adaptive gradient clipping
    self._use_adapt_grad_clip = use_adapt_grad_clip
    self._adapt_grad_clip_thresh = \
      tf.Variable(LARGE_FLOAT_VAL, dtype=tf.float32, trainable=False)
    self._adapt_grad_clip_target_val = \
      tf.Variable(LARGE_FLOAT_VAL, dtype=tf.float32, trainable=False)

    # prevent exploding gradient from ruining the statistics
    self._stat_protect_fac = stat_protect_fac

  def curvature_range(self):
    # set up the curvature window
    self._curv_win = tf.Variable(
      np.zeros([self._curv_win_width, ]), dtype=tf.float32,
      name="curv_win", trainable=False)
    # we can use log smoothing for curvature range to follow trend faster
    # self._curv_win = tf.scatter_update(
    #   self._curv_win, self._global_step % self._curv_win_width,
    #   tf.log(self._grad_norm_squared + EPS))
    self._curv_win = tf.scatter_update(
      self._curv_win, self._global_step % self._curv_win_width,
      self._grad_norm_squared + EPS)
    # note here the iterations start from iteration 0
    valid_window = tf.slice(
      self._curv_win, tf.constant([0, ]), tf.expand_dims(
        tf.minimum(tf.constant(self._curv_win_width),
                   self._global_step + 1), dim=0))

    if self._h_min_log_smooth:
      self._h_min_t = tf.log(tf.reduce_min(valid_window) + EPS)
    else:
      self._h_min_t = tf.reduce_min(valid_window)
    if self._h_max_log_smooth:
      self._h_max_t = tf.log(tf.reduce_max(valid_window) + EPS)
    else:
      self._h_max_t = tf.reduce_max(valid_window)

    curv_range_ops = []
    with tf.control_dependencies([self._h_min_t, self._h_max_t] ):
      avg_op = self._moving_averager.apply(
        [self._h_min_t, self._h_max_t])
      with tf.control_dependencies([avg_op]):
        if self._h_min_log_smooth:
          self._h_min = tf.exp(
            tf.identity(self._moving_averager.average(self._h_min_t)))
        else:
          self._h_min = \
            tf.identity(self._moving_averager.average(self._h_min_t))
        if self._h_max_log_smooth:
          self._h_max = tf.exp(
            tf.identity(self._moving_averager.average(self._h_max_t)))
        else:
          self._h_max = \
            tf.identity(self._moving_averager.average(self._h_max_t))
      if self._sparsity_debias:
        self._h_min = self._h_min * self._sparsity_avg
        self._h_max = self._h_max * self._sparsity_avg
    curv_range_ops.append(avg_op)
    return curv_range_ops

  def grad_variance(self):
    grad_var_ops = []
    tensor_to_avg = []
    for t, g in zip(self._tvars, self._grads):
      if isinstance(g, ops.IndexedSlices):
        tensor_to_avg.append(
          tf.reshape(tf.unsorted_segment_sum(
            g.values, g.indices, g.dense_shape[0]),
            shape=t.get_shape()))
      else:
        tensor_to_avg.append(g)
    avg_op = self._moving_averager.apply(tensor_to_avg)
    grad_var_ops.append(avg_op)
    with tf.control_dependencies([avg_op]):
      self._grad_avg = [
        self._moving_averager.average(val) for val in tensor_to_avg]
      self._grad_avg_squared = [tf.square(val) for val in self._grad_avg]
    self._grad_var = tf.maximum(
      tf.constant(EPS, dtype=self._grad_norm_squared_avg.dtype),
      self._grad_norm_squared_avg
      - tf.add_n([tf.reduce_sum(val) for val in self._grad_avg_squared] ) )
    if self._sparsity_debias:
      self._grad_var *= self._sparsity_avg
    return grad_var_ops

  def dist_to_opt(self):
    dist_to_opt_ops = []
    # running average of the norm of gradeint
    self._grad_norm = tf.sqrt(self._grad_norm_squared)
    avg_op = self._moving_averager.apply([self._grad_norm, ])
    dist_to_opt_ops.append(avg_op)
    with tf.control_dependencies([avg_op]):
      self._grad_norm_avg = self._moving_averager.average(
        self._grad_norm)
      # single iteration distance estimation
      # note that self._grad_norm_avg is per variable
      self._dist_to_opt = (self._grad_norm_avg
                 / (self._grad_norm_squared_avg + EPS) )
    # running average of distance
    avg_op = self._moving_averager.apply([self._dist_to_opt])
    dist_to_opt_ops.append(avg_op)
    with tf.control_dependencies([avg_op]):
      self._dist_to_opt_avg = tf.identity(
        self._moving_averager.average(self._dist_to_opt))
      if self._sparsity_debias:
        self._dist_to_opt_avg /= (tf.sqrt(self._sparsity_avg) + EPS)
    return dist_to_opt_ops

  def grad_sparsity(self):
    # If the sparse minibatch gradient has 10 percent of its entries
    # non-zero, its sparsity is 0.1.
    # The norm of dense gradient averaged from full dataset
    # are roughly estimated norm of minibatch
    # sparse gradient norm * sqrt(sparsity)
    # An extension maybe only correct the sparse blob.
    non_zero_cnt = tf.add_n([tf.count_nonzero(g) for g in self._grads])
    all_entry_cnt = tf.add_n([tf.size(g) for g in self._grads])
    self._sparsity = tf.cast(non_zero_cnt, self._grads[0].dtype) \
      / tf.cast(all_entry_cnt, self._grads[0].dtype)
    avg_op = self._moving_averager.apply([self._sparsity, ])
    with tf.control_dependencies([avg_op]):
      self._sparsity_avg = self._moving_averager.average(self._sparsity)
    return avg_op

  def before_apply(self):
    self._moving_averager = tf.train.ExponentialMovingAverage(
      decay=self._beta, zero_debias=self._zero_debias)
    assert self._grads is not None and len(self._grads) > 0
    before_apply_ops = []

    # get per var g**2 and norm**2
    self._grad_squared = []
    self._grad_norm_squared = []
    for v, g in zip(self._tvars, self._grads):
      if g is None:
        continue
      with ops.colocate_with(v):
        self._grad_squared.append(tf.square(g))
    self._grad_norm_squared = [
      tf.reduce_sum(grad_squared) for grad_squared in self._grad_squared]

    if self._sparsity_debias:
      avg_op_sparsity = self.grad_sparsity()
      before_apply_ops.append(avg_op_sparsity)

    # the following running average on squared norm of gradient is shared
    # by `grad_variance` and `dist_to_opt`
    avg_op = self._moving_averager.apply(self._grad_norm_squared)
    with tf.control_dependencies([avg_op]):
      self._grad_norm_squared_avg = [self._moving_averager.average(val)
                                     for val in self._grad_norm_squared]
      self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
      self._grad_norm_squared_avg = tf.add_n(self._grad_norm_squared_avg)
    before_apply_ops.append(avg_op)

    with tf.control_dependencies([avg_op]):
      curv_range_ops = self.curvature_range()
      before_apply_ops += curv_range_ops
      grad_var_ops = self.grad_variance()
      before_apply_ops += grad_var_ops
      dist_to_opt_ops = self.dist_to_opt()
      before_apply_ops += dist_to_opt_ops
    return tf.group(*before_apply_ops)

  def get_lr_tensor(self):
    lr = (1.0 - tf.sqrt(self._mu))**2 / (self._h_min + EPS)
    lr = tf.minimum(lr, lr * (tf.to_float(self._global_step) + 1.0) / 10.0 / tf.to_float(tf.constant(self._curv_win_width) ) )
    return lr

  def get_cubic_root(self):
    # We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
    # where x = sqrt(mu).
    # We substitute x, which is sqrt(mu), with x = y + 1.
    # It gives y^3 + py = q
    # where p = (D^2 h_min^2)/(2*C) and q = -p.
    # We use the Vieta's substution to compute the root.
    # There is only one real solution y (which is in [0, 1] ).
    # http://mathworld.wolfram.com/VietasSubstitution.html
    # assert_array = \
    #   [tf.Assert(tf.logical_not(tf.is_nan(self._dist_to_opt_avg) ), [self._dist_to_opt_avg,]), 
    #   tf.Assert(tf.logical_not(tf.is_nan(self._h_min) ), [self._h_min,]), 
    #   tf.Assert(tf.logical_not(tf.is_nan(self._grad_var) ), [self._grad_var,]),
    #   tf.Assert(tf.logical_not(tf.is_inf(self._dist_to_opt_avg) ), [self._dist_to_opt_avg,]), 
    #   tf.Assert(tf.logical_not(tf.is_inf(self._h_min) ), [self._h_min,]), 
    #   tf.Assert(tf.logical_not(tf.is_inf(self._grad_var) ), [self._grad_var,])]
    # with tf.control_dependencies(assert_array):
    # EPS in the numerator to prevent momentum being exactly one in case of 0 gradient
    p = (self._dist_to_opt_avg + EPS)**2 * (self._h_min + EPS)**2 / 2 / (self._grad_var + EPS)
    w3 = (-tf.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
    w = tf.sign(w3) * tf.pow(tf.abs(w3), 1.0/3.0)
    y = w - p / 3.0 / (w + EPS)
    x = y + 1
    return x

  def get_mu_tensor(self):
    root = self.get_cubic_root()
    dr = tf.maximum( (self._h_max + EPS) / (self._h_min + EPS), 1.0 + EPS)
    mu = tf.maximum(
      root**2, ((tf.sqrt(dr) - 1) / (tf.sqrt(dr) + 1))**2)
    return mu

  def update_hyper_param(self):
    assign_hyper_ops = []
    self._mu = tf.identity(tf.cond(
      self._do_tune, lambda: self.get_mu_tensor(),
      lambda: self._mu_var))
    with tf.control_dependencies([self._mu]):
      self._lr = tf.identity(tf.cond(
        self._do_tune, lambda: self.get_lr_tensor(),
        lambda: self._lr_var))

    with tf.control_dependencies([self._mu, self._lr]):
      if self._use_unsmoothed_lr_mu:
        assign_hyper_ops.append(tf.assign(self._mu_var, self._mu) )
        assign_hyper_ops.append(tf.assign(self._lr_var, self._lr) )
      else:
        self._mu = self._beta * self._mu_var + (1 - self._beta) * self._mu
        self._lr = self._beta * self._lr_var + (1 - self._beta) * self._lr
        with tf.control_dependencies([self._mu, self._lr] ):
          assign_hyper_ops.append(tf.assign(self._mu_var, self._mu) )
          assign_hyper_ops.append(tf.assign(self._lr_var, self._lr) )
    assign_hyper_op = tf.group(*assign_hyper_ops)
    return assign_hyper_op

  def get_name(self):
      return self._optimizer.get_name()

  def apply_gradients(self, grads_tvars, global_step=None, name=None):
    self._grads, self._tvars = zip(
      *[(g, t) for g, t in grads_tvars if g is not None])

    # for manual gradient clipping
    if self._clip_thresh_var is not None:
      self._grads, self._grads_norm = tf.clip_by_global_norm(
        self._grads, self._clip_thresh_var)

    # loosely adaptive clipping of gradient in case exploding gradient ruins statistics
    if self._use_adapt_grad_clip:
      thresh = tf.cond(self._do_tune, 
        lambda: tf.sqrt(self._stat_protect_fac * self._adapt_grad_clip_thresh**2),
        lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
      self._grads, self._grads_norm = tf.clip_by_global_norm(self._grads, thresh)

    with tf.variable_scope("before_apply"):
      before_apply_op = self.before_apply()

    with tf.variable_scope("update_hyper"):
      with tf.control_dependencies([before_apply_op]):
        update_hyper_op = self.update_hyper_param()

    with tf.variable_scope("apply_updates"):
      with tf.control_dependencies([update_hyper_op]):

        # clip exploding gradient according to h_max
        if self._use_adapt_grad_clip:
          thresh = tf.cond(tf.greater(tf.global_norm(self._grads), 
            self._adapt_grad_clip_thresh), 
            lambda: self._adapt_grad_clip_target_val,
            lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
          self._grads, self._grads_norm = tf.clip_by_global_norm(
            self._grads, thresh)

        apply_grad_op = self._optimizer.apply_gradients(
          zip(self._grads, self._tvars), global_step, name)

    with tf.control_dependencies([apply_grad_op]):
      self._increment_global_step_op = tf.assign(
        self._global_step, self._global_step + 1)
      
      self._adapt_grad_clip_thresh_op = \
        tf.assign(self._adapt_grad_clip_thresh, tf.sqrt(self._h_max) )
      self._adapt_grad_clip_target_val_op = \
        tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(self._h_max) )
      # self._adapt_grad_clip_target_val_op = \
      #   tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(tf.sqrt(self._h_max * self._h_min)))

    return tf.group(before_apply_op, update_hyper_op, apply_grad_op,
                    self._adapt_grad_clip_thresh_op, self._adapt_grad_clip_target_val_op,
                    self._increment_global_step_op)


  def compute_gradients(self, loss, var_list=None,
                        gate_gradients=GATE_OP,
                        aggregation_method=None,
                        colocate_gradients_with_ops=False,
                        grad_loss=None):
    return self._optimizer.compute_gradients(
      loss, var_list=var_list,
      gate_gradients=gate_gradients,
      aggregation_method=aggregation_method,
      colocate_gradients_with_ops=colocate_gradients_with_ops,
      grad_loss=grad_loss)

  def minimize(self, loss, global_step=None, var_list=None,
               gate_gradients=GATE_OP,
               aggregation_method=None,
               colocate_gradients_with_ops=False,
               name=None,
               grad_loss=None):
    """Add operations to minimize `loss` by updating `var_list`.

    This method simply combines calls `compute_gradients()` and
    `apply_gradients()`. If you want to process the gradient before
    applying them, call `tf.gradients()` and `self.apply_gradients()`
    explicitly instead of using this function.

    Adapted from Tensorflow Optimizer base class member function.
    """
    grads_and_vars = self._optimizer.compute_gradients(
      loss, var_list=var_list,
      gate_gradients=gate_gradients,
      aggregation_method=aggregation_method,
      colocate_gradients_with_ops=colocate_gradients_with_ops,
      grad_loss=grad_loss)

    vars_with_grad = [v for g, v in grads_and_vars if g is not None]
    if not vars_with_grad:
      raise ValueError(
        "No gradients provided for any variable, check your graph for "
        "ops that do not support gradients, between variables "
        "%s and loss %s." %
        ([str(v) for _, v in grads_and_vars], loss))

    return self.apply_gradients(grads_and_vars, global_step, name)

  def get_slot(self, var, name):
    """
    Return a slot named `name` created for `var` by
    the underlying MomentumOptimizer.

    Args:
      var: A variable passed to `minimize()` or `apply_gradients()`.
      name: A string.

    Returns:
      The `Variable` for the slot if it was created, `None` otherwise.
    """
    return self._optimizer.get_slot(var, name)

  def get_slot_names(self):
    """
    Return a list of the names of the slots created by the
    underlying MomentumOptimizer.

    Returns:
      A list of strings.
    """
    return self._optimizer.get_slot_names()