Python keras.backend.update() Examples

The following code examples show how to use keras.backend.update(). They are extracted from open source Python projects.
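K.update(x, new_x) returns a symbolic op that assigns new_x to the backend variable x; the assignment only takes effect when the op is executed, so it is typically collected into an optimizer's self.updates list (run once per training batch) or registered on a layer via add_update(). Before the project examples, here is a minimal sketch of the pattern, a toy gradient-descent optimizer written for this page rather than taken from any project below:

from keras import backend as K
from keras.optimizers import Optimizer

class PlainSGD(Optimizer):
    """Toy optimizer: each parameter update is a K.update op."""

    def __init__(self, lr=0.01, **kwargs):
        super(PlainSGD, self).__init__(**kwargs)
        with K.name_scope(self.__class__.__name__):
            self.lr = K.variable(lr, name='lr')
            self.iterations = K.variable(0, dtype='int64', name='iterations')

    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        # Keras executes every op collected here once per training batch.
        self.updates = [K.update_add(self.iterations, 1)]
        for p, g in zip(params, grads):
            self.updates.append(K.update(p, p - self.lr * g))
        return self.updates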

Example 1
Project: E3Outlier   Author: demonzyj56   File: dagmm.py    MIT License
def call(self, inputs, training=None):
        z, gamma_k = inputs

        gamma_k_sum = K.sum(gamma_k)
        est_phi = K.mean(gamma_k, axis=0)
        est_mu = K.dot(K.transpose(gamma_k), z) / gamma_k_sum
        est_sigma = K.dot(K.transpose(z - est_mu),
                          gamma_k * (z - est_mu)) / gamma_k_sum

        # est_sigma = est_sigma + (K.random_normal(shape=(K.int_shape(z)[1], 1), mean=1e-3, stddev=1e-4) * K.eye(K.int_shape(z)[1]))
        est_sigma = est_sigma + K.epsilon() * K.eye(K.int_shape(z)[1])

        self.add_update(K.update(self.phi, est_phi), inputs)
        self.add_update(K.update(self.mu, est_mu), inputs)
        self.add_update(K.update(self.sigma, est_sigma), inputs)

        est_sigma_diag_inv = K.eye(K.int_shape(self.sigma)[0]) / est_sigma
        self.add_loss(self.lambd_diag * K.sum(est_sigma_diag_inv), inputs)

        phi = K.in_train_phase(est_phi, self.phi, training)
        mu = K.in_train_phase(est_mu, self.mu, training)
        sigma = K.in_train_phase(est_sigma, self.sigma, training)
        return GaussianMixtureComponent._calc_component_density(z, phi, mu, sigma) 
Example 2
Project: keras-lookahead   Author: CyberZHG   File: optimizers.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = self.learning_rate * (K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            p_t = lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            self.updates.append(K.update_sub(p, p_t))
        return self.updates 
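The excerpt above is the Adam-style inner optimizer that keras-lookahead wraps (the class name is not shown in the excerpt). Optimizers of this kind plug into training in the usual way; a hedged sketch, with keras.optimizers.Adam standing in for the custom class:

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam  # stand-in for the custom optimizer above

model = Sequential([Dense(10, activation='softmax', input_shape=(784,))])
model.compile(optimizer=Adam(), loss='categorical_crossentropy')
# When Keras builds the train function (lazily, on the first fit call),
# it invokes optimizer.get_updates(loss, params); the returned K.update
# ops are then executed once per batch.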
Example 3
Project: EigenPro-tensorflow   Author: EigenPro   File: optimizers.py    MIT License
def get_updates(self, loss, params):
        self.updates = []
        grads = self.get_gradients(loss, [self.pred_t])

        eta = self.eta
        index = self.index_t
        eigenpro_f = self.eigenpro_f

        shapes = [K.get_variable_shape(p) for p in params]
        for p, g in zip(params, grads):
            update_p = K.gather(p, index) - eta * g
            new_p = scatter_update(p, index, update_p)
            
            if eigenpro_f:
                new_p = new_p + eta * eigenpro_f(g)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 4
Project: EigenPro-tensorflow   Author: EigenPro   File: optimizers.py    MIT License
def get_updates(self, loss, params):
        self.updates = []
        grads = self.get_gradients(loss, params)

        eta = self.eta
        eigenpro_f = self.eigenpro_f

        shapes = [K.get_variable_shape(p) for p in params]
        for p, g in zip(params, grads):
            new_p = p - eta * g

            if eigenpro_f:
                new_p = new_p + eta * eigenpro_f(g)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 5
Project: AnomalyDetectionTransformations   Author: izikgo   File: dagmm.py    MIT License
def call(self, inputs, training=None):
        z, gamma_k = inputs

        gamma_k_sum = K.sum(gamma_k)
        est_phi = K.mean(gamma_k, axis=0)
        est_mu = K.dot(K.transpose(gamma_k), z) / gamma_k_sum
        est_sigma = K.dot(K.transpose(z - est_mu),
                          gamma_k * (z - est_mu)) / gamma_k_sum

        est_sigma = est_sigma + (K.random_normal(shape=(K.int_shape(z)[1], 1), mean=1e-3, stddev=1e-4) * K.eye(K.int_shape(z)[1]))

        self.add_update(K.update(self.phi, est_phi), inputs)
        self.add_update(K.update(self.mu, est_mu), inputs)
        self.add_update(K.update(self.sigma, est_sigma), inputs)

        est_sigma_diag_inv = K.eye(K.int_shape(self.sigma)[0]) / est_sigma
        self.add_loss(self.lambd_diag * K.sum(est_sigma_diag_inv), inputs)

        phi = K.in_train_phase(est_phi, self.phi, training)
        mu = K.in_train_phase(est_mu, self.mu, training)
        sigma = K.in_train_phase(est_sigma, self.sigma, training)
        return GaussianMixtureComponent._calc_component_density(z, phi, mu, sigma) 
Example 6
Project: keras-lamb   Author: CyberZHG   File: optimizer.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        t = K.cast(self.iterations, K.floatx()) + 1
        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * t))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            mhat_t = m_t / (1. - K.pow(self.beta_1, t))
            vhat_t = v_t / (1. - K.pow(self.beta_2, t))

            u_t = mhat_t / K.sqrt(vhat_t + self.epsilon) + self.weight_decay * p
            trust_ratio = K.sqrt(K.sum(K.square(p)) / K.sum(K.square(u_t)))
            trust_ratio = K.minimum(K.maximum(trust_ratio, self.lower_bound), self.upper_bound)

            lr_p = trust_ratio * lr
            new_p = p - lr_p * u_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 7
Project: deep-pmsm   Author: wkirgsn   File: custom_layers.py    MIT License
def add_weightnorm_param_updates(updates, new_V_param, new_g_param, W, V_scaler):
    ps = K.get_variable_shape(new_V_param)
    norm_axes = [i for i in range(len(ps) - 1)]

    # update W and V_scaler
    new_V_norm = tf.sqrt(tf.reduce_sum(tf.square(new_V_param), norm_axes))
    new_V_scaler = new_g_param / new_V_norm
    new_W = tf.reshape(new_V_scaler, [1] * len(norm_axes) + [-1]) * new_V_param
    updates.append(K.update(W, new_W))
    updates.append(K.update(V_scaler, new_V_scaler))


# data based initialization for a given Keras model 
Example 8
Project: deep-pmsm   Author: wkirgsn   File: custom_layers.py    MIT License
def data_based_init(model, input):

    # input can be dict, numpy array, or list of numpy arrays
    if type(input) is dict:
        feed_dict = input
    elif type(input) is list:
        feed_dict = {tf_inp: np_inp for tf_inp,np_inp in zip(model.inputs,input)}
    else:
        feed_dict = {model.inputs[0]: input}

    # add learning phase if required
    if model.uses_learning_phase and K.learning_phase() not in feed_dict:
        feed_dict.update({K.learning_phase(): 1})

    # get all layer name, output, weight, bias tuples
    layer_output_weight_bias = []
    for l in model.layers:
        if hasattr(l, 'W') and hasattr(l, 'b'):
            assert(l.built)
            layer_output_weight_bias.append( (l.name,l.get_output_at(0),l.W,l.b) ) # if more than one node, only use the first

    # iterate over our list and do data dependent init
    sess = K.get_session()
    for l,o,W,b in layer_output_weight_bias:
        print('Performing data dependent initialization for layer ' + l)
        m,v = tf.nn.moments(o, [i for i in range(len(o.get_shape())-1)])
        s = tf.sqrt(v + 1e-10)
        updates = tf.group(W.assign(W/tf.reshape(s,[1]*(len(W.get_shape())-1)+[-1])), b.assign((b-m)/s))
        sess.run(updates, feed_dict) 
Example 9
Project: keras-optimizers   Author: hi-im-ryanli   File: frankenstein.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        t = K.cast(self.iterations, K.floatx()) + 1

        # Due to the recommendations in [2], i.e. warming momentum schedule
        momentum_cache_t = self.beta_1 * (1. - 0.5 * (K.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
        momentum_cache_t_1 = self.beta_1 * (1. - 0.5 * (K.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
        m_schedule_new = self.m_schedule * momentum_cache_t
        m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
        self.updates.append((self.m_schedule, m_schedule_new))

        shapes = [K.int_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]

        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            # the following equations given in [1]
            g_prime = g / (1. - m_schedule_new)
            m_t = self.beta_1 * m + (1. - self.beta_1) * g
            m_t_prime = m_t / (1. - m_schedule_next)
            v_t = K.maximum(self.beta_2 * v, K.abs(g))
            v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
            m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 10
Project: Keras-TextClassification   Author: yongzhuo   File: keras_radam.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        beta_1_t = K.pow(self.beta_1, t)
        beta_2_t = K.pow(self.beta_2, t)
        rho = 2 / (1 - self.beta_2) - 1
        rho_t = rho - 2 * t * beta_2_t / (1 - beta_2_t)
        r_t = K.sqrt(
            K.relu(rho_t - 4) * K.relu(rho_t - 2) * rho / ((rho - 4) * (rho - 2) * rho_t)
        )
        flag = K.cast(rho_t > 4, K.floatx())

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            mhat_t = m_t / (1 - beta_1_t)
            vhat_t = K.sqrt(v_t / (1 - beta_2_t))
            p_t = p - lr * mhat_t * (flag * r_t / (vhat_t + self.epsilon) + (1 - flag))

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 11
Project: keras-contrib   Author: keras-team   File: lars.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        weights = self.get_weights()
        self.updates = [K.update_add(self.iterations, 1)]
        scaled_lr = self.lr
        w_norm = K.sqrt(K.sum([K.sum(K.square(weight))
                               for weight in weights]))
        g_norm = K.sqrt(K.sum([K.sum(K.square(grad))
                               for grad in grads]))
        scaled_lr = K.switch(K.greater(w_norm * g_norm, K.zeros([1])),
                             K.expand_dims((self.eeta * w_norm /
                                            (g_norm + self.weight_decay * w_norm +
                                             self.epsilon)) * self.lr),
                             K.ones([1]) * self.lr)
        if K.backend() == 'theano':
            scaled_lr = scaled_lr[0]  # otherwise Theano raises a broadcasting error
        # momentum
        moments = [K.zeros(K.int_shape(param), dtype=K.dtype(param))
                   for param in params]
        self.weights = [self.iterations] + moments
        for param, grad, moment in zip(params, grads, moments):
            v0 = (moment * self.momentum)
            v1 = scaled_lr * grad  # velocity
            veloc = v0 - v1
            self.updates.append(K.update(moment, veloc))

            if self.nesterov:
                new_param = param + (veloc * self.momentum) - v1
            else:
                new_param = param + veloc

            # Apply constraints.
            if getattr(param, 'constraint', None) is not None:
                new_param = param.constraint(new_param)

            self.updates.append(K.update(param, new_param))
        return self.updates 
Example 12
Project: keras-contrib   Author: keras-team   File: yogi.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            g2 = K.square(g)
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = v - (1. - self.beta_2) * K.sign(v - g2) * g2
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 13
Project: keras-contrib   Author: keras-team   File: ftml.py    MIT License
def __init__(self, lr=0.0025, beta_1=0.6, beta_2=0.999,
                 epsilon=1e-8, decay=0., **kwargs):
        super(FTML, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.iterations = K.variable(0)
        self.lr = K.variable(lr)
        self.beta_1 = K.variable(beta_1)
        self.beta_2 = K.variable(beta_2)
        self.decay = K.variable(decay)
        self.epsilon = epsilon
        self.inital_decay = decay 
Example 14
Project: keras-contrib   Author: keras-team   File: ftml.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.inital_decay > 0:
            lr *= (1. / (1. + self.decay * self.iterations))

        t = self.iterations + 1

        lr_t = lr / (1. - K.pow(self.beta_1, t))

        shapes = [K.int_shape(p) for p in params]
        zs = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]
        ds = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + zs + vs + ds

        for p, g, z, v, d in zip(params, grads, zs, vs, ds):
            v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
            d_t = (K.sqrt(v_t / (1. - K.pow(self.beta_2, t)))
                   + self.epsilon) / lr_t
            sigma_t = d_t - self.beta_1 * d
            z_t = self.beta_1 * z + (1. - self.beta_1) * g - sigma_t * p

            p_t = - z_t / d_t

            self.updates.append(K.update(z, z_t))
            self.updates.append(K.update(v, v_t))
            self.updates.append(K.update(d, d_t))

            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 15
Project: EUNN-theano   Author: iguanaus   File: custom_optimizers.py    MIT License
def __init__(self, lr=0.001, rho=0.9, epsilon=1e-8, decay=0., lr_natGrad=None,
                 **kwargs):
        super(RMSprop_and_natGrad, self).__init__(**kwargs)
        self.__dict__.update(locals())
        self.lr = K.variable(lr)
        if lr_natGrad is None:
            self.lr_natGrad = K.variable(lr)
        else:
            self.lr_natGrad = K.variable(lr_natGrad)
        self.rho = K.variable(rho)
        self.decay = K.variable(decay)
        self.inital_decay = decay
        self.iterations = K.variable(0.) 
Example 16
Project: faceswap   Author: deepfakes   File: optimizers.py    GNU General Public License v3.0
def update_1(self, params):
        """ First update on CPU or GPU """
        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        return ms, vs, vhats 
Example 17
Project: EAST   Author: kurapan   File: adamw.py    GNU General Public License v3.0
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        wd = self.wd # decoupled weight decay (3/4)

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p # decoupled weight decay (4/4)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 18
Project: temporal-ensembling-semi-supervised   Author: hiram64   File: weight_norm.py    MIT License
def add_weightnorm_param_updates(updates, new_V_param, new_g_param, W, V_scaler):
    ps = K.get_variable_shape(new_V_param)
    norm_axes = [i for i in range(len(ps) - 1)]

    # update W and V_scaler
    new_V_norm = tf.sqrt(tf.reduce_sum(tf.square(new_V_param), norm_axes))
    new_V_scaler = new_g_param / new_V_norm
    new_W = tf.reshape(new_V_scaler, [1] * len(norm_axes) + [-1]) * new_V_param
    updates.append(K.update(W, new_W))
    updates.append(K.update(V_scaler, new_V_scaler)) 
Example 19
Project: AnomalyDetectionTransformations   Author: izikgo   File: wide_residual_network.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))
        # momentum
        shapes = [K.int_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + moments
        for p, g, m in zip(params, grads, moments):
            v = self.momentum * m + g  # velocity
            self.updates.append(K.update(m, v))

            if self.nesterov:
                new_p = p - lr * (self.momentum * v + g)
            else:
                new_p = p - lr * v

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 20
Project: bert_lamb_pretrain   Author: goldenbili   File: keras_lamb.py    Apache License 2.0
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

            m_t_hat = m_t / (1. - K.pow(self.beta_1, t))
            v_t_hat = v_t / (1. - K.pow(self.beta_2, t))

            p_dash = m_t_hat / (K.sqrt(v_t_hat + self.epsilon))

            if self.weight_decay > 0.:
                wd = self.weight_decay * p
                p_dash = p_dash + wd

            r1 = K.sqrt(K.sum(K.square(p)))
            r2 = K.sqrt(K.sum(K.square(p_dash)))

            r = tf.where(tf.greater(r1, 0.),
                         tf.where(tf.greater(r2, 0.),
                                  r1 / r2,
                                  1.0),
                         1.0)
            # r = r1 / r2
            eta = r * lr

            p_t = p - eta * p_dash

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 21
Project: bert_lamb_pretrain   Author: goldenbili   File: keras_lamb.py    Apache License 2.0
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """See base class."""
        assignments = []
        for (grad, param) in grads_and_vars:
            if grad is None or param is None:
                continue

            param_name = self._get_variable_name(param.name)

            m = tf.get_variable(
                name=param_name + "/adam_m",
                shape=param.shape.as_list(),
                dtype=tf.float32,
                trainable=False,
                initializer=tf.zeros_initializer())
            v = tf.get_variable(
                name=param_name + "/adam_v",
                shape=param.shape.as_list(),
                dtype=tf.float32,
                trainable=False,
                initializer=tf.zeros_initializer())

            # Standard Adam update.
            next_m = (
                    tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
            next_v = (
                    tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
                                                              tf.square(grad)))

            update = next_m / (tf.sqrt(next_v) + self.epsilon)

            # Just adding the square of the weights to the loss function is *not*
            # the correct way of using L2 regularization/weight decay with Adam,
            # since that will interact with the m and v parameters in strange ways.
            #
            # Instead we want to decay the weights in a manner that doesn't interact
            # with the m/v parameters. This is equivalent to adding the square
            # of the weights to the loss with plain (non-momentum) SGD.
            if self._do_use_weight_decay(param_name):
                update += self.weight_decay_rate * param

            update_with_lr = self.learning_rate * update

            next_param = param - update_with_lr

            assignments.extend(
                [param.assign(next_param),
                 m.assign(next_m),
                 v.assign(next_v)])
        return tf.group(*assignments, name=name) 
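Unlike the other examples, this apply_gradients implementation bypasses keras.backend and issues raw TensorFlow assignments. With the TensorFlow 1.x backend the two styles coincide: K.update(x, new_x) is implemented as tf.assign(x, new_x), and K.update_add / K.update_sub as tf.assign_add / tf.assign_sub. A small equivalence sketch:

import tensorflow as tf
from keras import backend as K

v = K.variable(0., name='v')
inc_keras = K.update_add(v, 1.)   # backend-agnostic spelling
inc_tf = tf.assign_add(v, 1.)     # TF-backend equivalent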
Example 22
Project: keras_radam   Author: forin-xyz   File: radam.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay:
            lr = lr * (1. / (1. + self.decay * K.cast(
                self.iterations, K.dtype(self.decay)
            )))

        t = K.cast(self.iterations, K.floatx()) + 1.
        beta_1 = self.beta_1
        beta_2 = self.beta_2
        beta_1_t = K.pow(beta_1, t)
        beta_2_t = K.pow(beta_2, t)
        rho_inf = 2. / (1. - beta_2) - 1.
        rho_t = rho_inf - 2. * t * beta_2_t / (1. - beta_2_t)
        r_t = K.sqrt(
            K.relu(rho_t - 4.) * (rho_t - 2.) * rho_inf / (
                (rho_inf - 4.) * (rho_inf - 2.) * rho_t )
        )
        flag = K.cast(rho_t > 4., K.floatx())

        ms = [K.zeros(K.int_shape(p)) for p in params]
        vs = [K.zeros(K.int_shape(p)) for p in params]

        self.weights = [self.iterations] + ms + vs
        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = beta_1 * m + (1. - beta_1) * g
            v_t = beta_2 * v + (1. - beta_2) * K.square(g)

            m_hat_t = m_t / (1. - beta_1_t)
            v_hat_t = K.sqrt(v_t / (1. - beta_2_t))
            # when rho_t <= 4 (flag == 0), r_t is 0 and this reduces to an
            # unrectified Adam step
            new_p = p - lr * (r_t / (v_hat_t + self.epsilon) + 1. - flag) * m_hat_t

            if getattr(p, "constraint", None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
        return self.updates 
Example 23
Project: keras-adamw   Author: OverLordGoldDragon   File: optimizers.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        self.updates.append(K.update_add(self.t_cur, 1))

        lr = self.learning_rate
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))
        # momentum
        shapes = [K.int_shape(p) for p in params]
        moments = [K.zeros(shape, name='moment_' + str(i))
                   for (i, shape) in enumerate(shapes)]
        self.weights = [self.iterations] + moments

        total_iterations = self.total_iterations
        # Cosine annealing
        if self.use_cosine_annealing and total_iterations != 0:
            self.eta_t = _compute_eta_t(self)
        self.lr_t = lr * self.eta_t  # for external tracking

        for p, g, m in zip(params, grads, moments):
            # Learning rate multipliers
            lr_t = self.learning_rate
            if self.lr_multipliers is not None:
                lr_t = _apply_lr_multiplier(self, lr_t, p)

            v = self.momentum * m - self.eta_t * lr_t * g  # velocity
            self.updates.append(K.update(m, v))

            if self.nesterov:
                p_t = p + self.momentum * v - self.eta_t * lr_t * g
            else:
                p_t = p + v

            # Weight decays
            if p.name in self.weight_decays.keys() and total_iterations != 0:
                p_t = _apply_weight_decays(self, p, p_t)
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))

        self._init_notified = True
        return self.updates 
Example 24
Project: keras-adamw   Author: OverLordGoldDragon   File: optimizers_225.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        self.updates.append(K.update_add(self.t_cur, 1))

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]

        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        total_iterations = self.total_iterations
        # Cosine annealing
        if self.use_cosine_annealing and total_iterations != 0:
            self.eta_t = _compute_eta_t(self)
        self.lr_t = lr_t * self.eta_t  # for external tracking
        lr_t_premult = lr_t

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            # Learning rate multipliers
            if self.lr_multipliers is not None:
                lr_t = _apply_lr_multiplier(self, lr_t_premult, p)

            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            # Weight decays
            if p.name in self.weight_decays.keys() and total_iterations != 0:
                p_t = _apply_weight_decays(self, p, p_t)
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))

        self._init_notified = True
        return self.updates 
Example 25
Project: keras-adamw   Author: OverLordGoldDragon   File: optimizers_225.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        self.updates.append(K.update_add(self.t_cur, 1))

        t = K.cast(self.iterations, K.floatx()) + 1

        # Due to the recommendations in [2], i.e. warming momentum schedule
        momentum_cache_t = self.beta_1 * (1. - 0.5 * (
            K.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
        momentum_cache_t_1 = self.beta_1 * (1. - 0.5 * (
            K.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
        m_schedule_new = self.m_schedule * momentum_cache_t
        m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
        self.updates.append((self.m_schedule, m_schedule_new))

        shapes = [K.int_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]

        self.weights = [self.iterations] + ms + vs

        total_iterations = self.total_iterations
        # Cosine annealing
        if self.use_cosine_annealing and total_iterations != 0:
            self.eta_t = _compute_eta_t(self)
        self.lr_t = self.lr * self.eta_t  # for external tracking

        for p, g, m, v in zip(params, grads, ms, vs):
            # Learning rate multipliers
            lr_t = self.lr
            if self.lr_multipliers is not None:
                lr_t = _apply_lr_multiplier(self, lr_t, p)

            # the following equations given in [1]
            g_prime = g / (1. - m_schedule_new)
            m_t = self.beta_1 * m + (1. - self.beta_1) * g
            m_t_prime = m_t / (1. - m_schedule_next)
            v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
            v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
            m_t_bar = (1. - momentum_cache_t) * g_prime + (
                momentum_cache_t_1 * m_t_prime)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            p_t = p - self.eta_t * lr_t * m_t_bar / (
                    K.sqrt(v_t_prime) + self.epsilon)

            # Weight decays
            if p.name in self.weight_decays.keys() and total_iterations != 0:
                p_t = _apply_weight_decays(self, p, p_t)
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))

        self._init_notified = True
        return self.updates 
Example 26
Project: keras-adamw   Author: OverLordGoldDragon   File: optimizers_225.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        self.updates.append(K.update_add(self.t_cur, 1))

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))
        # momentum
        shapes = [K.int_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + moments

        total_iterations = self.total_iterations
        # Cosine annealing
        if self.use_cosine_annealing and total_iterations != 0:
            self.eta_t = _compute_eta_t(self)
        self.lr_t = lr * self.eta_t  # for external tracking

        for p, g, m in zip(params, grads, moments):
            # Learning rate multipliers
            lr_t = self.lr
            if self.lr_multipliers is not None:
                lr_t = _apply_lr_multiplier(self, lr_t, p)

            v = self.momentum * m - self.eta_t * lr_t * g  # velocity
            self.updates.append(K.update(m, v))

            if self.nesterov:
                p_t = p + self.momentum * v - self.eta_t * lr_t * g
            else:
                p_t = p + v

            # Weight decays
            if p.name in self.weight_decays.keys() and total_iterations != 0:
                p_t = _apply_weight_decays(self, p, p_t)
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))

        self._init_notified = True
        return self.updates 
Example 27
Project: keras_extension   Author: k1414st   File: optimizers.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

            eta_l_t = self.terminal_bound - \
                (self.terminal_bound - self.lower_bound) / \
                ((1. - self.beta_2) * t + 1)
            eta_u_t = self.terminal_bound + \
                (self.upper_bound - self.terminal_bound) / \
                ((1. - self.beta_2) * t)

            clipped_lr_t = K.minimum(
                K.maximum(lr_t / (K.sqrt(v_t) + self.epsilon), eta_l_t), eta_u_t)
            p_t = p - clipped_lr_t * m_t

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 28
Project: keras_extension   Author: k1414st   File: optimizers.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

            eta_l_t = self.terminal_bound - \
                (self.terminal_bound - self.lower_bound) / \
                ((1. - self.beta_2) * t + 1)
            eta_u_t = self.terminal_bound + \
                (self.upper_bound - self.terminal_bound) / \
                ((1. - self.beta_2) * t)

            clipped_lr_t = K.minimum(
                K.maximum(lr_t / (K.sqrt(v_t) + self.epsilon), eta_l_t), eta_u_t)
            p_t = p - clipped_lr_t * m_t

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 29
Project: Coloring-greyscale-images   Author: emilwallner   File: AdamAccumulate.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr

        completed_updates = K.cast(K.tf.floor(self.iterations / self.accum_iters), K.floatx())

        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * completed_updates))

        t = completed_updates + 1

        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))

        # self.iterations incremented after processing a batch
        # batch:              1 2 3 4 5 6 7 8 9
        # self.iterations:    0 1 2 3 4 5 6 7 8
        # update_switch = 1:        x       x    (if accum_iters=4)  
        update_switch = K.equal((self.iterations + 1) % self.accum_iters, 0)
        update_switch = K.cast(update_switch, K.floatx())

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        gs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]

        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]

        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat, tg in zip(params, grads, ms, vs, vhats, gs):

            sum_grad = tg + g
            avg_grad = sum_grad / self.accum_iters_float

            m_t = (self.beta_1 * m) + (1. - self.beta_1) * avg_grad
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(avg_grad)

            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, (1 - update_switch) * vhat + update_switch * vhat_t))
            else:
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, (1 - update_switch) * m + update_switch * m_t))
            self.updates.append(K.update(v, (1 - update_switch) * v + update_switch * v_t))
            self.updates.append(K.update(tg, (1 - update_switch) * sum_grad))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, (1 - update_switch) * p + update_switch * new_p))
        return self.updates 
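The update_switch gating above makes each K.update write back the old value on accumulation steps and apply the real Adam step only on every accum_iters-th batch, so accum_iters consecutive gradients are averaged per parameter update. A hedged usage sketch; the constructor is assumed to mirror Keras's Adam with an extra accum_iters argument, as the excerpt suggests:

# Emulates a 4x larger effective batch size on memory-limited hardware
# (assumed constructor signature):
model.compile(optimizer=AdamAccumulate(lr=1e-3, accum_iters=4),
              loss='mse')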
Example 30
Project: keras-optimizers   Author: hi-im-ryanli   File: frankenstein.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        t = K.cast(self.iterations, K.floatx()) + 1

        # Due to the recommendations in [2], i.e. warming momentum schedule
        momentum_cache_t = self.beta_1 * (1. - 0.5 * (K.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
        momentum_cache_t_1 = self.beta_1 * (1. - 0.5 * (K.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
        m_schedule_new = self.m_schedule * momentum_cache_t
        m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
        self.updates.append((self.m_schedule, m_schedule_new))

        shapes = [K.int_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]

        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            # the following equations given in [1]
            g_prime = g / (1. - m_schedule_new)
            m_t = self.beta_1 * m + (1. - self.beta_1) * g
            m_t_prime = m_t / (1. - m_schedule_next)
            if np.random.choice([1, -1]) == 1:
                v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
            else:
                v_t = K.maximum(self.beta_2 * v, K.abs(g))
            v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
            m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 31
Project: keras-optimizers   Author: hi-im-ryanli   File: frankenstein.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        shapes = [K.int_shape(p) for p in params]
        accumulators = [K.zeros(shape) for shape in shapes]
        delta_accumulators = [K.zeros(shape) for shape in shapes]

        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]

        self.weights = [self.iterations] + ms + vs

        for p, g, m, v, a, d_a in zip(params, grads, ms, vs, accumulators, delta_accumulators):
            # update accumulator
            new_a = self.rho * a + (1. - self.rho) * K.square(g)
            self.updates.append(K.update(a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)

            m_t = (self.beta_1 * m) + (1. - self.beta_1) * update
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(update)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 32
Project: FormicID   Author: naturalis   File: optimizer.py    MIT License
def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)

        self.updates = [K.update_add(self.iterations, 1)]
        t = self.iterations + 1

        shapes = [K.get_variable_shape(p) for p in params]
        ms = [K.zeros(shape) for shape in shapes]
        vs = [K.zeros(shape) for shape in shapes]

        loss_prev = K.variable(0)
        self.updates.append(K.update(loss_prev, loss))

        # Calculate the numerator of the Eve coefficient
        d_num_t = K.abs(loss_prev - loss)
        self.updates.append(K.update(self.d_num, d_num_t))

        # Calculate the denominator of the Eve coefficient
        d_den_t = K.abs(K.minimum(loss_prev, loss) - self.loss_min)
        self.updates.append(K.update(self.d_den, d_den_t))

        # Calculate the Eve coefficient. At the first iteration, it is 1.
        d_tilde_t = K.clip(
            (d_num_t + self.fmin_pos) / (d_den_t + self.fmin_pos),
            1. / self.c,
            self.c,
        )
        d_t = (self.beta_3 * self.d) + (1. - self.beta_3) * d_tilde_t
        d_t = K.switch(K.greater(t, 1), d_t, K.constant(1))
        self.updates.append(K.update(self.d, d_t))

        # Calculate the effective learning rate as lr / (d * decay)
        lr_eff_t = self.lr / (d_t * (1. + (self.iterations * self.decay)))
        self.updates.append(K.update(self.lr_eff, lr_eff_t))

        # Apply bias correction to the learning rate
        lr_hat_t = (
            lr_eff_t
            * K.sqrt(1. - K.pow(self.beta_2, t))
            / (1. - K.pow(self.beta_1, t))
        )

        # Update per parameter
        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            self.updates.append(K.update(m, m_t))

            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            self.updates.append(K.update(v, v_t))

            p_t = p - lr_hat_t * m_t / (K.sqrt(v_t) + self.epsilon)
            new_p = p_t
            # Apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 33
Project: keras-adabound   Author: titu1994   File: adabound.py    MIT License
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1

        # Applies bounds on actual learning rate
        step_size = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                          (1. - K.pow(self.beta_1, t)))

        final_lr = self.final_lr * lr / self.base_lr
        lower_bound = final_lr * (1. - 1. / (self.gamma * t + 1.))
        upper_bound = final_lr * (1. + 1. / (self.gamma * t))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsbound:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            # apply weight decay
            if self.weight_decay != 0.:
                g += self.weight_decay * K.stop_gradient(p)

            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

            if self.amsbound:
                vhat_t = K.maximum(vhat, v_t)
                denom = (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                denom = (K.sqrt(v_t) + self.epsilon)

            # Compute the bounds
            step_size_p = step_size * K.ones_like(denom)
            step_size_p_bound = step_size_p / denom
            bounded_lr_t = m_t * K.minimum(K.maximum(step_size_p_bound,
                                                     lower_bound), upper_bound)

            p_t = p - bounded_lr_t

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 34
Project: CDSGD   Author: SCSLabISU   File: EASGD.py    BSD 3-Clause "New" or "Revised" License
def get_updates(self, params, constraints, loss):
        parameter_mean=globals()["parameter_mean"]
        epoch_count=globals()["epoch_count"]

        copy_shapes = [K.get_variable_shape(p) for p in params]
        parameter_copy = [K.zeros(copy_shape) for copy_shape in copy_shapes]
        parameter_copy=params

        grads = self.get_gradients(loss, parameter_copy)
        self.updates = []

        lr = self.lr
        alpha=self.alpha
        communication_period=self.communication_period
        if self.initial_decay > 0:
            lr *= (1. / K.sqrt(1. + self.decay * self.iterations))
            self.updates.append(K.update_add(self.iterations, 1))

        # momentum
        shapes = [K.get_variable_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]



        self.weights = [self.iterations] + moments
        temp_param_mean=[]
        for p, pi, pm, g, m in zip(params,parameter_copy, parameter_mean, grads, moments):
        # for i in range(len(params)):
        #     p=params[i]
        #     p1=parameter_copy[i]
        #     pm=parameter_mean[i]
        #     g=grads[i]
        #     m=moments[i]

            # Momentum term
            #v = self.momentum * m - lr * g  # velocity
            v = - lr * g # no momentum

            self.updates.append(K.update(m, v))
            if self.nesterov:
                if (epoch_count%communication_period)==0:
                    new_pm = pm + alpha*(pi-pm)
                    new_p = p - alpha*(pi-pm) + self.momentum * v - lr * g
                    temp_param_mean.append(new_p)
                else:
                    new_p = p + self.momentum * v - lr * g
            else:
                if (epoch_count%communication_period)==0:
                    new_pm = pm + alpha*(pi-pm)
                    new_p = p - alpha*(pi-pm) + v
                    temp_param_mean.append(new_pm)
                else:
                    new_p = p + v
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)

            self.updates.append(K.update(p, new_p))
        parameter_mean=temp_param_mean
        return self.updates 
Example 35
Project: CDSGD   Author: SCSLabISU   File: CDSGD.py    BSD 3-Clause "New" or "Revised" License
def get_updates(self, params, constraints, loss):
        pi=self.pi
        n_agents=self.n_agents
        agent_id=self.agent_id
        parameters=globals()["parameters"]
        info_shapes = [K.get_variable_shape(p) for p in params]
        parameter_copy = [0 for nb in range(n_agents)]
        for nb in range(n_agents):
            parameter_copy[nb]=parameters[nb]
            if (nb==agent_id):
                parameter_copy[nb]=params


        grads = self.get_gradients(loss, params)
        self.updates = []

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / K.sqrt(1. + self.decay * self.iterations))
            self.updates.append(K.update_add(self.iterations, 1))

        # momentum
        shapes = [K.get_variable_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        dist_accumulator = [K.zeros(shape) for shape in shapes]



        self.weights = [self.iterations] + moments
        #for p, g, m, d, p_agents in zip(params, grads, moments, dist_accumulator, *parameter_copy):
        for i in range(len(params)):
            p=params[i]
            g=grads[i]
            m=moments[i]
            d=dist_accumulator[i]
            # Momentum term
            v = self.momentum * m - lr * g  # velocity
            #v = - lr * g # no momentum
            self.updates.append(K.update(m, v))
            if self.nesterov:
                for nb in range(n_agents):
                    d+=pi[nb][agent_id]*parameter_copy[nb][i]
                new_p = d + self.momentum * v - lr * g
            else:
                # This is for Debug only
                # if count>5:
                #     raise ValueError('parameters: ' + str(p1) + str(p2) + str(p3) + str(p4) + str(p5)  )

                for nb in range(n_agents):
                    d+=pi[nb][agent_id]*parameter_copy[nb][i]
                    #raise ValueError('pi:' + str(K.eval(pi)))
                new_p = d + v
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 36
Project: CDSGD   Author: SCSLabISU   File: CDMSGD.py    BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def get_updates(self, params, constraints, loss):
        pi = self.pi
        n_agents = self.n_agents
        agent_id = self.agent_id
        parameters = globals()["parameters"]
        info_shapes = [K.get_variable_shape(p) for p in params]
        parameter_copy = [0 for i in range(n_agents)]
        for nb in range(n_agents):
            parameter_copy[nb] = parameters[nb]
            if nb == agent_id:
                parameter_copy[nb] = params

        grads = self.get_gradients(loss, params)
        self.updates = []

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / K.sqrt(1. + self.decay * self.iterations))
            self.updates.append(K.update_add(self.iterations, 1))

        # momentum
        shapes = [K.get_variable_shape(p) for p in params]
        moments = [K.zeros(shape) for shape in shapes]
        dist_accumulator = [K.zeros(shape) for shape in shapes]

        self.weights = [self.iterations] + moments
        for i in range(len(params)):
            p = params[i]
            g = grads[i]
            m = moments[i]
            d = dist_accumulator[i]
            # Momentum term
            v = self.momentum * m - lr * g  # velocity
            self.updates.append(K.update(m, v))
            if self.nesterov:
                # Consensus step: mix in the pi-weighted parameters of all agents.
                for nb in range(n_agents):
                    d += pi[nb][agent_id] * parameter_copy[nb][i]
                new_p = d + self.momentum * v - lr * g
            else:
                for nb in range(n_agents):
                    d += pi[nb][agent_id] * parameter_copy[nb][i]
                new_p = d + v
            # apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 37
Project: Keras-TextClassification   Author: yongzhuo   File: keras_lookahead.py    MIT License 4 votes vote down vote up
def inject(self, model):
        """Inject the Lookahead algorithm for the given model.
        The following code is modified from keras's _make_train_function method.
        See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497
        """
        if not hasattr(model, 'train_function'):
            raise RuntimeError('You must compile your model before using it.')

        model._check_trainable_weights_consistency()

        if model.train_function is None:
            inputs = (model._feed_inputs +
                      model._feed_targets +
                      model._feed_sample_weights)
            if model._uses_dynamic_learning_phase():
                inputs += [K.learning_phase()]
            fast_params = model._collected_trainable_weights

            with K.name_scope('training'):
                with K.name_scope(model.optimizer.__class__.__name__):
                    training_updates = model.optimizer.get_updates(
                        params=fast_params,
                        loss=model.total_loss)
                    slow_params = [K.variable(p) for p in fast_params]
                fast_updates = (model.updates +
                                training_updates +
                                model.metrics_updates)

                slow_updates, copy_updates = [], []
                for p, q in zip(fast_params, slow_params):
                    slow_updates.append(K.update(q, q + self.alpha * (p - q)))
                    copy_updates.append(K.update(p, q))

                # Gets loss and metrics. Updates weights at each call.
                fast_train_function = K.function(
                    inputs,
                    [model.total_loss] + model.metrics_tensors,
                    updates=fast_updates,
                    name='fast_train_function',
                    **model._function_kwargs)

                def F(inputs):
                    self.count += 1
                    R = fast_train_function(inputs)
                    if self.count % self.k == 0:
                        K.batch_get_value(slow_updates)
                        K.batch_get_value(copy_updates)
                    return R

                model.train_function = F 
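
The docstring above notes that inject() rebuilds Keras's train function so that the slow weights are synchronized every `k` batches. A hypothetical usage sketch, assuming the class exposes a `Lookahead(k, alpha)` constructor (not shown above) and that `model`, `x_train`, and `y_train` already exist:

from keras_lookahead import Lookahead   # assumed import path

model.compile(optimizer='adam', loss='categorical_crossentropy')
lookahead = Lookahead(k=5, alpha=0.5)   # sync slow weights every 5 batches
lookahead.inject(model)                 # swaps in the fast/slow train function
model.fit(x_train, y_train, epochs=5)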
Example 38
Project: keras_novograd   Author: titu1994   File: novograd.py    MIT License 4 votes vote down vote up
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))
        # momentum
        moments = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        grads_ema = [K.zeros([], dtype=K.floatx()) for _ in params]  # v^l_t

        if self.amsgrad:
            vhats = [K.zeros([], dtype=K.floatx()) for _ in grads_ema]
        else:
            vhats = [K.zeros([]) for _ in grads_ema]

        self.weights = [self.iterations] + moments + grads_ema + vhats

        for p, g, m, g_ema, vhat in zip(params, grads, moments, grads_ema, vhats):
            # compute ema for grads^2 for each layer
            g_2 = K.sum(K.square(x=K.cast(g, K.floatx())))
            g_ema_new = K.switch(K.equal(g_ema, 0.),
                                 g_2,
                                 g_ema * self.beta_2 + g_2 * (1.0 - self.beta_2))

            if self.amsgrad:
                g_ema_new = K.maximum(vhat, g_ema_new)
                self.updates.append(K.update(vhat, g_ema_new))

            g *= 1.0 / (K.sqrt(g_ema_new) + self.epsilon)

            # weight decay
            if self.weight_decay > 0.0:
                g += (self.weight_decay * p)

            # Momentum --> SAG
            if self.grad_averaging:
                g *= (1.0 - self.beta_1)

            m_t = self.beta_1 * m + g  # velocity

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(g_ema, g_ema_new))

            new_p = p - lr * m_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
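
Unlike Adam, the NovoGrad loop above keeps a single scalar second moment per parameter tensor (per layer), computed from the squared norm of the whole gradient. A NumPy sketch of one step under that scheme, with hypothetical hyperparameters and with weight decay and gradient averaging omitted:

import numpy as np

beta_1, beta_2, eps, lr = 0.95, 0.98, 1e-8, 0.01   # hypothetical values
g = np.array([0.3, -0.4])          # gradient for one layer
m = np.zeros_like(g)               # per-element first moment
v = 0.0                            # *scalar* second moment for the layer

g2 = np.sum(g ** 2)                # squared norm of the layer gradient
v = g2 if v == 0 else beta_2 * v + (1.0 - beta_2) * g2
m = beta_1 * m + g / (np.sqrt(v) + eps)
# the parameter step would then be: p -= lr * m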
Example 39
Project: keras-contrib   Author: keras-team   File: padam.py    MIT License 4 votes vote down vote up
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                denom = (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                denom = (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            # Partial momentum adaptation.
            new_p = p - (lr_t * (m_t / (denom ** (self.partial * 2))))

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 40
Project: keras-padam   Author: titu1994   File: padam.py    MIT License 4 votes vote down vote up
def __init__(self, lr=0.1, beta_1=0.9, beta_2=0.999,
                 epsilon=1e-8, decay=0., amsgrad=False,
                 partial=0.125, **kwargs):
        """ Partially adaptive momentum estimation method (Padam).

        # Arguments
            lr: float >= 0. Learning rate.
            beta_1: float, 0 < beta < 1. Generally close to 1.
            beta_2: float, 0 < beta < 1. Generally close to 1.
            epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
            decay: float >= 0. Learning rate decay over each update.
            amsgrad: boolean. Whether to apply the AMSGrad variant of this
                algorithm from the paper "On the Convergence of Adam and
                Beyond".
            partial: float, 0 <= partial <= 0.5. Partially adaptive parameter,
                with a maximum value of 0.5. Values larger than 0.5 will result
                in non-convergence. As `partial` tends to 0, Padam reduces to
                SGD, and as `partial` tends to 0.5, Padam reduces to AMSGrad.
                The default value was obtained empirically.

        # References
            - [Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks](https://arxiv.org/abs/1806.06763)
            - [Adam: A Method for Stochastic Optimization](https://arxiv.org/abs/1412.6980)
        """
        super(Padam, self).__init__(**kwargs)

        if partial < 0.0 or partial > 0.5:
            raise ValueError("`partial` must be a positive float between 0 and 0.5, "
                             "otherwise the optimizer may not converge.")

        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')

        if epsilon is None:
            epsilon = K.epsilon()
        self.epsilon = epsilon
        self.initial_decay = decay
        self.amsgrad = amsgrad
        self.partial = partial 
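
A hypothetical usage sketch for the Padam optimizer defined above; the import path and the `model` object are placeholders:

from padam import Padam   # assumed import path

# partial=0.125 (the default) interpolates between SGD-like behaviour
# (partial -> 0) and AMSGrad-like behaviour (partial -> 0.5).
model.compile(optimizer=Padam(lr=0.1, partial=0.125, amsgrad=True),
              loss='categorical_crossentropy',
              metrics=['accuracy'])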
Example 41
Project: keras-padam   Author: titu1994   File: padam.py    MIT License 4 votes vote down vote up
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                denom = (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                denom = (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            # Partial momentum adaptation
            new_p = p - (lr_t * (m_t / (denom ** (self.partial * 2))))

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))

        return self.updates 
Example 42
Project: faceswap   Author: deepfakes   File: optimizers.py    GNU General Public License v3.0 4 votes vote down vote up
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        # Pass off to CPU if requested
        if self.cpu_mode:
            with K.tf.device("/cpu:0"):
                ms, vs, vhats = self.update_1(params)
        else:
            ms, vs, vhats = self.update_1(params)

        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
Example 43
Project: StyleGAN-Keras   Author: manicman1999   File: adamlr.py    MIT License 4 votes vote down vote up
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        if self.amsgrad:
            vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        else:
            vhats = [K.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):

            # Learning rate multipliers
            if self.multipliers:
                multiplier = [mult for mult in self.multipliers if mult in p.name]
            else:
                multiplier = None
            if multiplier:
                new_lr_t = lr_t * self.multipliers[multiplier[0]]
                if self.debug_verbose:
                    print('Setting {} to learning rate {}'.format(multiplier[0], new_lr_t))
                    print(K.get_value(new_lr_t))
            else:
                new_lr_t = lr_t
                if self.debug_verbose:
                    print('No change in learning rate {}'.format(p.name))
                    print(K.get_value(new_lr_t))
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - new_lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - new_lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
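
The loop above scales lr_t for any parameter whose name contains a key of self.multipliers. A hypothetical usage sketch; the class name `AdamLR` and the `multipliers` constructor argument are assumptions inferred from the attributes used above:

optimizer = AdamLR(lr=0.001,
                   multipliers={'dense_1': 0.1,   # keys are matched as
                                'conv2d': 0.5})   # substrings of p.name
model.compile(optimizer=optimizer, loss='mse')    # `model` is a placeholder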
Example 44
Project: BERT_with_keras   Author: miroozyx   File: optimization.py    MIT License 4 votes vote down vote up
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = tf.train.polynomial_decay(
            self.lr,
            self.iterations,
            self.num_train_steps,
            end_learning_rate=0.0,
            power=1.0,
            cycle=False
        )

        # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
        # learning rate will be `global_step/num_warmup_steps * init_lr`.
        t = K.cast(self.iterations, K.floatx()) + 1
        warmup_percent_done = K.cast(t / self.num_warmup_steps, dtype=K.floatx())
        warmup_lr = self.lr * warmup_percent_done
        is_warmup = K.cast(t < self.num_warmup_steps, dtype=K.floatx())
        lr = ((1.0 - is_warmup) * lr) + is_warmup * warmup_lr

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]

        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            if self.bias_corrected:
                m_t /= 1 - K.pow(self.beta_1, t)
                v_t /= 1 - K.pow(self.beta_2, t)

            update = m_t / (K.sqrt(v_t) + self.epsilon)

            # Just adding the square of the weights to the loss function is *not*
            # the correct way of using L2 regularization/weight decay with Adam,
            # since that will interact with the m and v parameters in strange ways.
            #
            # Instead we want to decay the weights in a manner that doesn't interact
            # with the m/v parameters. This is equivalent to adding the square
            # of the weights to the loss with plain (non-momentum) SGD.
            param_name = self._get_variable_name(p.name)
            if self._do_use_weight_decay(param_name):
                update += self.weight_decay_rate * p

            p_t = p - lr * update

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates 
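
The schedule above blends a polynomial (here linear) decay with a linear warmup: before num_warmup_steps the learning rate ramps up as step/num_warmup_steps * init_lr, after which the decayed rate takes over. A pure-Python sketch of the effective rate, with hypothetical values:

def effective_lr(step, init_lr=1e-4, num_warmup_steps=100, num_train_steps=1000):
    decayed = init_lr * max(0.0, 1.0 - step / num_train_steps)  # power=1.0 decay to 0
    warmup = init_lr * (step / num_warmup_steps)                # linear warmup
    return warmup if step < num_warmup_steps else decayed

print(effective_lr(50))    # 5e-05, halfway through warmup
print(effective_lr(500))   # 5e-05, halfway through decay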
Example 45
Project: learning_to_adapt   Author: ondrejklejch   File: adamw.py    Apache License 2.0 4 votes vote down vote up
def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        # Bias corrections according to the Adam paper.
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

            # Schedule multiplier eta_t = 1 for simple AdamW.
            # According to the AdamW paper, eta_t can be fixed, decay, or
            # also be used for warm restarts (AdamWR to come).
            eta_t = 1.
            p_t = p - eta_t * (lr_t * m_t / (K.sqrt(v_t) + self.epsilon))
            if self.weight_decay != 0:
                # Normalized weight decay according to the AdamW paper.
                w_d = self.weight_decay * K.sqrt(self.batch_size / (self.samples_per_epoch * self.epochs))
                p_t = p_t - eta_t * (w_d * p)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
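
The normalized weight decay above scales the raw weight_decay by sqrt(batch_size / (samples_per_epoch * epochs)), as suggested in the AdamW paper. A worked example with hypothetical numbers:

import math

weight_decay = 0.025
batch_size, samples_per_epoch, epochs = 128, 50000, 100
w_d = weight_decay * math.sqrt(batch_size / (samples_per_epoch * epochs))
print(w_d)   # 0.025 * sqrt(2.56e-05) ≈ 1.26e-04 applied per step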