"""Some standard gradient-based stochastic optimizers. These are just standard routines that don't make any use of autograd, though you could take gradients of these functions too if you want to do meta-optimization. These routines can optimize functions whose inputs are structured objects, such as dicts of numpy arrays.""" from __future__ import absolute_import from builtins import range import autograd.numpy as np from autograd.misc import flatten from autograd.wrap_util import wraps def unflatten_optimizer(optimize): """Takes an optimizer that operates on flat 1D numpy arrays and returns a wrapped version that handles trees of nested containers (lists/tuples/dicts) with arrays/scalars at the leaves.""" @wraps(optimize) def _optimize(grad, x0, callback=None, *args, **kwargs): _x0, unflatten = flatten(x0) _grad = lambda x, i: flatten(grad(unflatten(x), i))[0] if callback: _callback = lambda x, i, g: callback(unflatten(x), i, unflatten(g)) else: _callback = None return unflatten(optimize(_grad, _x0, _callback, *args, **kwargs)) return _optimize @unflatten_optimizer def sgd(grad, x, callback=None, num_iters=200, step_size=0.1, mass=0.9): """Stochastic gradient descent with momentum. grad() must have signature grad(x, i), where i is the iteration number.""" velocity = np.zeros(len(x)) for i in range(num_iters): g = grad(x, i) if callback: callback(x, i, g) velocity = mass * velocity - (1.0 - mass) * g x = x + step_size * velocity return x @unflatten_optimizer def rmsprop(grad, x, callback=None, num_iters=100, step_size=0.1, gamma=0.9, eps=10**-8): """Root mean squared prop: See Adagrad paper for details.""" avg_sq_grad = np.ones(len(x)) for i in range(num_iters): g = grad(x, i) if callback: callback(x, i, g) avg_sq_grad = avg_sq_grad * gamma + g**2 * (1 - gamma) x = x - step_size * g/(np.sqrt(avg_sq_grad) + eps) return x @unflatten_optimizer def adam(grad, x, callback=None, num_iters=100, step_size=0.001, b1=0.9, b2=0.999, eps=10**-8): """Adam as described in http://arxiv.org/pdf/1412.6980.pdf. It's basically RMSprop with momentum and some correction terms.""" m = np.zeros(len(x)) v = np.zeros(len(x)) for i in range(num_iters): g = grad(x, i) if callback: callback(x, i, g) m = (1 - b1) * g + b1 * m # First moment estimate. v = (1 - b2) * (g**2) + b2 * v # Second moment estimate. mhat = m / (1 - b1**(i + 1)) # Bias correction. vhat = v / (1 - b2**(i + 1)) x = x - step_size*mhat/(np.sqrt(vhat) + eps) return x