pytorch/torch/optim/adadelta.py
Edouard Delasalles e374dc1696 add step rate to adadelta (#568)
Scales `delta` before it is applied to the parameters in order to control the learning rate of the optimizer (inspired by the climin optimization library for Theano).
Also changed the link to the Adadelta paper to point to the right location.
2017-01-24 08:48:19 -05:00


from .optimizer import Optimizer


class Adadelta(Optimizer):
"""Implements Adadelta algorithm.
It has been proposed in `ADADELTA: An Adaptive Learning Rate Method`_.
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
rho (float, optional): coefficient used for computing a running average
of squared gradients (default: 0.9)
eps (float, optional): term added to the denominator to improve
numerical stability (default: 1e-6)
step_rate (float, optional): coefficient that scale delta before it is applied to the
parameters (default: 1)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
.. _ADADELTA\: An Adaptive Learning Rate Method:
https://arxiv.org/abs/1212.5701
"""

    def __init__(self, params, rho=0.9, eps=1e-6, step_rate=1, weight_decay=0):
        defaults = dict(rho=rho, eps=eps, weight_decay=weight_decay, step_rate=step_rate)
        super(Adadelta, self).__init__(params, defaults)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad.data
                state = self.state[id(p)]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['square_avg'] = grad.new().resize_as_(grad).zero_()
                    state['acc_delta'] = grad.new().resize_as_(grad).zero_()

                square_avg, acc_delta = state['square_avg'], state['acc_delta']
                rho, eps, step_rate = group['rho'], group['eps'], group['step_rate']

                state['step'] += 1

                if group['weight_decay'] != 0:
                    grad = grad.add(group['weight_decay'], p.data)

                # Running average of squared gradients:
                # E[g^2] <- rho * E[g^2] + (1 - rho) * g^2
                square_avg.mul_(rho).addcmul_(1 - rho, grad, grad)

                # Update: delta = step_rate * sqrt(E[dx^2] + eps) / sqrt(E[g^2] + eps) * g
                std = square_avg.add(eps).sqrt_()
                delta = acc_delta.add(eps).sqrt_().div_(std).mul_(grad).mul_(step_rate)
                p.data.sub_(delta)

                # Running average of squared updates:
                # E[dx^2] <- rho * E[dx^2] + (1 - rho) * delta^2
                acc_delta.mul_(rho).addcmul_(1 - rho, delta, delta)

        return loss
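

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module). It assumes
# the 2017-era PyTorch API used above (Variable-based autograd, Tensor.data)
# and uses a made-up quadratic objective purely for demonstration.
if __name__ == "__main__":
    import torch
    from torch.autograd import Variable

    # A single 10-element parameter vector; the loss below is minimized at w = 0.
    w = Variable(torch.randn(10), requires_grad=True)
    # step_rate < 1 shrinks every update; the other hyperparameters keep their defaults.
    optimizer = Adadelta([w], rho=0.9, eps=1e-6, step_rate=0.1)

    for _ in range(100):
        loss = (w * w).sum()   # toy quadratic loss
        loss.backward()        # populates w.grad
        optimizer.step()       # applies the step_rate-scaled Adadelta update
        optimizer.zero_grad()  # clear gradients before the next iteration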