Revert D29241736: [pytorch][PR] To add Rectified Adam Algorithm to Optimizers
Test Plan: revert-hammer
Differential Revision: D29241736 (0d2a936176)
Original commit changeset: 288b9b1f3125
fbshipit-source-id: 56c4ec98647c6f1822b130726741a1c9ca193670
parent 99ca2c5b4b
commit 1abf45e37f
@@ -132,7 +132,6 @@ Algorithms
     Adamax
     ASGD
     LBFGS
-    RAdam
     RMSprop
     Rprop
     SGD
@@ -25,17 +25,6 @@
             {"learningRate": 1e-4, "weightDecay": 0.1}
         ]
     },
-    {
-        "algorithm": "radam",
-        "config": [
-            {},
-            {"learningRate": 1e-4},
-            {"learningRate": 1e-4, "beta1": 0.92},
-            {"learningRate": 1e-4, "beta1": 0.92, "beta2": 0.96},
-            {"learningRate": 1e-4, "beta1": 0.92, "beta2": 0.96, "epsilon": 1e-3},
-            {"learningRate": 1e-4, "weightDecay": 0.1}
-        ]
-    },
     {
         "algorithm": "adamw",
         "config": [
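
Note: the deleted entry swept RAdam hyperparameters for the harness that reads this JSON, using the legacy key spelling (learningRate, weightDecay, beta1, beta2, epsilon). As a rough, hypothetical illustration only (this loader is not part of the repository), one such entry maps onto torch.optim keyword names like so:

import torch
import torch.optim as optim

# Hypothetical translation of one removed config entry; left-hand keys are the
# JSON spelling, right-hand names are the torch.optim keyword arguments.
entry = {"learningRate": 1e-4, "beta1": 0.92, "beta2": 0.96, "epsilon": 1e-3}

kwargs = {
    "lr": entry.get("learningRate", 1e-3),
    "betas": (entry.get("beta1", 0.9), entry.get("beta2", 0.999)),
    "eps": entry.get("epsilon", 1e-8),
    "weight_decay": entry.get("weightDecay", 0),
}

params = [torch.nn.Parameter(torch.randn(3, 3))]
opt = optim.RAdam(params, **kwargs)  # assumes a build that still ships RAdam
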
@@ -533,29 +533,6 @@ class TestOptim(TestCase):
             with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 1: 1.0"):
                 optimizer(None, lr=1e-2, betas=(0.0, 1.0))
 
-    def test_radam(self):
-        self._test_basic_cases(
-            lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3)
-        )
-        self._test_basic_cases(
-            lambda weight, bias: optim.RAdam(
-                self._build_params_dict(weight, bias, lr=1e-2),
-                lr=1e-3)
-        )
-        self._test_basic_cases(
-            lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3, weight_decay=0.1)
-        )
-        self._test_basic_cases(
-            lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3),
-            [lambda opt: ExponentialLR(opt, gamma=0.9),
-             lambda opt: ReduceLROnPlateau(opt)]
-        )
-        with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"):
-            optim.RAdam(None, lr=1e-2, betas=(1.0, 0.0))
-
-        with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"):
-            optim.RAdam(None, lr=1e-2, weight_decay=-1)
-
     def test_rmsprop(self):
         for optimizer in [optim.RMSprop, optim_mt.RMSprop]:
             self._test_basic_cases(
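
Note: the deleted test covered both scheduler integration and the constructor's argument validation. A minimal sketch of the validation half, assuming a build that still provides torch.optim.RAdam:

import torch.optim as optim

# The constructor validates hyperparameters before it ever touches `params`,
# so passing None is enough to reach the ValueError paths the test checked.
try:
    optim.RAdam(None, lr=1e-2, betas=(1.0, 0.0))
except ValueError as err:
    print(err)  # Invalid beta parameter at index 0: 1.0

try:
    optim.RAdam(None, lr=1e-2, weight_decay=-1)
except ValueError as err:
    print(err)  # Invalid weight_decay value: -1
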
@@ -13,7 +13,6 @@ from .sparse_adam import SparseAdam
 from .adamax import Adamax
 from .asgd import ASGD
 from .sgd import SGD
-from .radam import RAdam
 from .rprop import Rprop
 from .rmsprop import RMSprop
 from .optimizer import Optimizer
@@ -29,7 +28,6 @@ del sparse_adam
 del adamax
 del asgd
 del sgd
-del radam
 del rprop
 del rmsprop
 del optimizer
@@ -8,7 +8,6 @@ from .adamw import AdamW as AdamW
 from .asgd import ASGD as ASGD
 from .lbfgs import LBFGS as LBFGS
 from .optimizer import Optimizer as Optimizer
-from .radam import RAdam as RAdam
 from .rmsprop import RMSprop as RMSprop
 from .rprop import Rprop as Rprop
 from .sgd import SGD as SGD
@@ -357,53 +357,3 @@ def asgd(params: List[Tensor],
             ax.add_(param.sub(ax).mul(mu))
         else:
             ax.copy_(param)
-
-
-def radam(params: List[Tensor],
-          grads: List[Tensor],
-          exp_avgs: List[Tensor],
-          exp_avg_sqs: List[Tensor],
-          state_steps: List[int],
-          *,
-          beta1: float,
-          beta2: float,
-          lr: float,
-          weight_decay: float,
-          eps: float):
-    r"""Functional API that performs RAdam algorithm computation.
-
-    See :class:`~torch.optim.Adam` for details.
-    """
-
-    for i, param in enumerate(params):
-        grad = grads[i]
-        exp_avg = exp_avgs[i]
-        exp_avg_sq = exp_avg_sqs[i]
-        step = state_steps[i]
-
-        bias_correction1 = 1 - beta1 ** step
-        bias_correction2 = 1 - beta2 ** step
-
-        if weight_decay != 0:
-            grad = grad.add(param, alpha=weight_decay)
-
-        # Decay the first and second moment running average coefficient
-        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
-        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-
-        # correcting bias for the first moving moment
-        bias_corrected_exp_avg = exp_avg / bias_correction1
-
-        # maximum length of the approximated SMA
-        rho_inf = 2 / (1 - beta2) - 1
-        # compute the length of the approximated SMA
-        rho_t = rho_inf - 2 * step * (beta2 ** step) / bias_correction2
-
-        if rho_t > 5.:
-            # Compute the variance rectification term and update parameters accordingly
-            rect = math.sqrt((rho_t - 4) * (rho_t - 2) * rho_inf / ((rho_inf - 4) * (rho_inf - 2) * rho_t))
-            adaptive_lr = math.sqrt(bias_correction2) / exp_avg_sq.sqrt().add_(eps)
-
-            param.add_(bias_corrected_exp_avg * lr * adaptive_lr * rect, alpha=-1.0)
-        else:
-            param.add_(bias_corrected_exp_avg * lr, alpha=-1.0)
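
Note: the functional kernel removed above (torch/optim/_functional.py) implements the rectified update of Liu et al. (arXiv:1908.03265). In the notation of the code (t = step, m_t = exp_avg, v_t = exp_avg_sq, θ = param), the quantities it computes are:

\[
\rho_{\infty} = \frac{2}{1 - \beta_2} - 1, \qquad
\rho_t = \rho_{\infty} - \frac{2\, t\, \beta_2^{t}}{1 - \beta_2^{t}}, \qquad
\hat{m}_t = \frac{m_t}{1 - \beta_1^{t}}
\]

\[
r_t = \sqrt{\frac{(\rho_t - 4)(\rho_t - 2)\,\rho_{\infty}}{(\rho_{\infty} - 4)(\rho_{\infty} - 2)\,\rho_t}}, \qquad
\theta_t =
\begin{cases}
\theta_{t-1} - \mathrm{lr}\cdot r_t \cdot \hat{m}_t \cdot \dfrac{\sqrt{1 - \beta_2^{t}}}{\sqrt{v_t} + \epsilon} & \text{if } \rho_t > 5 \\[1ex]
\theta_{t-1} - \mathrm{lr}\cdot \hat{m}_t & \text{otherwise}
\end{cases}
\]

The ρ_t > 5 guard skips the adaptive term during the first few steps, when the variance of the second-moment estimate is not yet well characterized, and falls back to a plain bias-corrected momentum step.
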
@@ -1,92 +0,0 @@
-import torch
-from . import _functional as F
-from .optimizer import Optimizer
-
-
-class RAdam(Optimizer):
-    r"""Implements RAdam algorithm.
-    It has been proposed in `On the variance of the adaptive learning rate and beyond`_.
-    Args:
-        params (iterable): iterable of parameters to optimize or dicts defining
-            parameter groups
-        lr (float, optional): learning rate (default: 2e-3)
-        betas (Tuple[float, float], optional): coefficients used for computing
-            running averages of gradient and its square (default: (0.9, 0.999))
-        eps (float, optional): term added to the denominator to improve
-            numerical stability (default: 1e-8)
-        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
-    .. _On the variance of the adaptive learning rate and beyond:
-        https://arxiv.org/pdf/1908.03265.pdf
-    """
-
-    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
-                 weight_decay=0):
-        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
-        if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {}".format(eps))
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
-        if not 0.0 <= weight_decay:
-            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
-        super(RAdam, self).__init__(params, defaults)
-
-    @torch.no_grad()
-    def step(self, closure=None):
-        """Performs a single optimization step.
-        Args:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            with torch.enable_grad():
-                loss = closure()
-
-        for group in self.param_groups:
-            params_with_grad = []
-            grads = []
-            exp_avgs = []
-            exp_avg_sqs = []
-            max_exp_avg_sqs = []
-            state_steps = []
-            beta1, beta2 = group['betas']
-
-            for p in group['params']:
-                if p.grad is not None:
-                    params_with_grad.append(p)
-                    if p.grad.is_sparse:
-                        raise RuntimeError('RAdam does not support sparse gradients')
-                    grads.append(p.grad)
-
-                    state = self.state[p]
-                    # Lazy state initialization
-                    if len(state) == 0:
-                        state['step'] = 0
-                        # Exponential moving average of gradient values
-                        state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-                        # Exponential moving average of squared gradient values
-                        state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
-                    exp_avgs.append(state['exp_avg'])
-                    exp_avg_sqs.append(state['exp_avg_sq'])
-
-                    # update the steps for each param group update
-                    state['step'] += 1
-                    # record the step after step update
-                    state_steps.append(state['step'])
-
-            F.radam(params_with_grad,
-                    grads,
-                    exp_avgs,
-                    exp_avg_sqs,
-                    state_steps,
-                    beta1=beta1,
-                    beta2=beta2,
-                    lr=group['lr'],
-                    weight_decay=group['weight_decay'],
-                    eps=group['eps'])
-        return loss
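
Note: the class deleted above (torch/optim/radam.py) follows the standard torch.optim interface, so it drops into an ordinary training loop. A minimal usage sketch, assuming a build where torch.optim.RAdam exists (i.e., before this revert or after the algorithm is re-landed); the model, data, and loop below are illustrative and not part of the commit:

import torch
import torch.optim as optim

# Toy model and data for illustration only.
model = torch.nn.Linear(10, 1)
criterion = torch.nn.MSELoss()
x, y = torch.randn(32, 10), torch.randn(32, 1)

# Constructor keywords match the signature shown in the removed file:
# lr, betas, eps, weight_decay.
opt = optim.RAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999),
                  eps=1e-8, weight_decay=0.1)

for _ in range(100):
    opt.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()
    opt.step()
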
@@ -1,5 +0,0 @@
-from typing import Tuple
-from .optimizer import _params_t, Optimizer
-
-class RAdam(Optimizer):
-    def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=...) -> None: ...