Revert D29241736: [pytorch][PR] To add Rectified Adam Algorithm to Optimizers
Test Plan: revert-hammer
Differential Revision: D29241736 (0d2a936176)
Original commit changeset: 288b9b1f3125
fbshipit-source-id: 56c4ec98647c6f1822b130726741a1c9ca193670
This commit is contained in:
parent 99ca2c5b4b
commit 1abf45e37f
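For context on what is being reverted: the original change exposed a Rectified Adam optimizer as torch.optim.RAdam. The following is a minimal sketch of how it was constructed and stepped, based on the removed test and class code in the diff below. The model, inputs, and targets here are placeholders (not part of the original change), and the snippet assumes a PyTorch build that ships RAdam (before this revert, or a later release that includes it).

    import torch
    import torch.optim as optim

    # Placeholder model and data, only to illustrate the reverted API surface.
    model = torch.nn.Linear(10, 1)
    inputs = torch.randn(4, 10)
    targets = torch.randn(4, 1)

    # Constructor signature as in the removed torch/optim code:
    # RAdam(params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0)
    opt = optim.RAdam(model.parameters(), lr=1e-3, weight_decay=0.1)

    loss = torch.nn.functional.mse_loss(model(inputs), targets)
    loss.backward()
    opt.step()       # applies the rectified Adam update shown in the diff below
    opt.zero_grad()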
@@ -132,7 +132,6 @@ Algorithms
     Adamax
     ASGD
     LBFGS
-    RAdam
     RMSprop
     Rprop
     SGD
@@ -25,17 +25,6 @@
             {"learningRate": 1e-4, "weightDecay": 0.1}
         ]
     },
-    {
-        "algorithm": "radam",
-        "config": [
-            {},
-            {"learningRate": 1e-4},
-            {"learningRate": 1e-4, "beta1": 0.92},
-            {"learningRate": 1e-4, "beta1": 0.92, "beta2": 0.96},
-            {"learningRate": 1e-4, "beta1": 0.92, "beta2": 0.96, "epsilon": 1e-3},
-            {"learningRate": 1e-4, "weightDecay": 0.1}
-        ]
-    },
     {
         "algorithm": "adamw",
         "config": [
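The removed benchmark entry above sweeps RAdam hyperparameters. A hedged sketch of how such an entry could be translated into optimizer keyword arguments follows; the key names ("learningRate", "beta1", "beta2", "epsilon", "weightDecay") come from the config above, but the translate_config helper and the mapping onto torch argument names are illustrative assumptions, not the benchmark harness itself.

    import torch
    import torch.optim as optim

    def translate_config(entry):
        # Assumed mapping from the benchmark-style keys above to
        # torch.optim.RAdam keyword arguments.
        kwargs = {}
        if "learningRate" in entry:
            kwargs["lr"] = entry["learningRate"]
        if "beta1" in entry or "beta2" in entry:
            kwargs["betas"] = (entry.get("beta1", 0.9), entry.get("beta2", 0.999))
        if "epsilon" in entry:
            kwargs["eps"] = entry["epsilon"]
        if "weightDecay" in entry:
            kwargs["weight_decay"] = entry["weightDecay"]
        return kwargs

    # e.g. the fifth removed entry:
    cfg = {"learningRate": 1e-4, "beta1": 0.92, "beta2": 0.96, "epsilon": 1e-3}
    params = [torch.nn.Parameter(torch.zeros(3))]  # placeholder parameters
    opt = optim.RAdam(params, **translate_config(cfg))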
@@ -533,29 +533,6 @@ class TestOptim(TestCase):
             with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 1: 1.0"):
                 optimizer(None, lr=1e-2, betas=(0.0, 1.0))
 
-    def test_radam(self):
-        self._test_basic_cases(
-            lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3)
-        )
-        self._test_basic_cases(
-            lambda weight, bias: optim.RAdam(
-                self._build_params_dict(weight, bias, lr=1e-2),
-                lr=1e-3)
-        )
-        self._test_basic_cases(
-            lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3, weight_decay=0.1)
-        )
-        self._test_basic_cases(
-            lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3),
-            [lambda opt: ExponentialLR(opt, gamma=0.9),
-             lambda opt: ReduceLROnPlateau(opt)]
-        )
-        with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"):
-            optim.RAdam(None, lr=1e-2, betas=(1.0, 0.0))
-
-        with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"):
-            optim.RAdam(None, lr=1e-2, weight_decay=-1)
-
     def test_rmsprop(self):
         for optimizer in [optim.RMSprop, optim_mt.RMSprop]:
             self._test_basic_cases(
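Beyond the basic convergence cases, the removed test also pins down constructor validation: each beta must lie in [0, 1) and weight_decay must be non-negative. A small sketch of the same checks outside the test harness (plain try/except instead of assertRaisesRegex; error messages quoted from the removed code, and again assuming a build that ships RAdam):

    import torch.optim as optim

    # Both calls below raise ValueError in the removed RAdam implementation.
    try:
        optim.RAdam(None, lr=1e-2, betas=(1.0, 0.0))  # beta1 must satisfy 0.0 <= beta1 < 1.0
    except ValueError as e:
        print(e)  # "Invalid beta parameter at index 0: 1.0"

    try:
        optim.RAdam(None, lr=1e-2, weight_decay=-1)   # weight_decay must be >= 0
    except ValueError as e:
        print(e)  # "Invalid weight_decay value: -1"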
@@ -13,7 +13,6 @@ from .sparse_adam import SparseAdam
 from .adamax import Adamax
 from .asgd import ASGD
 from .sgd import SGD
-from .radam import RAdam
 from .rprop import Rprop
 from .rmsprop import RMSprop
 from .optimizer import Optimizer
@@ -29,7 +28,6 @@ del sparse_adam
 del adamax
 del asgd
 del sgd
-del radam
 del rprop
 del rmsprop
 del optimizer
@@ -8,7 +8,6 @@ from .adamw import AdamW as AdamW
 from .asgd import ASGD as ASGD
 from .lbfgs import LBFGS as LBFGS
 from .optimizer import Optimizer as Optimizer
-from .radam import RAdam as RAdam
 from .rmsprop import RMSprop as RMSprop
 from .rprop import Rprop as Rprop
 from .sgd import SGD as SGD
@@ -357,53 +357,3 @@ def asgd(params: List[Tensor],
             ax.add_(param.sub(ax).mul(mu))
         else:
             ax.copy_(param)
-
-
-def radam(params: List[Tensor],
-          grads: List[Tensor],
-          exp_avgs: List[Tensor],
-          exp_avg_sqs: List[Tensor],
-          state_steps: List[int],
-          *,
-          beta1: float,
-          beta2: float,
-          lr: float,
-          weight_decay: float,
-          eps: float):
-    r"""Functional API that performs RAdam algorithm computation.
-
-    See :class:`~torch.optim.Adam` for details.
-    """
-
-    for i, param in enumerate(params):
-        grad = grads[i]
-        exp_avg = exp_avgs[i]
-        exp_avg_sq = exp_avg_sqs[i]
-        step = state_steps[i]
-
-        bias_correction1 = 1 - beta1 ** step
-        bias_correction2 = 1 - beta2 ** step
-
-        if weight_decay != 0:
-            grad = grad.add(param, alpha=weight_decay)
-
-        # Decay the first and second moment running average coefficient
-        exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
-        exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-
-        # correcting bias for the first moving moment
-        bias_corrected_exp_avg = exp_avg / bias_correction1
-
-        # maximum length of the approximated SMA
-        rho_inf = 2 / (1 - beta2) - 1
-        # compute the length of the approximated SMA
-        rho_t = rho_inf - 2 * step * (beta2 ** step) / bias_correction2
-
-        if rho_t > 5.:
-            # Compute the variance rectification term and update parameters accordingly
-            rect = math.sqrt((rho_t - 4) * (rho_t - 2) * rho_inf / ((rho_inf - 4) * (rho_inf - 2) * rho_t))
-            adaptive_lr = math.sqrt(bias_correction2) / exp_avg_sq.sqrt().add_(eps)
-
-            param.add_(bias_corrected_exp_avg * lr * adaptive_lr * rect, alpha=-1.0)
-        else:
-            param.add_(bias_corrected_exp_avg * lr, alpha=-1.0)
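The core of the removed update is the variance rectification: rho_inf = 2 / (1 - beta2) - 1 is the maximum length of the approximated simple moving average, rho_t shrinks it for early steps, and the rectification factor is applied only once rho_t > 5; otherwise the update falls back to an unrectified momentum-style step without the adaptive denominator. A standalone sketch of just that scalar computation, mirroring the removed lines (the function name and structure here are illustrative, not part of the reverted code):

    import math

    def radam_rectification(beta2: float, step: int):
        """Return (use_rectification, rect) for a given beta2 and step count,
        following the removed functional radam code above."""
        rho_inf = 2 / (1 - beta2) - 1                  # max length of the approximated SMA
        bias_correction2 = 1 - beta2 ** step
        rho_t = rho_inf - 2 * step * (beta2 ** step) / bias_correction2
        if rho_t > 5.:
            rect = math.sqrt((rho_t - 4) * (rho_t - 2) * rho_inf /
                             ((rho_inf - 4) * (rho_inf - 2) * rho_t))
            return True, rect
        return False, 1.0

    # With beta2 = 0.999 the rectified branch only kicks in after a few steps:
    for step in (1, 2, 3, 5, 10, 100):
        print(step, radam_rectification(0.999, step))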
@@ -1,92 +0,0 @@
-import torch
-from . import _functional as F
-from .optimizer import Optimizer
-
-
-class RAdam(Optimizer):
-    r"""Implements RAdam algorithm.
-    It has been proposed in `On the variance of the adaptive learning rate and beyond`_.
-    Args:
-        params (iterable): iterable of parameters to optimize or dicts defining
-            parameter groups
-        lr (float, optional): learning rate (default: 2e-3)
-        betas (Tuple[float, float], optional): coefficients used for computing
-            running averages of gradient and its square (default: (0.9, 0.999))
-        eps (float, optional): term added to the denominator to improve
-            numerical stability (default: 1e-8)
-        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
-    .. _On the variance of the adaptive learning rate and beyond:
-        https://arxiv.org/pdf/1908.03265.pdf
-    """
-
-    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
-                 weight_decay=0):
-        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
-        if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {}".format(eps))
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
-        if not 0.0 <= weight_decay:
-            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
-        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
-        super(RAdam, self).__init__(params, defaults)
-
-    @torch.no_grad()
-    def step(self, closure=None):
-        """Performs a single optimization step.
-        Args:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            with torch.enable_grad():
-                loss = closure()
-
-        for group in self.param_groups:
-            params_with_grad = []
-            grads = []
-            exp_avgs = []
-            exp_avg_sqs = []
-            max_exp_avg_sqs = []
-            state_steps = []
-            beta1, beta2 = group['betas']
-
-            for p in group['params']:
-                if p.grad is not None:
-                    params_with_grad.append(p)
-                    if p.grad.is_sparse:
-                        raise RuntimeError('RAdam does not support sparse gradients')
-                    grads.append(p.grad)
-
-                    state = self.state[p]
-                    # Lazy state initialization
-                    if len(state) == 0:
-                        state['step'] = 0
-                        # Exponential moving average of gradient values
-                        state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-                        # Exponential moving average of squared gradient values
-                        state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
-
-                    exp_avgs.append(state['exp_avg'])
-                    exp_avg_sqs.append(state['exp_avg_sq'])
-
-                    # update the steps for each param group update
-                    state['step'] += 1
-                    # record the step after step update
-                    state_steps.append(state['step'])
-
-            F.radam(params_with_grad,
-                    grads,
-                    exp_avgs,
-                    exp_avg_sqs,
-                    state_steps,
-                    beta1=beta1,
-                    beta2=beta2,
-                    lr=group['lr'],
-                    weight_decay=group['weight_decay'],
-                    eps=group['eps'])
-        return loss
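As a usage note on the closure path of the removed step(): when a closure is passed, it is re-evaluated under torch.enable_grad() before the parameter update, as shown in the deleted class above. A short sketch of that pattern, with a hypothetical weight/bias pair mirroring the ones used in the removed tests (again assuming a build that ships RAdam):

    import torch
    import torch.optim as optim

    # Placeholder parameters, analogous to the weight/bias pairs in the removed tests.
    weight = torch.nn.Parameter(torch.randn(5, 3))
    bias = torch.nn.Parameter(torch.randn(5))
    x = torch.randn(8, 3)

    opt = optim.RAdam([weight, bias], lr=1e-3, betas=(0.9, 0.999), eps=1e-8)

    def closure():
        # step() re-evaluates this under torch.enable_grad(), as in the removed step() above.
        opt.zero_grad()
        loss = (x @ weight.t() + bias).pow(2).mean()
        loss.backward()
        return loss

    loss = opt.step(closure)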
@@ -1,5 +0,0 @@
-from typing import Tuple
-from .optimizer import _params_t, Optimizer
-
-class RAdam(Optimizer):
-    def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=...) -> None: ...