Revert D29241736: [pytorch][PR] To add Rectified Adam Algorithm to Optimizers

Test Plan: revert-hammer

Differential Revision: D29241736 (0d2a936176)

Original commit changeset: 288b9b1f3125

fbshipit-source-id: 56c4ec98647c6f1822b130726741a1c9ca193670
Author: Sam Estep
Date: 2021-06-22 12:02:57 -07:00
Committed by: Facebook GitHub Bot
Parent: 99ca2c5b4b
Commit: 1abf45e37f

8 changed files with 0 additions and 185 deletions


@@ -132,7 +132,6 @@ Algorithms
Adamax
ASGD
LBFGS
RAdam
RMSprop
Rprop
SGD


@@ -25,17 +25,6 @@
{"learningRate": 1e-4, "weightDecay": 0.1}
]
},
{
"algorithm": "radam",
"config": [
{},
{"learningRate": 1e-4},
{"learningRate": 1e-4, "beta1": 0.92},
{"learningRate": 1e-4, "beta1": 0.92, "beta2": 0.96},
{"learningRate": 1e-4, "beta1": 0.92, "beta2": 0.96, "epsilon": 1e-3},
{"learningRate": 1e-4, "weightDecay": 0.1}
]
},
{
"algorithm": "adamw",
"config": [


@@ -533,29 +533,6 @@ class TestOptim(TestCase):
with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 1: 1.0"):
optimizer(None, lr=1e-2, betas=(0.0, 1.0))
def test_radam(self):
self._test_basic_cases(
lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3)
)
self._test_basic_cases(
lambda weight, bias: optim.RAdam(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-3)
)
self._test_basic_cases(
lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3, weight_decay=0.1)
)
self._test_basic_cases(
lambda weight, bias: optim.RAdam([weight, bias], lr=1e-3),
[lambda opt: ExponentialLR(opt, gamma=0.9),
lambda opt: ReduceLROnPlateau(opt)]
)
with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"):
optim.RAdam(None, lr=1e-2, betas=(1.0, 0.0))
with self.assertRaisesRegex(ValueError, "Invalid weight_decay value: -1"):
optim.RAdam(None, lr=1e-2, weight_decay=-1)
def test_rmsprop(self):
for optimizer in [optim.RMSprop, optim_mt.RMSprop]:
self._test_basic_cases(


@@ -13,7 +13,6 @@ from .sparse_adam import SparseAdam
from .adamax import Adamax
from .asgd import ASGD
from .sgd import SGD
from .radam import RAdam
from .rprop import Rprop
from .rmsprop import RMSprop
from .optimizer import Optimizer
@@ -29,7 +28,6 @@ del sparse_adam
del adamax
del asgd
del sgd
del radam
del rprop
del rmsprop
del optimizer


@@ -8,7 +8,6 @@ from .adamw import AdamW as AdamW
from .asgd import ASGD as ASGD
from .lbfgs import LBFGS as LBFGS
from .optimizer import Optimizer as Optimizer
from .radam import RAdam as RAdam
from .rmsprop import RMSprop as RMSprop
from .rprop import Rprop as Rprop
from .sgd import SGD as SGD


@@ -357,53 +357,3 @@ def asgd(params: List[Tensor],
ax.add_(param.sub(ax).mul(mu))
else:
ax.copy_(param)
def radam(params: List[Tensor],
grads: List[Tensor],
exp_avgs: List[Tensor],
exp_avg_sqs: List[Tensor],
state_steps: List[int],
*,
beta1: float,
beta2: float,
lr: float,
weight_decay: float,
eps: float):
r"""Functional API that performs RAdam algorithm computation.
See :class:`~torch.optim.RAdam` for details.
"""
for i, param in enumerate(params):
grad = grads[i]
exp_avg = exp_avgs[i]
exp_avg_sq = exp_avg_sqs[i]
step = state_steps[i]
bias_correction1 = 1 - beta1 ** step
bias_correction2 = 1 - beta2 ** step
if weight_decay != 0:
grad = grad.add(param, alpha=weight_decay)
# Decay the first and second moment running average coefficient
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
# correcting bias for the first moving moment
bias_corrected_exp_avg = exp_avg / bias_correction1
# maximum length of the approximated SMA
rho_inf = 2 / (1 - beta2) - 1
# compute the length of the approximated SMA
rho_t = rho_inf - 2 * step * (beta2 ** step) / bias_correction2
if rho_t > 5.:
# Compute the variance rectification term and update parameters accordingly
rect = math.sqrt((rho_t - 4) * (rho_t - 2) * rho_inf / ((rho_inf - 4) * (rho_inf - 2) * rho_t))
adaptive_lr = math.sqrt(bias_correction2) / exp_avg_sq.sqrt().add_(eps)
param.add_(bias_corrected_exp_avg * lr * adaptive_lr * rect, alpha=-1.0)
else:
param.add_(bias_corrected_exp_avg * lr, alpha=-1.0)
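The `rho_t > 5.` branch above is the variance rectification at the heart of RAdam. A standalone sketch of just that bookkeeping (plain Python, mirroring the formulas in the hunk above; the helper name and printed steps are illustrative) shows that with the default `beta2 = 0.999` the rectified adaptive update is only taken after the first few steps:

```python
import math

def radam_rectification(step, beta2=0.999):
    # Mirrors the rho_inf / rho_t / rect bookkeeping from the functional radam above.
    rho_inf = 2 / (1 - beta2) - 1                 # maximum length of the approximated SMA
    bias_correction2 = 1 - beta2 ** step
    rho_t = rho_inf - 2 * step * (beta2 ** step) / bias_correction2
    if rho_t > 5.:
        # Variance of the adaptive learning rate is tractable: rectified adaptive step.
        rect = math.sqrt((rho_t - 4) * (rho_t - 2) * rho_inf /
                         ((rho_inf - 4) * (rho_inf - 2) * rho_t))
        return rho_t, rect
    # Early steps: fall back to the un-rectified, momentum-only update.
    return rho_t, None

for step in (1, 5, 6, 100):
    print(step, radam_rectification(step))  # the rectified branch is first taken around step 6
```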


@@ -1,92 +0,0 @@
import torch
from . import _functional as F
from .optimizer import Optimizer
class RAdam(Optimizer):
r"""Implements RAdam algorithm.
It has been proposed in `On the variance of the adaptive learning rate and beyond`_.
Args:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
lr (float, optional): learning rate (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
.. _On the variance of the adaptive learning rate and beyond:
https://arxiv.org/pdf/1908.03265.pdf
"""
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
weight_decay=0):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
if not 0.0 <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps))
if not 0.0 <= betas[0] < 1.0:
raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
if not 0.0 <= betas[1] < 1.0:
raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
if not 0.0 <= weight_decay:
raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
super(RAdam, self).__init__(params, defaults)
@torch.no_grad()
def step(self, closure=None):
"""Performs a single optimization step.
Args:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()
for group in self.param_groups:
params_with_grad = []
grads = []
exp_avgs = []
exp_avg_sqs = []
max_exp_avg_sqs = []
state_steps = []
beta1, beta2 = group['betas']
for p in group['params']:
if p.grad is not None:
params_with_grad.append(p)
if p.grad.is_sparse:
raise RuntimeError('RAdam does not support sparse gradients')
grads.append(p.grad)
state = self.state[p]
# Lazy state initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
exp_avgs.append(state['exp_avg'])
exp_avg_sqs.append(state['exp_avg_sq'])
# update the steps for each param group update
state['step'] += 1
# record the step after step update
state_steps.append(state['step'])
F.radam(params_with_grad,
grads,
exp_avgs,
exp_avg_sqs,
state_steps,
beta1=beta1,
beta2=beta2,
lr=group['lr'],
weight_decay=group['weight_decay'],
eps=group['eps'])
return loss
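A minimal usage sketch of the class removed above, assuming a pre-revert build where `torch.optim.RAdam` is importable; the toy model, data, and hyperparameters are illustrative only:

```python
import torch
import torch.nn as nn
from torch import optim  # assumes a pre-revert build exposing optim.RAdam

model = nn.Linear(10, 1)
opt = optim.RAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0.1)

x = torch.randn(64, 10)   # toy regression data
y = torch.randn(64, 1)

for _ in range(100):
    opt.zero_grad()
    loss = nn.functional.mse_loss(model(x), y)
    loss.backward()
    opt.step()
```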


@@ -1,5 +0,0 @@
from typing import Tuple
from .optimizer import _params_t, Optimizer
class RAdam(Optimizer):
def __init__(self, params: _params_t, lr: float=..., betas: Tuple[float, float]=..., eps: float=..., weight_decay: float=...) -> None: ...