Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/62058
This is the second diff in this stack. This diff includes the changes to DPER3; the first diff includes the changes to Caffe2.
We want to decay learning parameters properly. Previously this was not done when a parameter was absent from a minibatch. We fix this by keeping track of missed minibatches and making the decay catch up accordingly. The exponential moving averages (EMA) for the first and second moments used in Adam are updated only for parameters seen in a minibatch. Ideally, for parameters absent from a minibatch, 0 should be added to the EMAs and the EMAs should then be decayed by multiplying by beta1 and beta2 respectively. To avoid the computational overhead of touching every parameter for every minibatch, we:
* keep track of the last time a parameter is seen
* instead of decaying the EMAs by multiplying by beta1 and beta2, multiply by beta1^k and beta2^k, where k is the number of minibatches since the parameter was last seen.
We hope this will significantly improve the inconsistent learning parameter issue we have seen with Adam.
Differential Revision: D29638897
fbshipit-source-id: 18d8e227d72c2e23010ca81e0f6eeb78872c8d3c
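Note on the catch-up rule described above: when a parameter reappears after k minibatches, the update is m <- beta1^k * m + (1 - beta1) * g for the first moment and v <- beta2^k * v + (1 - beta2) * g^2 for the second moment, which is equivalent to feeding a zero gradient for each missed minibatch without actually touching every parameter on every step.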
# @package optimizer
# Module caffe2.python.optimizer


import copy
import logging
from collections import defaultdict, namedtuple

import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core, scope, utils, workspace
from caffe2.python.modeling import parameter_info
from past.builtins import basestring


_LEARNING_RATE_INJECTION = "lr_injection"

AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"])
_optimizer_instance_count = defaultdict(int)

FP16_ENGINES = ["SIMD_Q_FP16", "SIMD_Q_STOC_FP16", "SIMD_Q_STOC_MKL_FP16"]

logger = logging.getLogger(__name__)

def reset_optimizer_instance_count():
|
|
"""
|
|
This function clears the _optimizer_instance_count. And keeps it
|
|
empty. This functionality is needed in some situations where
|
|
optimizer instance count might not reset even though the workplace is reset.
|
|
"""
|
|
_optimizer_instance_count.clear()
|
|
|
|
|
|
class Optimizer(object):
|
|
def __init__(self):
|
|
self._aux_params = AuxOptimizerParams(local=[], shared=[])
|
|
self._instance_num = _optimizer_instance_count[self.__class__.__name__]
|
|
_optimizer_instance_count[self.__class__.__name__] += 1
|
|
self._lr_multiplier = None
|
|
self._local_lr_multiplier = None
|
|
self._local_lr_multiplier_on_gpu = False
|
|
|
|
"""
|
|
Adds optimization operators to the net for given parameter and its gradient
|
|
Parameter is specified by either 'param' being a ParameterInfo object.
|
|
In this case param.grad has to be set
|
|
|
|
Or by 'param' being a BlobReference and 'grad' being a BlobReference for its
|
|
gradient.
|
|
"""
|
|
|
|
def __call__(self, net, param_init_net, param, grad=None):
|
|
if grad is None:
|
|
assert isinstance(
|
|
param, parameter_info.ParameterInfo
|
|
), "Expected parameter to be of type ParameterInfo, got {}".format(param)
|
|
assert param.grad is not None
|
|
else:
|
|
if isinstance(param, basestring):
|
|
param = core.BlobReference(param)
|
|
param = parameter_info.ParameterInfo(param_id=None, param=param, grad=grad)
|
|
|
|
self._run(net, param_init_net, param)
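    # Illustrative usage sketch (not executed here; the blob names and the
    # SgdOptimizer arguments are hypothetical):
    #
    #   opt = SgdOptimizer(base_learning_rate=0.1)
    #   # 1) pass a ParameterInfo whose .grad is already set
    #   opt(net, param_init_net, param_info)
    #   # 2) or pass the param blob (name or BlobReference) plus its gradient
    #   opt(net, param_init_net, "fc_w", grad="fc_w_grad")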
|
|
|
|
def _run(self, net, param_init_net, param_info):
|
|
        raise NotImplementedError("Subclasses must implement _run()")
|
|
|
|
def get_cpu_blob_name(self, base_str, node_name=""):
|
|
classname = self.__class__.__name__
|
|
return "%s_%d_%s%s_cpu" % (classname, self._instance_num, base_str, node_name)
|
|
|
|
def get_gpu_blob_name(self, base_str, gpu_id, node_name):
|
|
classname = self.__class__.__name__
|
|
return "%s_%d_%s%s_gpu%d" % (
|
|
classname,
|
|
self._instance_num,
|
|
base_str,
|
|
node_name,
|
|
gpu_id,
|
|
)
|
|
|
|
@property
|
|
def attributes(self):
|
|
# return a dict that contains attributes related to init args only
|
|
attr = copy.deepcopy(self.__dict__)
|
|
del attr["_instance_num"]
|
|
return attr
|
|
|
|
def make_unique_blob_name(self, base_str):
|
|
"""
|
|
Returns a blob name that will be unique to the current device
|
|
and optimizer instance.
|
|
"""
|
|
current_scope = scope.CurrentDeviceScope()
|
|
if current_scope is None:
|
|
return self.get_cpu_blob_name(base_str)
|
|
|
|
if core.IsGPUDeviceType(current_scope.device_type):
|
|
return self.get_gpu_blob_name(
|
|
base_str, current_scope.device_id, current_scope.node_name
|
|
)
|
|
else:
|
|
return self.get_cpu_blob_name(base_str, current_scope.node_name)
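    # For example, the second SgdOptimizer instance (instance_num 1) would
    # name its learning-rate blob "SgdOptimizer_1_lr_cpu" in a CPU scope and
    # "SgdOptimizer_1_lr_gpu0" in a GPU scope on device 0, with the node name
    # (if any) inserted just before the device suffix.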
|
|
|
|
def build_lr(
|
|
self,
|
|
net,
|
|
param_init_net,
|
|
base_learning_rate,
|
|
learning_rate_blob=None,
|
|
policy="fixed",
|
|
iter_val=0,
|
|
**kwargs
|
|
):
|
|
if learning_rate_blob is None:
|
|
learning_rate_blob = self.make_unique_blob_name("lr")
|
|
|
|
iteration = utils.BuildUniqueMutexIter(param_init_net, net, iter_val=iter_val)
|
|
|
|
if not net.BlobIsDefined(learning_rate_blob):
|
|
# There is one interesting thing here: since we are minimizing, we are
|
|
# doing "descent" so the learning rate is set to be negative.
|
|
lr = net.LearningRate(
|
|
[iteration],
|
|
learning_rate_blob,
|
|
base_lr=-base_learning_rate,
|
|
policy=policy,
|
|
**kwargs
|
|
)
|
|
else:
|
|
lr = net.GetBlobRef(learning_rate_blob)
|
|
|
|
if self._lr_multiplier is not None:
|
|
lr_multiplier = net.CopyFromCPUInput(
|
|
self._lr_multiplier, self.make_unique_blob_name("lr_multiplier")
|
|
)
|
|
|
|
lr = net.Mul(
|
|
[lr, lr_multiplier],
|
|
self.make_unique_blob_name("scaled_lr"),
|
|
broadcast=1,
|
|
)
|
|
|
|
if self._local_lr_multiplier is not None:
|
|
current_scope = scope.CurrentDeviceScope()
|
|
if (
|
|
current_scope is not None
|
|
and core.IsGPUDeviceType(current_scope.device_type)
|
|
and not self._local_lr_multiplier_on_gpu
|
|
):
|
|
local_lr_multiplier = net.CopyFromCPUInput(
|
|
self._local_lr_multiplier,
|
|
self.make_unique_blob_name("local_lr_multiplier"),
|
|
)
|
|
else:
|
|
local_lr_multiplier = self._local_lr_multiplier
|
|
|
|
lr = net.Mul(
|
|
[lr, local_lr_multiplier],
|
|
self.make_unique_blob_name("local_scaled_lr"),
|
|
broadcast=1,
|
|
)
|
|
|
|
return lr, iteration
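    # A minimal sketch of how a subclass typically calls build_lr; the
    # "step" policy kwargs (stepsize, gamma) shown here are assumptions about
    # the LearningRate operator's schedule arguments and are simply forwarded
    # through **kwargs:
    #
    #   lr, iteration = self.build_lr(
    #       net,
    #       param_init_net,
    #       base_learning_rate=self.alpha,
    #       policy="step",
    #       stepsize=10000,
    #       gamma=0.99,
    #   )
    #   # `lr` is the (negated) learning-rate blob; `iteration` is the shared
    #   # mutex-protected iteration counter.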
|
|
|
|
def add_lr_multiplier(self, lr_multiplier):
|
|
"""
|
|
        Set the global learning rate multiplier. If a multiplier already
        exists, it will be overwritten. The multiplier is used for all
        future calls to _run(), unless it is overwritten again.
|
|
"""
|
|
self._lr_multiplier = lr_multiplier
|
|
|
|
def _add_local_lr_multiplier(self, local_lr_multiplier, is_gpu_blob=False):
|
|
"""
|
|
Set the local learning rate multiplier. This local multiplier is
|
|
multiplied with the global learning rate multiplier if it exists. As
|
|
with the global learning rate multiplier, this multiplier will be
|
|
used for all future calls to _run(), so please call
|
|
_clear_local_lr_multiplier() at the beginning of the optimizer's _run()
|
|
before optionally calling this function.
|
|
"""
|
|
self._local_lr_multiplier = local_lr_multiplier
|
|
self._local_lr_multiplier_on_gpu = is_gpu_blob
|
|
|
|
def _clear_local_lr_multiplier(self):
|
|
self._local_lr_multiplier = None
|
|
self._local_lr_multiplier_on_gpu = False
|
|
|
|
@staticmethod
|
|
def dedup(net, sparse_dedup_aggregator, grad):
|
|
assert isinstance(
|
|
grad, core.GradientSlice
|
|
), "Dedup only works for sparse gradient, got {}".format(grad)
|
|
if sparse_dedup_aggregator:
|
|
return net.DeduplicateGradientSlices(
|
|
grad, aggregator=sparse_dedup_aggregator
|
|
)
|
|
else:
|
|
return grad
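    # Illustrative note: if grad.indices contains duplicates (e.g.
    # [2, 5, 2]), DeduplicateGradientSlices collapses the rows for index 2
    # using the configured aggregator ("sum" or "mean"), so each index
    # appears at most once before the sparse update op consumes the slice.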
|
|
|
|
def get_auxiliary_parameters(self):
|
|
"""Returns a list of auxiliary parameters.
|
|
|
|
Returns:
|
|
            aux_params: A namedtuple, AuxOptimizerParams.
|
|
|
|
aux_params.local stores a list of blobs. Each blob is a local
|
|
auxiliary parameter. A local auxiliary parameter is a parameter in
|
|
parallel to a learning rate parameter. Take adagrad as an example,
|
|
the local auxiliary parameter is the squared sum parameter, because
|
|
every learning rate has a squared sum associated with it.
|
|
|
|
aux_params.shared also stores a list of blobs. Each blob is a shared
|
|
auxiliary parameter. A shared auxiliary parameter is a parameter
|
|
that is shared across all the learning rate parameters. Take adam as
|
|
an example, the iteration parameter is a shared parameter, because
|
|
all the learning rates share the same iteration parameter.
|
|
"""
|
|
return self._aux_params
|
|
|
|
    # TODO(xlwang): In transfer learning, parameters initialized from a
    # pretrained model might require a different learning rate than
    # parameters initialized otherwise. To this end, we implement a Python
    # solution here where `base_learning_rate` is scaled by `scale` via
    # `scale_learning_rate`; alternatively, the same effect could be achieved
    # by rewriting the LearningRate operator in C++.
    # Note that it is the responsibility of each specific optimizer to decide
    # what logic should be used for `scale_learning_rate`.
|
|
def scale_learning_rate(self, *args, **kwargs):
|
|
raise NotImplementedError(
|
|
"Optimizer Need to Implement `scale_learning_rate` method."
|
|
)
|
|
|
|
def create_lars_inputs(self, param_init_net, weight_decay, trust, lr_max):
|
|
wd = param_init_net.ConstantFill(
|
|
[], "weight_decay", shape=[1], value=weight_decay
|
|
)
|
|
trust = param_init_net.ConstantFill([], "trust", shape=[1], value=trust)
|
|
lr_max = param_init_net.ConstantFill([], "lr_max", shape=[1], value=lr_max)
|
|
return wd, trust, lr_max
|
|
|
|
|
|
class SgdOptimizer(Optimizer):
|
|
def __init__(
|
|
self,
|
|
base_learning_rate=0.01,
|
|
policy="fixed",
|
|
momentum=0.0,
|
|
nesterov=True,
|
|
sparse_dedup_aggregator=None,
|
|
lars=None,
|
|
**kwargs
|
|
):
|
|
super(SgdOptimizer, self).__init__()
|
|
self.base_learning_rate = base_learning_rate
|
|
self.policy = policy
|
|
self.momentum = momentum
|
|
self.nesterov = nesterov
|
|
self.sparse_dedup_aggregator = sparse_dedup_aggregator
|
|
self.lars = lars
|
|
self.init_kwargs = kwargs
|
|
|
|
def _run(self, net, param_init_net, param_info):
|
|
param = param_info.blob
|
|
grad = param_info.grad
|
|
if self.base_learning_rate == 0:
|
|
return
|
|
assert (
|
|
self.base_learning_rate > 0
|
|
), "Expect positive base learning rate, got {}".format(self.base_learning_rate)
|
|
|
|
self._clear_local_lr_multiplier()
|
|
|
|
# TODO(zqq): support LARS for sparse parameters
|
|
if self.lars is not None and not isinstance(grad, core.GradientSlice):
|
|
assert self.lars >= 0, "Lars offset must be nonnegative, got {}".format(
|
|
self.lars
|
|
)
|
|
wd, trust, lr_max = self.create_lars_inputs(
|
|
param_init_net, 0.0, 1.0, np.finfo(np.float32).max
|
|
)
|
|
lr_lars_multiplier = net.Lars(
|
|
[param, grad, wd, trust, lr_max],
|
|
self.make_unique_blob_name(str(param) + "_lars"),
|
|
offset=self.lars,
|
|
lr_min=0.0,
|
|
)
|
|
current_scope = scope.CurrentDeviceScope()
|
|
self._add_local_lr_multiplier(
|
|
lr_lars_multiplier,
|
|
is_gpu_blob=(
|
|
current_scope is not None
|
|
and core.IsGPUDeviceType(current_scope.device_type)
|
|
),
|
|
)
|
|
|
|
# We need negative sign for LR when used directly with WeightedSum
|
|
# below.
|
|
lr_sign = -1 if self.momentum else 1
|
|
lr, _ = self.build_lr(
|
|
net,
|
|
param_init_net,
|
|
base_learning_rate=self.base_learning_rate * lr_sign,
|
|
policy=self.policy,
|
|
**(self.init_kwargs)
|
|
)
|
|
|
|
dev = scope.CurrentDeviceScope()
|
|
if dev is None:
|
|
dev = core.DeviceOption(caffe2_pb2.CPU)
|
|
|
|
# Each GPU/CPU must have its own ONE blob, thus modify the name
|
|
# to include device information.
|
|
ONE = param_init_net.ConstantFill(
|
|
[],
|
|
"ONE_{}_{}{}".format(dev.device_type, dev.device_id, dev.node_name),
|
|
shape=[1],
|
|
value=1.0,
|
|
)
|
|
|
|
self._aux_params.shared.append(ONE)
|
|
|
|
if self.momentum > 0:
|
|
momentum_data = param_init_net.ConstantFill(
|
|
param, str(param) + "_momentum", value=0.0
|
|
)
|
|
self._aux_params.local.append(momentum_data)
|
|
|
|
if isinstance(grad, core.GradientSlice):
|
|
grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
|
|
if self.momentum > 0.0:
|
|
net.SparseMomentumSGDUpdate(
|
|
[grad.values, momentum_data, lr, param, grad.indices],
|
|
[grad.values, momentum_data, param],
|
|
momentum=self.momentum,
|
|
nesterov=self.nesterov,
|
|
)
|
|
else:
|
|
net.ScatterWeightedSum(
|
|
[param, ONE, grad.indices, grad.values, lr], param
|
|
)
|
|
else:
|
|
if self.momentum > 0.0:
|
|
net.MomentumSGDUpdate(
|
|
[grad, momentum_data, lr, param],
|
|
[grad, momentum_data, param],
|
|
momentum=self.momentum,
|
|
nesterov=self.nesterov,
|
|
)
|
|
else:
|
|
coeff = lr
|
|
|
|
net.WeightedSum([param, ONE, grad, coeff], param)
|
|
|
|
def scale_learning_rate(self, scale):
|
|
self.base_learning_rate *= scale
|
|
return
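    # Minimal usage sketch (blob names are hypothetical; `model` is assumed
    # to be a ModelHelper-like object exposing net and param_init_net):
    #
    #   sgd = SgdOptimizer(base_learning_rate=0.1, policy="fixed",
    #                      momentum=0.9, nesterov=True)
    #   for p, g in [("fc_w", "fc_w_grad"), ("fc_b", "fc_b_grad")]:
    #       sgd(model.net, model.param_init_net, p, grad=g)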
|
|
|
|
|
|
class MultiPrecisionSgdOptimizer(SgdOptimizer):
|
|
def __init__(
|
|
self,
|
|
base_learning_rate=0.1,
|
|
momentum=0.0,
|
|
policy="fixed",
|
|
nesterov=True,
|
|
sparse_dedup_aggregator=None,
|
|
**kwargs
|
|
):
|
|
super(MultiPrecisionSgdOptimizer, self).__init__(
|
|
base_learning_rate=base_learning_rate,
|
|
policy=policy,
|
|
momentum=momentum,
|
|
nesterov=nesterov,
|
|
sparse_dedup_aggregator=sparse_dedup_aggregator,
|
|
**kwargs
|
|
)
|
|
|
|
def _run(self, net, param_init_net, param_info):
|
|
param = param_info.blob
|
|
param_fp32 = (
|
|
param_info.blob_copy[core.DataType.FLOAT]
|
|
if param_info.blob_copy is not None
|
|
else None
|
|
)
|
|
|
|
# If we have a straight fp32 parameter, run the base class
|
|
if param_fp32 is None:
|
|
return SgdOptimizer._run(self, net, param_init_net, param_info)
|
|
|
|
grad = param_info.grad
|
|
if self.base_learning_rate == 0:
|
|
return
|
|
assert (
|
|
self.base_learning_rate > 0
|
|
), "Expect positive base learning rate, got {}".format(self.base_learning_rate)
|
|
|
|
lr, _ = self.build_lr(
|
|
net,
|
|
param_init_net,
|
|
base_learning_rate=-self.base_learning_rate,
|
|
policy=self.policy,
|
|
**(self.init_kwargs)
|
|
)
|
|
|
|
momentum_data = param_init_net.ConstantFill(
|
|
param_fp32, str(param) + "_momentum", value=0.0
|
|
)
|
|
self._aux_params.local.append(momentum_data)
|
|
|
|
assert not isinstance(
|
|
grad, core.GradientSlice
|
|
), "MultiPrecisionSgd does not support sparse gradients"
|
|
|
|
# Copy gradient to fp32
|
|
grad_fp32 = net.HalfToFloat(grad, grad + "_fp32")
|
|
|
|
# update (fused) in fp32
|
|
net.MomentumSGDUpdate(
|
|
[grad_fp32, momentum_data, lr, param_fp32],
|
|
[grad_fp32, momentum_data, param_fp32],
|
|
momentum=self.momentum,
|
|
nesterov=self.nesterov,
|
|
)
|
|
|
|
# Copy updated param back to fp16
|
|
net.FloatToHalf(param_fp32, param)
|
|
|
|
|
|
class FP16SgdOptimizer(SgdOptimizer):
|
|
def __init__(
|
|
self,
|
|
base_learning_rate=0.1,
|
|
momentum=0.0,
|
|
policy="fixed",
|
|
nesterov=True,
|
|
weight_decay=0.0001,
|
|
sparse_dedup_aggregator=None,
|
|
**kwargs
|
|
):
|
|
super(FP16SgdOptimizer, self).__init__(
|
|
base_learning_rate=base_learning_rate,
|
|
policy=policy,
|
|
momentum=momentum,
|
|
nesterov=nesterov,
|
|
sparse_dedup_aggregator=sparse_dedup_aggregator,
|
|
**kwargs
|
|
)
|
|
self.weight_decay = weight_decay
|
|
|
|
def _run(self, net, param_init_net, param_info, fp32_update=False):
|
|
|
|
fp32_update_flag = 0
|
|
param_name = str(param_info.blob)
|
|
|
|
# should only be triggered in FP16 training by SpatialBN, which
|
|
# requires FP32 params in CuDNN.
|
|
if param_name.find("spatbn") != -1:
|
|
fp32_update = True
|
|
|
|
if fp32_update:
|
|
# doing a 32bit update
|
|
# Have to assume param_info.blob is FP32 as there is no way
|
|
            # (that I currently know of) to query a blob's type in Python
|
|
fp32_update_flag = 1
|
|
param = param_info.blob
|
|
param_fp32 = param_info.blob
|
|
else:
|
|
if param_info.blob_copy is None:
|
|
# doing a 32bit update
|
|
# Have to assume param_info.blob is FP32 as there is no way
|
|
                # (that I currently know of) to query a blob's type in Python
|
|
fp32_update_flag = 1
|
|
param = param_info.blob
|
|
param_fp32 = param_info.blob
|
|
else:
|
|
if core.DataType.FLOAT in param_info.blob_copy:
|
|
param = param_info.blob
|
|
param_fp32 = param_info.blob_copy[core.DataType.FLOAT]
|
|
elif core.DataType.FLOAT16 in param_info.blob_copy:
|
|
param = param_info.blob_copy[core.DataType.FLOAT16]
|
|
param_fp32 = param_info.blob
|
|
else:
|
|
                    raise AssertionError(
|
|
"Unrecognized parameter format to be updated "
|
|
"by FP16 Optimizer. Parameter: {}".format(param_info.name)
|
|
)
|
|
|
|
grad = param_info.grad
|
|
|
|
if self.base_learning_rate == 0:
|
|
return
|
|
assert (
|
|
self.base_learning_rate > 0
|
|
), "Expect positive base learning rate, got {}".format(self.base_learning_rate)
|
|
|
|
lr, _ = self.build_lr(
|
|
net,
|
|
param_init_net,
|
|
base_learning_rate=-self.base_learning_rate,
|
|
policy=self.policy,
|
|
**(self.init_kwargs)
|
|
)
|
|
|
|
momentum_data_fp32 = param_init_net.ConstantFill(
|
|
param_fp32, str(param) + "_momentum_fp32", value=0.0
|
|
)
|
|
|
|
momentum_data = param_init_net.FloatToHalf(
|
|
momentum_data_fp32, str(param) + "_momentum"
|
|
)
|
|
|
|
self._aux_params.local.append(momentum_data)
|
|
|
|
assert not isinstance(
|
|
grad, core.GradientSlice
|
|
), "FP16Sgd does not support sparse gradients"
|
|
|
|
if fp32_update_flag == 0:
|
|
net.FP16MomentumSGDUpdate(
|
|
[grad, momentum_data, lr, param],
|
|
[grad, momentum_data, param],
|
|
momentum=self.momentum,
|
|
nesterov=self.nesterov,
|
|
weight_decay=self.weight_decay,
|
|
)
|
|
else:
|
|
# flag set to 1, therefore doing FP32 update
|
|
net.FP32MomentumSGDUpdate(
|
|
[grad, momentum_data_fp32, lr, param],
|
|
[grad, momentum_data_fp32, param],
|
|
momentum=self.momentum,
|
|
nesterov=self.nesterov,
|
|
weight_decay=self.weight_decay,
|
|
)
|
|
|
|
|
|
class WeightDecayBuilder(Optimizer):
|
|
def __init__(self, weight_decay):
|
|
self.weight_decay = weight_decay
|
|
|
|
def _run(self, net, param_init_net, param_info):
|
|
dev = scope.CurrentDeviceScope()
|
|
if dev is None:
|
|
dev = core.DeviceOption(caffe2_pb2.CPU)
|
|
|
|
ONE = param_init_net.ConstantFill(
|
|
[], "ONE_{}_{}".format(dev.device_type, dev.device_id), shape=[1], value=1.0
|
|
)
|
|
WD = param_init_net.ConstantFill(
|
|
[],
|
|
"wd_{}_{}".format(dev.device_type, dev.device_id),
|
|
shape=[1],
|
|
value=self.weight_decay,
|
|
)
|
|
|
|
if isinstance(param_info.grad, core.GradientSlice):
|
|
raise ValueError("Weight decay does not yet support sparse gradients")
|
|
else:
|
|
net.WeightedSum(
|
|
[param_info.grad, ONE, param_info.blob, WD], param_info.grad
|
|
)
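            # The WeightedSum above folds plain L2 regularization into the
            # gradient: grad <- 1.0 * grad + weight_decay * param, so the
            # decay term is applied before the actual optimizer update runs.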
|
|
|
|
|
|
class AdagradOptimizer(Optimizer):
|
|
def __init__(
|
|
self,
|
|
alpha=0.01,
|
|
epsilon=1e-4,
|
|
decay=1,
|
|
weight_decay=0.0,
|
|
policy="fixed",
|
|
sparse_dedup_aggregator=None,
|
|
rowWise=False,
|
|
engine="",
|
|
lars=None,
|
|
output_effective_lr=False,
|
|
output_effective_lr_and_update=False,
|
|
pruning_options=None,
|
|
swa_options=None,
|
|
ema_options=None,
|
|
weight_scale=None,
|
|
counter_halflife=-1,
|
|
**kwargs
|
|
):
|
|
for k, v in locals().items():
|
|
logger.info("AdagradOptimizer: input arguments: {}: {}".format(k, v))
|
|
|
|
super(AdagradOptimizer, self).__init__()
|
|
self.alpha = alpha
|
|
self.epsilon = epsilon
|
|
self.decay = decay
|
|
self.weight_decay = float(weight_decay)
|
|
self.policy = policy
|
|
self.sparse_dedup_aggregator = sparse_dedup_aggregator
|
|
self.rowWise = rowWise
|
|
self.engine = engine
|
|
self.lars = lars
|
|
self.output_effective_lr = output_effective_lr
|
|
self.output_effective_lr_and_update = output_effective_lr_and_update
|
|
self.counter_halflife = counter_halflife
|
|
self.init_kwargs = kwargs
|
|
self.weight_scale = weight_scale
|
|
|
|
self._process_pruning_options(pruning_options)
|
|
self._process_swa_options(swa_options)
|
|
self._process_ema_options(ema_options)
|
|
|
|
def _process_swa_options(self, swa_options):
|
|
self.swa_enabled = True if swa_options else False
|
|
if self.swa_enabled:
|
|
self.swa_avg_start_it = swa_options.get("swa_avg_start_it", None)
|
|
self.swa_avg_end_it = swa_options.get("swa_avg_end_it", None)
|
|
self.swa_feedback_start_it = swa_options.get("swa_feedback_start_it", None)
|
|
self.swa_feedback_step = swa_options.get("swa_feedback_step", None)
|
|
self.swa_feedback_end_it = swa_options.get("swa_feedback_end_it", None)
|
|
|
|
def _process_ema_options(self, ema_options):
|
|
self.ema_enabled = True if ema_options else False
|
|
if self.ema_enabled:
|
|
self.ema_start = ema_options.get("ema_start", None)
|
|
self.ema_end = ema_options.get("ema_end", None)
|
|
self.ema_step = ema_options.get("ema_step", None)
|
|
self.ema_alpha = ema_options.get("ema_alpha", None)
|
|
|
|
def _process_pruning_options(self, pruning_options):
|
|
self.use_mask = False
|
|
|
|
if pruning_options is None:
|
|
pruning_options = {}
|
|
else:
|
|
assert isinstance(pruning_options, dict), (
|
|
"pruning_options can only "
|
|
"be provided as a dictionary, currently: {}".format(pruning_options)
|
|
)
|
|
|
|
self.mask_tensor = pruning_options.get("mask_tensor", None)
|
|
self.mask_db_path = pruning_options.get("mask_db_path", None)
|
|
self.mask_db_type = pruning_options.get("mask_db_type", None)
|
|
self.mask_blob_name = pruning_options.get("mask_blob_name", None)
|
|
self.prune_delays = pruning_options.get("prune_delays", [])
|
|
self.prune_ratios = pruning_options.get("prune_ratios", [])
|
|
self.prune_block_size = pruning_options.get("prune_block_size", 1)
|
|
|
|
if self.mask_tensor is not None:
|
|
assert (
|
|
type(self.mask_tensor) is np.ndarray
|
|
), "mask_tensor must be a numpy array!"
|
|
assert self.mask_db_path is None, (
|
|
"mask can be provided through either a numpy array "
|
|
"or a db path, not both"
|
|
)
|
|
assert self.mask_db_type is None, (
|
|
"mask can be provided through either a numpy array "
|
|
"or a db path, not both"
|
|
)
|
|
assert self.mask_blob_name is None, (
|
|
"mask can be provided through either a numpy array "
|
|
"or a db path, not both"
|
|
)
|
|
self.use_mask = True
|
|
|
|
if self.mask_db_path is not None or self.mask_db_type is not None:
|
|
assert self.mask_db_path is not None, (
|
|
"when mask is provided through db, "
|
|
"db path, db type, and blob name are all needed"
|
|
)
|
|
assert self.mask_db_type is not None, (
|
|
"when mask is provided through db, "
|
|
"db path, db type, and blob name are all needed"
|
|
)
|
|
assert self.mask_tensor is None, (
|
|
"mask can be provided through either a numpy array "
|
|
"or a db path, not both"
|
|
)
|
|
self.use_mask = True
|
|
|
|
if self.prune_delays:
|
|
assert self.prune_ratios is not None and len(self.prune_delays) == len(
|
|
self.prune_ratios
|
|
), "Prune Delays and prune ratios should be of the same length"
|
|
assert (
|
|
self.mask_tensor is None
|
|
), "Mask Tensor should be None with prune ratios"
|
|
assert (
|
|
self.mask_db_path is None
|
|
), "Mask DB Path should be None with prune ratios"
|
|
self.use_mask = True
|
|
|
|
def _run(self, net, param_init_net, param_info):
|
|
param = param_info.blob
|
|
grad = param_info.grad
|
|
|
|
if self.alpha <= 0:
|
|
return
|
|
|
|
self._clear_local_lr_multiplier()
|
|
|
|
if self.lars is not None and not isinstance(grad, core.GradientSlice):
|
|
assert (
|
|
self.weight_decay == 0
|
|
), "weight decay is not implemented for LARS yet"
|
|
assert self.lars >= 0, "Lars offset must be nonnegative, got {}".format(
|
|
self.lars
|
|
)
|
|
wd, trust, lr_max = self.create_lars_inputs(
|
|
param_init_net, 0.0, 1.0, np.finfo(np.float32).max
|
|
)
|
|
lr_lars_multiplier = net.Lars(
|
|
[param, grad, wd, trust, lr_max],
|
|
self.make_unique_blob_name(str(param) + "_lars"),
|
|
offset=self.lars,
|
|
lr_min=0.0,
|
|
)
|
|
|
|
current_scope = scope.CurrentDeviceScope()
|
|
self._add_local_lr_multiplier(
|
|
lr_lars_multiplier,
|
|
is_gpu_blob=(
|
|
current_scope is not None
|
|
and core.IsGPUDeviceType(current_scope.device_type)
|
|
),
|
|
)
|
|
|
|
lr, lr_iteration = self.build_lr(
|
|
net,
|
|
param_init_net,
|
|
base_learning_rate=self.alpha,
|
|
policy=self.policy,
|
|
**(self.init_kwargs)
|
|
)
|
|
iteration = lr_iteration
|
|
if self.counter_halflife > 0:
|
|
self._aux_params.shared.append(iteration)
|
|
|
|
if self.rowWise:
|
|
logger.info(
|
|
"Using engine {} for rowWise Adagrad to train param {}".format(
|
|
self.engine, param
|
|
)
|
|
)
|
|
|
|
shapes, types = workspace.InferShapesAndTypes([param_init_net])
|
|
if str(param) not in shapes:
|
|
                # Type/shape inference is not available for this param; fall
                # back on Shape/Slice logic
|
|
shape = param_init_net.Shape(param, str(param) + "_shape")
|
|
num_rows = param_init_net.Slice(
|
|
[shape], str(shape) + "_numrows", starts=[0], ends=[1]
|
|
)
|
|
param_squared_sum = param_init_net.ConstantFill(
|
|
num_rows,
|
|
str(param) + "_avg_squared_sum",
|
|
input_as_shape=1,
|
|
value=0.0,
|
|
)
|
|
else:
|
|
param_squared_sum = param_init_net.ConstantFill(
|
|
[],
|
|
str(param) + "_avg_squared_sum",
|
|
shape=[shapes[str(param)][0]],
|
|
value=0.0,
|
|
)
|
|
else:
|
|
logger.info(
|
|
"Using engine {} for regular Adagrad to train param {}".format(
|
|
self.engine, param
|
|
)
|
|
)
|
|
|
|
if self.engine in FP16_ENGINES:
|
|
assert (
|
|
self.weight_decay == 0
|
|
), "weight decay is not tested for engine: {}".format(self.engine)
|
|
|
|
shapes, types = workspace.InferShapesAndTypes([param_init_net])
|
|
assert str(param) in shapes, shapes
|
|
shape = shapes[str(param)]
|
|
|
|
param_squared_sum = param_init_net.Float16ConstantFill(
|
|
[], str(param) + "_squared_sum", value=0.0, shape=shape
|
|
)
|
|
else:
|
|
param_squared_sum = param_init_net.ConstantFill(
|
|
[param], str(param) + "_squared_sum", value=0.0
|
|
)
|
|
|
|
if self.use_mask is True:
|
|
assert (
|
|
self.weight_decay == 0
|
|
), "weight decay is not implemented for use_mask yet"
|
|
|
|
if self.mask_tensor is not None:
|
|
if not isinstance(grad, core.GradientSlice):
|
|
mask_blob = param_init_net.GivenTensorFill(
|
|
[],
|
|
[str(param) + "_mask"],
|
|
values=self.mask_tensor,
|
|
shape=self.mask_tensor.shape,
|
|
)
|
|
else:
|
|
self.mask_tensor = self.mask_tensor.astype(np.uint8)
|
|
mask_blob = param_init_net.GivenTensorBoolFill(
|
|
[],
|
|
[str(param) + "_mask"],
|
|
values=self.mask_tensor,
|
|
shape=self.mask_tensor.shape,
|
|
)
|
|
mask_blob = param_init_net.Cast(mask_blob, to=core.DataType.UINT8)
|
|
mask_changed_blob = param_init_net.ConstantFill(
|
|
[],
|
|
[str(param) + "_mask_changed_blob"],
|
|
value=False,
|
|
dtype=core.DataType.BOOL,
|
|
shape=[1],
|
|
)
|
|
elif (
|
|
self.mask_db_path is not None or self.mask_db_type is not None
|
|
): # mask is provided through a db file
|
|
# if mask_blob_name is not given use the param name to derive mask name
|
|
self.mask_blob_name = self.mask_blob_name or str(param) + "_mask"
|
|
|
|
mask_blob = param_init_net.Load(
|
|
[],
|
|
self.mask_blob_name,
|
|
db=self.mask_db_path,
|
|
db_type=self.mask_db_type,
|
|
absolute_path=True,
|
|
)
|
|
|
|
if isinstance(grad, core.GradientSlice):
|
|
mask_changed_blob = param_init_net.ConstantFill(
|
|
[],
|
|
[str(param) + "_mask_changed_blob"],
|
|
value=False,
|
|
dtype=core.DataType.BOOL,
|
|
shape=[1],
|
|
)
|
|
elif self.prune_delays:
|
|
last_mask_updated_iter = param_init_net.ConstantFill(
|
|
[],
|
|
[str(param) + "_last_mask_updated_iter"],
|
|
value=-1,
|
|
dtype=core.DataType.INT64,
|
|
shape=[1],
|
|
)
|
|
|
|
if isinstance(grad, core.GradientSlice):
|
|
                    raise AssertionError(
                        "Prune Delays and Prune Ratios are currently not "
                        "supported for sparse operators"
|
|
)
|
|
else:
|
|
mask_blob = param_init_net.GivenTensorFill(
|
|
[],
|
|
[str(param) + "_empty_mask"],
|
|
values=[],
|
|
dtype=core.DataType.FLOAT,
|
|
shape=[0],
|
|
)
|
|
else:
|
|
raise NotImplementedError(
|
|
"If mask is used, it needs a numpy array or a db file or"
|
|
"a delay iter needs to be provided"
|
|
)
|
|
|
|
self._aux_params.local.append(param_squared_sum)
|
|
if self.counter_halflife > 0:
|
|
shapes, types = workspace.InferShapesAndTypes([param_init_net])
|
|
if str(param) not in shapes:
|
|
shape = param_init_net.Shape(param, str(param) + "_shape")
|
|
num_rows = param_init_net.Slice(
|
|
[shape], str(shape) + "_numrows", starts=[0], ends=[1]
|
|
)
|
|
update_counter = param_init_net.ConstantFill(
|
|
num_rows,
|
|
str(param) + "_update_counter",
|
|
input_as_shape=1,
|
|
value=0.0,
|
|
dtype=core.DataType.DOUBLE,
|
|
)
|
|
prev_update_iter = param_init_net.ConstantFill(
|
|
num_rows,
|
|
str(param) + "_prev_update_iter",
|
|
input_as_shape=1,
|
|
value=0,
|
|
dtype=core.DataType.INT64,
|
|
)
|
|
else:
|
|
update_counter = param_init_net.ConstantFill(
|
|
[],
|
|
str(param) + "_update_counter",
|
|
shape=[shapes[str(param)][0]],
|
|
value=0.0,
|
|
dtype=core.DataType.DOUBLE,
|
|
)
|
|
prev_update_iter = param_init_net.ConstantFill(
|
|
[],
|
|
str(param) + "_prev_update_iter",
|
|
shape=[shapes[str(param)][0]],
|
|
value=0,
|
|
dtype=core.DataType.INT64,
|
|
)
|
|
self._aux_params.local.append(update_counter)
|
|
self._aux_params.local.append(prev_update_iter)
|
|
|
|
if self.rowWise:
|
|
assert isinstance(grad, core.GradientSlice), (
|
|
"If SparseAdagrad with rowWise=True, gradient must be "
|
|
"a gradientslice. PLease ensure that rowWise is not enabled "
|
|
"for the dense Adagrad optimizer, as it is not supported."
|
|
)
|
|
|
|
shapes, _ = workspace.InferShapesAndTypes([param_init_net])
|
|
param_shape = shapes[str(param)]
|
|
weight_decay = 0.0
|
|
if isinstance(grad, core.GradientSlice):
|
|
if len(param_shape) == 1:
|
|
weight_decay = 0.0
|
|
                logger.warning(
|
|
"SKIPPING weight decay on 1d sparse param: {}.shape is {}".format(
|
|
str(param), param_shape
|
|
)
|
|
)
|
|
else:
|
|
weight_decay = self.weight_decay
|
|
else:
|
|
# Skip weight decay for 1d parameters
|
|
if len(param_shape) == 1:
|
|
weight_decay = 0.0
|
|
                logger.warning(
|
|
"SKIPPING weight decay on 1d dense param: {}.shape is {}".format(
|
|
str(param), param_shape
|
|
)
|
|
)
|
|
else:
|
|
weight_decay = self.weight_decay
|
|
logger.info(
|
|
"weight_decay for {} (shape:{}): {}".format(
|
|
str(param), param_shape, weight_decay
|
|
)
|
|
)
|
|
|
|
if isinstance(grad, core.GradientSlice):
|
|
assert (
|
|
self.decay == 1.0
|
|
), "Decay is not implemented for SparseAdagrad and must be set to 1"
|
|
grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
|
|
|
|
input_args = [param, param_squared_sum, grad.indices, grad.values, lr]
|
|
output_args = [param, param_squared_sum]
|
|
if self.rowWise:
|
|
if self.use_mask is True:
|
|
op = "MaskedRowWiseSparseAdagrad"
|
|
assert (
|
|
weight_decay == 0
|
|
), "weight decay is not implemented for {} yet".format(op)
|
|
input_args += [mask_blob, mask_changed_blob]
|
|
else:
|
|
if self.counter_halflife > 0:
|
|
input_args += [update_counter]
|
|
op = "RowWiseSparseAdagrad"
|
|
else:
|
|
if self.use_mask is True:
|
|
op = "MaskedSparseAdagrad"
|
|
assert (
|
|
weight_decay == 0
|
|
), "weight decay is not implemented for {} yet".format(op)
|
|
input_args += [mask_blob, mask_changed_blob]
|
|
else:
|
|
op = "SparseAdagrad"
|
|
logger.info("using {} for {}".format(op, str(param)))
|
|
|
|
if self.prune_delays:
|
|
input_args += [lr_iteration, last_mask_updated_iter]
|
|
output_args += [mask_blob, last_mask_updated_iter]
|
|
|
|
if weight_decay > 0 and self.counter_halflife == -1:
|
|
net.__getattr__(op)(
|
|
input_args,
|
|
output_args,
|
|
epsilon=self.epsilon,
|
|
weight_decay=weight_decay,
|
|
engine=self.engine,
|
|
)
|
|
elif weight_decay > 0 and self.counter_halflife != -1:
|
|
net.__getattr__(op)(
|
|
input_args,
|
|
output_args,
|
|
epsilon=self.epsilon,
|
|
weight_decay=weight_decay,
|
|
engine=self.engine,
|
|
counter_halflife=self.counter_halflife,
|
|
)
|
|
else:
|
|
net.__getattr__(op)(
|
|
input_args, output_args, epsilon=self.epsilon, engine=self.engine
|
|
)
|
|
if self.counter_halflife > 0:
|
|
net.RowWiseCounter(
|
|
[prev_update_iter, update_counter, grad.indices, iteration],
|
|
[prev_update_iter, update_counter],
|
|
counter_halflife=self.counter_halflife,
|
|
)
|
|
else:
|
|
input_args = [param, param_squared_sum, grad, lr]
|
|
output_args = [param, param_squared_sum]
|
|
|
|
if self.output_effective_lr_and_update:
|
|
assert (
|
|
self.use_mask is False
|
|
), "MaskedAdagrad doesn't support outputting effective_lr_and_update"
|
|
output_args.append(str(param) + "_effective_lr")
|
|
output_args.append(str(param) + "_update")
|
|
elif self.output_effective_lr:
|
|
assert (
|
|
self.use_mask is False
|
|
), "MaskedAdagrad doesn't support outputting effective_lr"
|
|
output_args.append(str(param) + "_effective_lr")
|
|
|
|
if self.use_mask is True:
|
|
input_args += [mask_blob]
|
|
|
|
if self.prune_delays:
|
|
input_args += [lr_iteration, last_mask_updated_iter]
|
|
output_args += [mask_blob, last_mask_updated_iter]
|
|
|
|
if self.use_mask:
|
|
assert (
|
|
weight_decay == 0
|
|
), "weight decay is not implemented for use_mask yet"
|
|
net.MaskedAdagrad(
|
|
input_args,
|
|
output_args,
|
|
epsilon=self.epsilon,
|
|
decay=float(self.decay),
|
|
block_size=self.prune_block_size,
|
|
delays=self.prune_delays,
|
|
prune_ratios=self.prune_ratios,
|
|
engine=self.engine,
|
|
)
|
|
else:
|
|
if weight_decay > 0:
|
|
net.Adagrad(
|
|
input_args,
|
|
output_args,
|
|
epsilon=self.epsilon,
|
|
decay=float(self.decay),
|
|
weight_decay=weight_decay,
|
|
engine=self.engine,
|
|
)
|
|
else:
|
|
net.Adagrad(
|
|
input_args,
|
|
output_args,
|
|
epsilon=self.epsilon,
|
|
decay=float(self.decay),
|
|
engine=self.engine,
|
|
)
|
|
|
|
if self.swa_enabled:
|
|
param_swa = str(param) + "_swa"
|
|
if not param_init_net.BlobIsDefined(param_swa):
|
|
param_init_net.ConstantFill([param], param_swa, value=0.0)
|
|
self._aux_params.local.append(param_swa)
|
|
|
|
net.SWA(
|
|
[param, param_swa, lr_iteration],
|
|
[param, param_swa],
|
|
avg_start=self.swa_avg_start_it,
|
|
avg_end=self.swa_avg_end_it,
|
|
feedback_start=self.swa_feedback_start_it,
|
|
feedback_step=self.swa_feedback_step,
|
|
feedback_end=self.swa_feedback_end_it,
|
|
)
|
|
|
|
if self.ema_enabled:
|
|
param_ema = str(param) + "_ema"
|
|
if not param_init_net.BlobIsDefined(param_ema):
|
|
param_init_net.ConstantFill([param], param_ema, value=0.0)
|
|
self._aux_params.local.append(param_ema)
|
|
|
|
net.EMA(
|
|
[param, param_ema, lr_iteration],
|
|
[param, param_ema],
|
|
ema_start=self.ema_start,
|
|
ema_end=self.ema_end,
|
|
ema_step=self.ema_step,
|
|
ema_alpha=self.ema_alpha,
|
|
)
|
|
|
|
if self.weight_scale:
|
|
net.WeightScale(
|
|
[param, lr_iteration],
|
|
[param],
|
|
stepsize=self.weight_scale.stepsize,
|
|
upper_bound_iter=self.weight_scale.upper_bound_iter,
|
|
scale=float(self.weight_scale.scale),
|
|
)
|
|
if self.weight_scale.to_aux:
|
|
net.WeightScale(
|
|
[param_squared_sum, lr_iteration],
|
|
[param_squared_sum],
|
|
stepsize=self.weight_scale.stepsize,
|
|
upper_bound_iter=self.weight_scale.upper_bound_iter,
|
|
scale=float(self.weight_scale.scale),
|
|
)
|
|
|
|
def scale_learning_rate(self, scale):
|
|
self.alpha *= scale
|
|
return
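    # Illustrative configuration sketch; the values and the db path are made
    # up, and the dict keys mirror the ones read by _process_pruning_options
    # and _process_ema_options above:
    #
    #   adagrad = AdagradOptimizer(
    #       alpha=0.01,
    #       epsilon=1e-4,
    #       rowWise=True,
    #       pruning_options={
    #           "mask_db_path": "/path/to/mask_db",
    #           "mask_db_type": "minidb",
    #           "mask_blob_name": "fc_w_mask",
    #       },
    #       ema_options={"ema_start": 10000, "ema_end": 20000,
    #                    "ema_step": 100, "ema_alpha": 0.999},
    #   )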
|
|
|
|
|
|
class WngradOptimizer(Optimizer):
|
|
def __init__(
|
|
self,
|
|
alpha=1.0,
|
|
epsilon=1e-9,
|
|
policy="fixed",
|
|
sparse_dedup_aggregator=None,
|
|
engine="",
|
|
moment_init=100.0,
|
|
lars=None,
|
|
output_effective_lr=False,
|
|
output_effective_lr_and_update=False,
|
|
**kwargs
|
|
):
|
|
super(WngradOptimizer, self).__init__()
|
|
self.alpha = alpha
|
|
self.epsilon = epsilon
|
|
self.policy = policy
|
|
self.sparse_dedup_aggregator = sparse_dedup_aggregator
|
|
self.engine = engine
|
|
self.moment_init = moment_init
|
|
self.lars = lars
|
|
self.output_effective_lr = output_effective_lr
|
|
self.output_effective_lr_and_update = output_effective_lr_and_update
|
|
self.init_kwargs = kwargs
|
|
|
|
def _run(self, net, param_init_net, param_info):
|
|
param = param_info.blob
|
|
grad = param_info.grad
|
|
|
|
if self.alpha <= 0:
|
|
return
|
|
|
|
self._clear_local_lr_multiplier()
|
|
|
|
if self.lars is not None and not isinstance(grad, core.GradientSlice):
|
|
assert self.lars >= 0, "Lars offset must be nonnegative, got {}".format(
|
|
self.lars
|
|
)
|
|
wd, trust, lr_max = self.create_lars_inputs(
|
|
param_init_net, 0.0, 1.0, np.finfo(np.float32).max
|
|
)
|
|
lr_lars_multiplier = net.Lars(
|
|
[param, grad, wd, trust, lr_max],
|
|
self.make_unique_blob_name(str(param) + "_lars"),
|
|
offset=self.lars,
|
|
lr_min=0.0,
|
|
)
|
|
current_scope = scope.CurrentDeviceScope()
|
|
self._add_local_lr_multiplier(
|
|
lr_lars_multiplier,
|
|
is_gpu_blob=(
|
|
current_scope is not None
|
|
and core.IsGPUDeviceType(current_scope.device_type)
|
|
),
|
|
)
|
|
|
|
lr, _ = self.build_lr(
|
|
net,
|
|
param_init_net,
|
|
base_learning_rate=self.alpha,
|
|
policy=self.policy,
|
|
**(self.init_kwargs)
|
|
)
|
|
|
|
moment = param_init_net.ConstantFill(
|
|
[], str(param) + "_moment", shape=[1], value=self.moment_init
|
|
)
|
|
|
|
self._aux_params.local.append(moment)
|
|
|
|
if isinstance(grad, core.GradientSlice):
|
|
grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
|
|
net.SparseWngrad(
|
|
[param, moment, grad.indices, grad.values, lr],
|
|
[param, moment],
|
|
epsilon=self.epsilon,
|
|
engine=self.engine,
|
|
)
|
|
else:
|
|
output_args = [param, moment]
|
|
if self.output_effective_lr_and_update:
|
|
output_args.append(str(param) + "_effective_lr")
|
|
output_args.append(str(param) + "_update")
|
|
elif self.output_effective_lr:
|
|
output_args.append(str(param) + "_effective_lr")
|
|
|
|
net.Wngrad(
|
|
[param, moment, grad, lr],
|
|
output_args,
|
|
epsilon=self.epsilon,
|
|
engine=self.engine,
|
|
)
|
|
|
|
def scale_learning_rate(self, scale):
|
|
self.alpha *= scale
|
|
return
|
|
|
|
|
|
class StormOptimizer(Optimizer):
|
|
def __init__(
|
|
self,
|
|
lr=0.1,
|
|
momentum=10.0,
|
|
beta=0.1,
|
|
grad_sq_init=0.01,
|
|
policy="fixed",
|
|
sparse_dedup_aggregator=None,
|
|
lars=None,
|
|
**kwargs
|
|
):
|
|
"""Constructor function to add STORM Optimizer
|
|
|
|
Args:
|
|
lr: learning rate scaling (called k in the original paper)
|
|
momentum: momentum scaling (called c in the original paper)
|
|
            beta: initial value of the denominator in the adaptive learning
                rate (called w in the original paper)
|
|
grad_sq_init: initial value of gradient squared accumulator.
|
|
policy: specifies how learning rate should be applied, options are
|
|
'fixed', 'step', 'exp', etc.
|
|
sparse_dedup_aggregator: specifies deduplication strategy for
|
|
gradient slices. Works while using sparse gradients. Options
|
|
include 'mean' and 'sum'.
|
|
lars: lars offset.
|
|
"""
|
|
super(StormOptimizer, self).__init__()
|
|
self.lr = lr
|
|
self.momentum = momentum
|
|
self.beta = beta
|
|
self.grad_sq_init = grad_sq_init
|
|
self.policy = policy
|
|
self.sparse_dedup_aggregator = sparse_dedup_aggregator
|
|
self.lars = lars
|
|
self.init_kwargs = kwargs
|
|
|
|
def _run(self, net, param_init_net, param_info):
|
|
param = param_info.blob
|
|
grad = param_info.grad
|
|
|
|
if self.lr <= 0:
|
|
return
|
|
|
|
self._clear_local_lr_multiplier()
|
|
|
|
if self.lars is not None and not isinstance(grad, core.GradientSlice):
|
|
assert self.lars >= 0, "Lars offset must be nonnegative, got {}".format(
|
|
self.lars
|
|
)
|
|
wd, trust, lr_max = self.create_lars_inputs(
|
|
param_init_net, 0.0, 1.0, np.finfo(np.float32).max
|
|
)
|
|
lr_lars_multiplier = net.Lars(
|
|
[param, grad, wd, trust, lr_max],
|
|
self.make_unique_blob_name(str(param) + "_lars"),
|
|
offset=self.lars,
|
|
lr_min=0.0,
|
|
)
|
|
current_scope = scope.CurrentDeviceScope()
|
|
self._add_local_lr_multiplier(
|
|
lr_lars_multiplier,
|
|
is_gpu_blob=(
|
|
current_scope is not None
|
|
and core.IsGPUDeviceType(current_scope.device_type)
|
|
),
|
|
)
|
|
|
|
lr, _ = self.build_lr(
|
|
net,
|
|
param_init_net,
|
|
base_learning_rate=self.lr,
|
|
policy=self.policy,
|
|
**(self.init_kwargs)
|
|
)
|
|
|
|
moment = param_init_net.ConstantFill(param, str(param) + "_moment", value=0.0)
|
|
self._aux_params.local.append(moment)
|
|
|
|
grad_sq_sum = param_init_net.ConstantFill(
|
|
[], str(param) + "_grad_sq_sum", shape=[1], value=self.grad_sq_init
|
|
)
|
|
self._aux_params.local.append(grad_sq_sum)
|
|
|
|
if isinstance(grad, core.GradientSlice):
|
|
grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
|
|
net.SparseStorm(
|
|
[param, moment, grad_sq_sum, grad.values, grad.indices, lr],
|
|
[param, moment, grad_sq_sum],
|
|
momentum=self.momentum,
|
|
beta=self.beta,
|
|
)
|
|
else:
|
|
net.Storm(
|
|
[param, moment, grad_sq_sum, grad, lr],
|
|
[param, moment, grad_sq_sum],
|
|
momentum=self.momentum,
|
|
beta=self.beta,
|
|
)
|
|
|
|
def scale_learning_rate(self, scale):
|
|
self.lr *= scale
|
|
|
|
|
|
class AdadeltaOptimizer(Optimizer):
|
|
def __init__(
|
|
self,
|
|
alpha=0.01,
|
|
epsilon=1e-4,
|
|
decay=0.95,
|
|
policy="fixed",
|
|
sparse_dedup_aggregator=None,
|
|
engine="",
|
|
**kwargs
|
|
):
|
|
"""Constructor function to add Adadelta Optimizer
|
|
|
|
Args:
|
|
alpha: learning rate
|
|
epsilon: attribute of Adadelta to avoid numerical issues
|
|
decay: attribute of Adadelta to decay the squared gradient sum
|
|
policy: specifies how learning rate should be applied, options are
|
|
"fixed", "step", "exp", etc.
|
|
sparse_dedup_aggregator: specifies deduplication strategy for
|
|
gradient slices. Works while using sparse gradients. Options
|
|
include "mean" and "sum".
|
|
engine: the engine used, options include "", "CUDNN", etc.
|
|
"""
|
|
super(AdadeltaOptimizer, self).__init__()
|
|
self.alpha = alpha
|
|
self.epsilon = epsilon
|
|
self.decay = decay
|
|
self.policy = policy
|
|
self.sparse_dedup_aggregator = sparse_dedup_aggregator
|
|
self.engine = engine
|
|
self.init_kwargs = kwargs
|
|
|
|
def _run(self, net, param_init_net, param_info):
|
|
param = param_info.blob
|
|
grad = param_info.grad
|
|
|
|
if self.alpha <= 0:
|
|
return
|
|
|
|
lr, _ = self.build_lr(
|
|
net,
|
|
param_init_net,
|
|
base_learning_rate=self.alpha,
|
|
policy=self.policy,
|
|
**(self.init_kwargs)
|
|
)
|
|
|
|
moment = param_init_net.ConstantFill(
|
|
[param], str(param) + "_squared_moment", value=0.0
|
|
)
|
|
|
|
moment_update = param_init_net.ConstantFill(
|
|
[param], str(param) + "_squared_moment_update", value=0.0
|
|
)
|
|
|
|
self._aux_params.local.append(moment)
|
|
self._aux_params.local.append(moment_update)
|
|
|
|
if isinstance(grad, core.GradientSlice):
|
|
grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
|
|
net.SparseAdadelta(
|
|
[param, moment, moment_update, grad.indices, grad.values, lr],
|
|
[param, moment, moment_update],
|
|
epsilon=self.epsilon,
|
|
decay=self.decay,
|
|
engine=self.engine,
|
|
)
|
|
else:
|
|
net.Adadelta(
|
|
[param, moment, moment_update, grad, lr],
|
|
[param, moment, moment_update],
|
|
epsilon=self.epsilon,
|
|
decay=self.decay,
|
|
engine=self.engine,
|
|
)
|
|
|
|
def scale_learning_rate(self, scale):
|
|
self.alpha *= scale
|
|
return
|
|
|
|
|
|
class FtrlOptimizer(Optimizer):
|
|
def __init__(
|
|
self,
|
|
alpha=0.01,
|
|
beta=1e-4,
|
|
lambda1=0,
|
|
lambda2=0,
|
|
sparse_dedup_aggregator=None,
|
|
engine="",
|
|
):
|
|
super(FtrlOptimizer, self).__init__()
|
|
self.alpha = alpha
|
|
self.beta = beta
|
|
self.lambda1 = lambda1
|
|
self.lambda2 = lambda2
|
|
self.sparse_dedup_aggregator = sparse_dedup_aggregator
|
|
self.engine = engine
|
|
|
|
def _run(self, net, param_init_net, param_info):
|
|
param = param_info.blob
|
|
grad = param_info.grad
|
|
|
|
if self.alpha <= 0:
|
|
return
|
|
|
|
nz = param_init_net.ConstantFill(
|
|
[param], str(param) + "_ftrl_nz", extra_shape=[2], value=0.0
|
|
)
|
|
self._aux_params.local.append(nz)
|
|
if isinstance(grad, core.GradientSlice):
|
|
grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
|
|
net.SparseFtrl(
|
|
[param, nz, grad.indices, grad.values],
|
|
[param, nz],
|
|
engine=self.engine,
|
|
alpha=self.alpha,
|
|
beta=self.beta,
|
|
lambda1=self.lambda1,
|
|
lambda2=self.lambda2,
|
|
)
|
|
else:
|
|
net.Ftrl(
|
|
[param, nz, grad],
|
|
[param, nz],
|
|
engine=self.engine,
|
|
alpha=self.alpha,
|
|
beta=self.beta,
|
|
lambda1=self.lambda1,
|
|
lambda2=self.lambda2,
|
|
)
|
|
|
|
def scale_learning_rate(self, scale):
|
|
self.alpha *= scale
|
|
return
|
|
|
|
|
|
class GFtrlOptimizer(Optimizer):
|
|
"""Group Lasso FTRL Optimizer."""
|
|
|
|
def __init__(
|
|
self,
|
|
alpha=0.01,
|
|
beta=1e-4,
|
|
lambda1=0,
|
|
lambda2=0,
|
|
sparse_dedup_aggregator=None,
|
|
engine="",
|
|
):
|
|
super(GFtrlOptimizer, self).__init__()
|
|
self.alpha = alpha
|
|
self.beta = beta
|
|
self.lambda1 = lambda1
|
|
self.lambda2 = lambda2
|
|
self.sparse_dedup_aggregator = sparse_dedup_aggregator
|
|
self.engine = engine
|
|
|
|
def _run(self, net, param_init_net, param_info):
|
|
param = param_info.blob
|
|
grad = param_info.grad
|
|
|
|
if self.alpha <= 0:
|
|
return
|
|
|
|
nz = param_init_net.ConstantFill(
|
|
[param], str(param) + "_gftrl_nz", extra_shape=[2], value=0.0
|
|
)
|
|
self._aux_params.local.append(nz)
|
|
net.GFtrl(
|
|
[param, nz, grad],
|
|
[param, nz],
|
|
engine=self.engine,
|
|
alpha=self.alpha,
|
|
beta=self.beta,
|
|
lambda1=self.lambda1,
|
|
lambda2=self.lambda2,
|
|
)
|
|
|
|
def scale_learning_rate(self, scale):
|
|
self.alpha *= scale
|
|
return
|
|
|
|
|
|
class AdamOptimizer(Optimizer):
|
|
def __init__(
|
|
self,
|
|
alpha=0.001,
|
|
beta1=0.9,
|
|
beta2=0.999,
|
|
epsilon=1e-8,
|
|
policy="fixed",
|
|
use_lr_adaption=False,
|
|
lr_alpha=0.01,
|
|
normalized_lr_adaption=True,
|
|
sparse_dedup_aggregator=None,
|
|
rowWise=False,
|
|
engine="",
|
|
enableRAdam=False,
|
|
use_smart_decay=False, # See https://fburl.com/2jdiwrhy for context.
|
|
**kwargs
|
|
):
|
|
super(AdamOptimizer, self).__init__()
|
|
self.alpha = alpha
|
|
self.beta1 = beta1
|
|
self.beta2 = beta2
|
|
self.epsilon = epsilon
|
|
self.policy = policy
|
|
self.use_lr_adaption = use_lr_adaption
|
|
self.lr_alpha = lr_alpha
|
|
self.normalized_lr_adaption = normalized_lr_adaption
|
|
self.sparse_dedup_aggregator = sparse_dedup_aggregator
|
|
self.rowWise = rowWise
|
|
self.engine = engine
|
|
self.enableRAdam = enableRAdam
|
|
if use_smart_decay:
|
|
if rowWise:
|
|
raise NotImplementedError(('Smart decay is not implemented for rowWise Adam. '
|
|
'Set rowWise or use_smart_decay to False.'))
|
|
if enableRAdam:
|
|
raise NotImplementedError(('Smart decay is not implemented for RAdam. '
|
|
'Set enableRAdam or use_smart_decay to False.'))
|
|
if use_lr_adaption:
|
|
raise NotImplementedError(('Smart decay is not implemented with lr_adaption. '
|
|
'Set use_lr_adaption or use_smart_decay to False.'))
|
|
|
|
self.use_smart_decay = use_smart_decay
|
|
self.init_kwargs = kwargs
|
|
|
|
def _run(self, net, param_init_net, param_info):
|
|
param = param_info.blob
|
|
grad = param_info.grad
|
|
|
|
if self.alpha <= 0:
|
|
return
|
|
|
|
lr, iteration = self.build_lr(
|
|
net,
|
|
param_init_net,
|
|
base_learning_rate=self.alpha,
|
|
policy=self.policy,
|
|
**(self.init_kwargs)
|
|
)
|
|
|
|
m1 = param_init_net.ConstantFill([param], param + "_first_moment", value=0.0)
|
|
|
|
if self.rowWise:
|
|
shapes, types = workspace.InferShapesAndTypes([param_init_net])
|
|
m2 = param_init_net.ConstantFill(
|
|
[], param + "_avg_second_moment", shape=[shapes[param][0]], value=0.0
|
|
)
|
|
else:
|
|
m2 = param_init_net.ConstantFill(
|
|
[param], param + "_second_moment", value=0.0
|
|
)
|
|
|
|
# Initialize "minibatch in which this parameter was last seen" for smart decay.
|
|
if self.use_smart_decay:
|
|
shapes, _ = workspace.InferShapesAndTypes([param_init_net])
|
|
last_seen = param_init_net.ConstantFill(
|
|
[], param + "_last_seen", shape=[shapes[param][0]], value=0, dtype=core.DataType.INT64
|
|
)
|
|
self._aux_params.local.append(last_seen)
|
|
|
|
self._aux_params.shared.append(iteration)
|
|
self._aux_params.local.append(m1)
|
|
self._aux_params.local.append(m2)
|
|
|
|
if self.rowWise:
|
|
assert isinstance(grad, core.GradientSlice), (
|
|
"If SparseAdam with rowWise=True, gradient must be "
|
|
"a gradientslice. PLease ensure that rowWise is not enabled "
|
|
"for the dense Adam optimizer, as it is not supported."
|
|
)
|
|
|
|
output_blobs = [param, m1, m2]
|
|
|
|
if self.use_smart_decay:
|
|
output_blobs.append(last_seen)
|
|
|
|
if self.use_lr_adaption:
|
|
effective_grad = str(param) + "_effective_grad"
|
|
output_blobs.append(effective_grad)
|
|
|
|
if isinstance(grad, core.GradientSlice):
|
|
grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
|
|
if self.rowWise:
|
|
op = "RowWiseSparseAdam"
|
|
elif self.use_smart_decay:
|
|
op = "SmartDecaySparseAdam"
|
|
else:
|
|
op = "SparseAdam"
|
|
|
|
            # Currently, only SparseAdam supports RAdam; other Adam ops will
            # add support later.
|
|
if op == "SparseAdam":
|
|
net.__getattr__(op)(
|
|
[param, m1, m2, grad.indices, grad.values, lr, iteration],
|
|
output_blobs,
|
|
beta1=self.beta1,
|
|
beta2=self.beta2,
|
|
epsilon=self.epsilon,
|
|
enableRAdam=self.enableRAdam,
|
|
)
|
|
elif op == "SmartDecaySparseAdam":
|
|
net.__getattr__(op)(
|
|
[param, m1, m2, last_seen, grad.indices, grad.values, lr, iteration],
|
|
output_blobs,
|
|
beta1=self.beta1,
|
|
beta2=self.beta2,
|
|
epsilon=self.epsilon,
|
|
)
|
|
else:
|
|
assert (
|
|
not self.enableRAdam
|
|
), "Currently, RowWiseSparseAdam is not supported by RAdam!"
|
|
net.__getattr__(op)(
|
|
[param, m1, m2, grad.indices, grad.values, lr, iteration],
|
|
output_blobs,
|
|
beta1=self.beta1,
|
|
beta2=self.beta2,
|
|
epsilon=self.epsilon,
|
|
)
|
|
|
|
if self.use_lr_adaption:
|
|
net.LearningRateAdaption(
|
|
[lr, grad.values, effective_grad],
|
|
[lr],
|
|
lr_alpha=self.lr_alpha,
|
|
normalized_lr_adaption=self.normalized_lr_adaption,
|
|
)
|
|
|
|
else:
|
|
net.Adam(
|
|
[param, m1, m2, grad, lr, iteration],
|
|
output_blobs,
|
|
beta1=self.beta1,
|
|
beta2=self.beta2,
|
|
epsilon=self.epsilon,
|
|
)
|
|
if self.use_lr_adaption:
|
|
net.LearningRateAdaption(
|
|
[lr, grad, effective_grad],
|
|
[lr],
|
|
lr_alpha=self.lr_alpha,
|
|
normalized_lr_adaption=self.normalized_lr_adaption,
|
|
)
|
|
|
|
def scale_learning_rate(self, scale):
|
|
self.alpha *= scale
|
|
return
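    # Sketch of the "smart decay" catch-up rule implemented by the
    # SmartDecaySparseAdam op used above (written in plain Python for a
    # single row; `iteration` and `last_seen` correspond to the blobs of the
    # same names):
    #
    #   k = iteration - last_seen[row]
    #   m1[row] = beta1 ** k * m1[row] + (1.0 - beta1) * g
    #   m2[row] = beta2 ** k * m2[row] + (1.0 - beta2) * g * g
    #   last_seen[row] = iteration
    #
    # This is equivalent to feeding a zero gradient for every minibatch the
    # row was absent from, without touching every row on every step.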
|
|
|
|
class DecayAdagradOptimizer(Optimizer):
|
|
def __init__(
|
|
self,
|
|
alpha=0.01,
|
|
beta1=0.0,
|
|
beta2=0.999,
|
|
epsilon=0.1,
|
|
weight_decay=0.0,
|
|
ema_options=None,
|
|
bias_correction_first=True,
|
|
policy="fixed",
|
|
engine="",
|
|
**kwargs
|
|
):
|
|
super(DecayAdagradOptimizer, self).__init__()
|
|
self.alpha = alpha
|
|
self.beta1 = beta1
|
|
self.beta2 = beta2
|
|
self.epsilon = epsilon
|
|
self.weight_decay = weight_decay
|
|
self.bias_correction_first = bias_correction_first
|
|
self.policy = policy
|
|
self.engine = engine
|
|
self.init_kwargs = kwargs
|
|
self._process_ema_options(ema_options)
|
|
|
|
def _process_ema_options(self, ema_options):
|
|
self.ema_enabled = True if ema_options else False
|
|
if self.ema_enabled:
|
|
self.ema_start = ema_options.get("ema_start", None)
|
|
self.ema_end = ema_options.get("ema_end", None)
|
|
self.ema_step = ema_options.get("ema_step", None)
|
|
self.ema_alpha = ema_options.get("ema_alpha", None)
|
|
|
|
def _run(self, net, param_init_net, param_info):
|
|
param = param_info.blob
|
|
grad = param_info.grad
|
|
|
|
if self.alpha <= 0:
|
|
return
|
|
|
|
lr, iteration = self.build_lr(
|
|
net,
|
|
param_init_net,
|
|
base_learning_rate=self.alpha,
|
|
policy=self.policy,
|
|
**(self.init_kwargs)
|
|
)
|
|
|
|
if isinstance(grad, core.GradientSlice):
|
|
# hack for position weighted.
|
|
param_squared_sum = param_init_net.ConstantFill([param], param + "_squared_sum", value=0.0)
|
|
self._aux_params.local.append(param_squared_sum)
|
|
output_blobs = [param, param_squared_sum]
|
|
net.SparseAdagrad(
|
|
[param, param_squared_sum, grad.indices, grad.values, lr],
|
|
output_blobs,
|
|
epsilon=self.epsilon,
|
|
)
|
|
else:
|
|
m1 = param_init_net.ConstantFill([param], param + "_first_mo1ment", value=0.0)
|
|
m2 = param_init_net.ConstantFill([param], param + "_second_moment", value=0.0)
|
|
self._aux_params.shared.append(iteration)
|
|
self._aux_params.local.append(m1)
|
|
self._aux_params.local.append(m2)
|
|
output_blobs = [param, m1, m2]
|
|
net.DecayAdagrad(
|
|
[param, m1, m2, grad, lr, iteration],
|
|
output_blobs,
|
|
beta1=self.beta1,
|
|
beta2=self.beta2,
|
|
epsilon=self.epsilon,
|
|
weight_decay=self.weight_decay,
|
|
bias_correction_first=self.bias_correction_first,
|
|
)
|
|
|
|
if self.ema_enabled:
|
|
param_ema = str(param) + "_ema"
|
|
if not param_init_net.BlobIsDefined(param_ema):
|
|
param_init_net.ConstantFill([param], param_ema, value=0.0)
|
|
self._aux_params.local.append(param_ema)
|
|
|
|
net.EMA(
|
|
[param, param_ema, iteration],
|
|
[param, param_ema],
|
|
ema_start=self.ema_start,
|
|
ema_end=self.ema_end,
|
|
ema_step=self.ema_step,
|
|
ema_alpha=self.ema_alpha,
|
|
)
|
|
|
|
def scale_learning_rate(self, scale):
|
|
self.alpha *= scale
|
|
return
|
|
|
|
class YellowFinOptimizer(Optimizer):
|
|
"""YellowFin: An automatic tuner for momentum SGD
|
|
|
|
See https://arxiv.org/abs/1706.03471 for more details. This implementation
|
|
    has a separate learning rate and momentum for each parameter."""
|
|
|
|
def __init__(
|
|
self,
|
|
alpha=0.1,
|
|
mu=0.0,
|
|
beta=0.999,
|
|
curv_win_width=20,
|
|
zero_debias=True,
|
|
epsilon=0.1 ** 6,
|
|
policy="fixed",
|
|
sparse_dedup_aggregator=None,
|
|
**kwargs
|
|
):
|
|
super(YellowFinOptimizer, self).__init__()
|
|
self.alpha = alpha
|
|
self.mu = mu
|
|
self.beta = beta
|
|
self.curv_win_width = curv_win_width
|
|
self.zero_debias = zero_debias
|
|
self.epsilon = epsilon
|
|
self.policy = policy
|
|
self.sparse_dedup_aggregator = sparse_dedup_aggregator
|
|
self.init_kwargs = kwargs
|
|
|
|
def _run(self, net, param_init_net, param_info):
|
|
|
|
        # Note: this is the number of persistent scalars kept by the YellowFin
        # optimizer. It should always match the number of scalars actually
        # being used, and the same number should be used by the operator
        # implementation.
|
|
SCALARS_MEMORY_SIZE = 5
|
|
|
|
param = param_info.blob
|
|
grad = param_info.grad
|
|
moment = param_init_net.ConstantFill([param], param + "_moment", value=0.0)
|
|
curv_win = param_init_net.ConstantFill(
|
|
[], param + "_curv_win", shape=[self.curv_win_width], value=0.0
|
|
)
|
|
g_avg = param_init_net.ConstantFill([param], param + "_g_avg", value=0.0)
|
|
g2_avg = param_init_net.ConstantFill([param], param + "_g2_avg", value=0.0)
|
|
lr_avg = param_init_net.ConstantFill(
|
|
[], param + "_lr_avg", shape=[1], value=self.alpha
|
|
)
|
|
mu_avg = param_init_net.ConstantFill(
|
|
[], param + "_mu_avg", shape=[1], value=self.mu
|
|
)
|
|
scalars_memory = param_init_net.ConstantFill(
|
|
[], param + "_scalars_memory", shape=[SCALARS_MEMORY_SIZE], value=0.0
|
|
)
|
|
|
|
assert self.alpha > 0
|
|
assert not isinstance(
|
|
grad, core.GradientSlice
|
|
), "YellowFin does not support sparse gradients"
|
|
|
|
iteration = utils.BuildUniqueMutexIter(param_init_net, net, iter_val=0)
|
|
|
|
self._aux_params.shared.append(iteration)
|
|
self._aux_params.local.append(moment)
|
|
self._aux_params.local.append(lr_avg)
|
|
self._aux_params.local.append(mu_avg)
|
|
self._aux_params.local.append(curv_win)
|
|
self._aux_params.local.append(g_avg)
|
|
self._aux_params.local.append(g2_avg)
|
|
self._aux_params.local.append(scalars_memory)
|
|
|
|
yf_in_out_args = [
|
|
param,
|
|
moment,
|
|
lr_avg,
|
|
mu_avg,
|
|
curv_win,
|
|
g_avg,
|
|
g2_avg,
|
|
scalars_memory,
|
|
]
|
|
|
|
net.YellowFin(
|
|
yf_in_out_args + [grad, iteration],
|
|
yf_in_out_args,
|
|
beta=self.beta,
|
|
epsilon=self.epsilon,
|
|
curv_win_width=self.curv_win_width,
|
|
zero_debias=self.zero_debias,
|
|
)
|
|
|
|
def scale_learning_rate(self, scale):
|
|
self.alpha *= scale
|
|
return


class RmsPropOptimizer(Optimizer):
    def __init__(
        self,
        alpha=0.01,
        decay=0.9,
        momentum=0.0,
        epsilon=1e-5,
        policy="fixed",
        engine="",
        **kwargs
    ):
        super(RmsPropOptimizer, self).__init__()
        self.alpha = alpha
        self.decay = decay
        self.momentum = momentum
        self.epsilon = epsilon
        self.policy = policy
        self.engine = engine
        self.init_kwargs = kwargs

    def _run(self, net, param_init_net, param_info):
        param = param_info.blob
        grad = param_info.grad

        assert self.alpha > 0
        assert not isinstance(
            grad, core.GradientSlice
        ), "RmsPropOptimizer doesn't support sparse gradients"

        dev = scope.CurrentDeviceScope()
        if dev is None:
            dev = core.DeviceOption(caffe2_pb2.CPU)

        ONE = param_init_net.ConstantFill(
            [], "ONE_{}_{}".format(dev.device_type, dev.device_id), shape=[1], value=1.0
        )

        lr, _ = self.build_lr(
            net,
            param_init_net,
            base_learning_rate=-self.alpha,
            policy=self.policy,
            **(self.init_kwargs)
        )

        grad_o = param_init_net.ConstantFill(
            [param], str(param) + "_grad_o", value=0.0
        )

        ms = param_init_net.ConstantFill(
            [param], str(param) + "_mean_squares", value=0.0
        )

        mom = param_init_net.ConstantFill([param], str(param) + "_momentum", value=0.0)

        self._aux_params.local.append(ms)
        self._aux_params.local.append(mom)

        net.RmsProp(
            [grad, ms, mom, ONE],
            [grad_o, ms, mom],
            decay=self.decay,
            momentum=self.momentum,
            epsilon=self.epsilon,
            engine=self.engine,
        )

        net.MomentumSGDUpdate([grad_o, mom, lr, param], [grad_o, mom, param])

    def scale_learning_rate(self, scale):
        self.alpha *= scale
        return
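
# For reference, the classical RMSProp-with-momentum update that this pairing of
# the RmsProp and MomentumSGDUpdate operators approximates (the exact numerics
# are defined by those operators):
#
#     ms_t  = decay * ms_{t-1} + (1 - decay) * g_t^2
#     mom_t = momentum * mom_{t-1} + lr * g_t / sqrt(ms_t + epsilon)
#     w_t   = w_{t-1} - mom_t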


def _get_param_to_device(model):
    # Infer blob devices by going through the net and param_init_net
    # ops and observing the device used to create or use the blob.
    param_to_device = core.InferBlobDevices(model.net)
    param_to_device.update(core.InferBlobDevices(model.param_init_net))
    return param_to_device


def get_param_device(param_name, grad, param_to_device=None, default_device=None):
    device = default_device
    param_to_device = param_to_device or {}
    # We first check whether the parameter's device has already been inferred.
    # If not, we check the gradient. This can happen if the parameter is not the
    # output of any op but was created externally (e.g., fed into the workspace).
    if param_name in param_to_device:
        device = param_to_device[param_name]
    else:
        if isinstance(grad, core.GradientSlice):
            if str(grad.values) in param_to_device:
                device = param_to_device[str(grad.values)]
            elif str(grad.indices) in param_to_device:
                device = param_to_device[str(grad.indices)]
        else:
            grad_name = str(grad)
            if grad_name in param_to_device:
                device = param_to_device[grad_name]

    assert device is not None, "Cannot infer device for {}: no op creates it".format(
        param_name
    )
    return device


def get_lr_injection():
    """
    Gets the current value of lr_injection, a multiplier applied to all base
    learning rates.
    The optimizer must be built with allow_lr_injection=True, as this relies
    on synchronization over the CPU.
    """
    return workspace.FetchBlob(_LEARNING_RATE_INJECTION)


def set_lr_injection(lr_injection_value):
    """
    Sets lr_injection, a multiplier applied to all base learning rates.
    The optimizer must be built with allow_lr_injection=True, as this relies
    on synchronization over the CPU.
    """
    workspace.FeedBlob(
        _LEARNING_RATE_INJECTION,
        np.array([float(lr_injection_value)], dtype=np.float32),
    )
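
# Illustrative sketch of adjusting the global learning-rate multiplier at run
# time (an example under assumptions, not part of the module: it presumes an
# optimizer built with allow_lr_injection=True and an already-initialized
# workspace):
#
#     build_sgd(model, base_learning_rate=0.1, allow_lr_injection=True)
#     workspace.RunNetOnce(model.param_init_net)
#     set_lr_injection(0.5)      # halve every base learning rate
#     get_lr_injection()         # -> array([0.5], dtype=float32)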


def _calc_norm_ratio(model, params, name_scope, param_to_device, max_gradient_norm):
    with core.NameScope(name_scope):
        grad_squared_sums = []
        for i, param in enumerate(params):
            device = get_param_device(str(param.blob), param.grad, param_to_device)

            with core.DeviceScope(device):
                grad = (
                    param.grad
                    if not isinstance(param.grad, core.GradientSlice)
                    else param.grad.values
                )

                grad_squared_sum_name = "grad_{}_squared_sum".format(i)
                grad_squared_sum = model.net.SumSqrElements(grad, grad_squared_sum_name)
                grad_squared_sum_cpu = model.net.EnsureCPUOutput(grad_squared_sum)
                grad_squared_sums.append(grad_squared_sum_cpu)

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            grad_squared_full_sum = model.net.Sum(
                grad_squared_sums, "grad_squared_full_sum"
            )
            global_norm = model.net.Pow(
                grad_squared_full_sum, "global_norm", exponent=0.5
            )
            clip_norm = model.param_init_net.ConstantFill(
                [], "clip_norm", shape=[], value=float(max_gradient_norm)
            )
            max_norm = model.net.Max([global_norm, clip_norm], "max_norm")
            norm_ratio = model.net.Div([clip_norm, max_norm], "norm_ratio")
            return norm_ratio
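
# Worked example of the ratio computed above: with max_gradient_norm=5.0 and a
# global gradient norm of 10.0, norm_ratio = 5.0 / max(10.0, 5.0) = 0.5, so every
# base learning rate is halved; when the global norm is already below the
# threshold, norm_ratio stays at 1.0 and the update is unchanged.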


def _build(
    model,
    optimizer,
    weights_only=False,
    use_param_info_optim=True,
    max_gradient_norm=None,
    allow_lr_injection=False,
):
    # Attaches `optimizer` to every optimization parameter of `model`, honoring
    # per-parameter optimizers (unless use_param_info_optim is False) and,
    # optionally, global gradient-norm clipping and learning-rate injection.
    param_to_device = _get_param_to_device(model)

    # Validate there are no duplicate params
    model.Validate()

    params = []
    for param_info in model.GetOptimizationParamInfo():
        if weights_only and param_info.blob not in model.weights:
            continue
        params.append(param_info)

    lr_multiplier = None
    if max_gradient_norm is not None:
        lr_multiplier = _calc_norm_ratio(
            model,
            params,
            "norm_clipped_grad_update",
            param_to_device,
            max_gradient_norm,
        )

    if allow_lr_injection:
        if not model.net.BlobIsDefined(_LEARNING_RATE_INJECTION):
            lr_injection = model.param_init_net.ConstantFill(
                [], _LEARNING_RATE_INJECTION, shape=[1], value=1.0
            )
        else:
            lr_injection = _LEARNING_RATE_INJECTION

        if lr_multiplier is None:
            lr_multiplier = lr_injection
        else:
            lr_multiplier = model.net.Mul(
                [lr_multiplier, lr_injection], "lr_multiplier", broadcast=1
            )
        optimizer.add_lr_multiplier(lr_multiplier)

    for param_info in params:
        param_name = str(param_info.blob)
        device = get_param_device(param_name, param_info.grad, param_to_device)
        with core.DeviceScope(device):
            if param_info.optimizer and use_param_info_optim:
                param_info.optimizer(model.net, model.param_init_net, param_info)
            else:
                optimizer(model.net, model.param_init_net, param_info)
    return optimizer


def add_weight_decay(model, weight_decay):
    """Adds a decay to weights in the model.

    This is a form of L2 regularization.

    Args:
        weight_decay: strength of the regularization
    """
    _build(
        model,
        WeightDecayBuilder(weight_decay=weight_decay),
        weights_only=True,
        use_param_info_optim=False,
    )
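
# Illustrative sketch (assumes `model` already has gradients added and an
# optimizer built, e.g. via build_sgd below); the decay is applied only to the
# blobs listed in model.weights:
#
#     build_sgd(model, base_learning_rate=0.1)
#     add_weight_decay(model, weight_decay=1e-4)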


def build_sgd(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    sgd_optimizer = SgdOptimizer(base_learning_rate, **kwargs)
    return _build(
        model,
        sgd_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )
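
# Illustrative sketch of a typical call (hyperparameter names beyond
# base_learning_rate are assumptions forwarded to SgdOptimizer and the
# LearningRate operator, shown here only as an example):
#
#     build_sgd(
#         model,
#         base_learning_rate=0.1,
#         momentum=0.9,
#         policy="step",
#         stepsize=1000,
#         gamma=0.999,
#         max_gradient_norm=5.0,
#     )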


def build_multi_precision_sgd(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    multi_prec_sgd_optimizer = MultiPrecisionSgdOptimizer(base_learning_rate, **kwargs)
    return _build(
        model,
        multi_prec_sgd_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )


def build_fp16_sgd(model, base_learning_rate, **kwargs):
    fp16_sgd_optimizer = FP16SgdOptimizer(base_learning_rate, **kwargs)
    return _build(model, fp16_sgd_optimizer)


def build_ftrl(model, engine="SIMD", **kwargs):
    if engine == "SIMD":
        assert core.IsOperator("Ftrl_ENGINE_SIMD")
        assert core.IsOperator("SparseFtrl_ENGINE_SIMD")
    ftrl_optimizer = FtrlOptimizer(engine=engine, **kwargs)
    return _build(model, ftrl_optimizer)


def build_gftrl(model, engine="", **kwargs):
    if engine == "SIMD":
        assert core.IsOperator("GFtrl_ENGINE_SIMD")
    gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs)
    return _build(model, gftrl_optimizer)


def build_adagrad(
    model,
    base_learning_rate,
    parameters=None,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    adagrad_optimizer = AdagradOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        adagrad_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )


def build_wngrad(
    model,
    base_learning_rate,
    parameters=None,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    wngrad_optimizer = WngradOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        wngrad_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )


def build_storm(
    model,
    base_learning_rate,
    parameters=None,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    storm_optimizer = StormOptimizer(lr=base_learning_rate, **kwargs)
    return _build(
        model,
        storm_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )


def build_adadelta(
    model,
    base_learning_rate,
    parameters=None,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    adadelta_optimizer = AdadeltaOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        adadelta_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )


def build_adam(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    adam_optimizer = AdamOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        adam_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )
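
# Illustrative sketch (beta1/beta2/epsilon are the usual Adam hyperparameters,
# assumed here to be accepted by AdamOptimizer via **kwargs; shown only as an
# example):
#
#     build_adam(
#         model,
#         base_learning_rate=1e-3,
#         beta1=0.9,
#         beta2=0.999,
#         epsilon=1e-8,
#     )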


def build_decay_adagrad(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    decay_adagrad_optimizer = DecayAdagradOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        decay_adagrad_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )


def build_yellowfin(model, base_learning_rate=0.1, **kwargs):
    yellowfin_optimizer = YellowFinOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(model, yellowfin_optimizer)


def build_rms_prop(
    model,
    base_learning_rate,
    max_gradient_norm=None,
    allow_lr_injection=False,
    **kwargs
):
    rms_prop_optimizer = RmsPropOptimizer(alpha=base_learning_rate, **kwargs)
    return _build(
        model,
        rms_prop_optimizer,
        max_gradient_norm=max_gradient_norm,
        allow_lr_injection=allow_lr_injection,
    )