pytorch/caffe2/python/optimizer.py
Bor-Yiing Su 7270471ed6 Returns auxiliary parameters in the optimizers.
Summary:
1. Adds a function that returns the auxiliary parameters of each optimizer. This function can be used to serialize the optimizers so that they can be recovered later; a usage sketch follows below.
2. Fixes a bug where the iteration blob was not incremented by exactly one per iteration: with k parameters using the Adam optimizer, the original implementation incremented the iteration blob by k.
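A rough usage sketch of the new accessor for checkpointing (the optimizer instance and blob names are hypothetical; workspace.FetchBlob/FeedBlob are the standard Caffe2 calls for reading and writing blob values):

    from caffe2.python import workspace

    # "adam" was returned by optimizer.build_adam(model, base_learning_rate=0.001)
    aux = adam.get_auxiliary_parameters()
    state = {
        "local": {str(b): workspace.FetchBlob(str(b)) for b in aux.local},
        "shared": {str(b): workspace.FetchBlob(str(b)) for b in aux.shared},
    }
    # 'state' can be serialized alongside the model parameters and restored
    # later via workspace.FeedBlob on each saved blob.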

Reviewed By: azzolini

Differential Revision: D4872397

fbshipit-source-id: d86711feedda2ba83af5f2a18141b06a6a473733
2017-04-17 10:16:32 -07:00


## @package optimizer
# Module caffe2.python.optimizer
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from collections import namedtuple
from caffe2.python import core
from caffe2.proto import caffe2_pb2
_OPTIMIZER_ITERATION_NAME = "optimizer_iteration"


class Optimizer(object):
    def __init__(self):
        AuxParams = namedtuple("AuxParams", ["local", "shared"])
        self._aux_params = AuxParams(local=[], shared=[])

    def __call__(self, net, param_init_net, param, grad):
        raise NotImplementedError()

    @staticmethod
    def build_lr(net, param_init_net, base_learning_rate,
                 learning_rate_blob="lr", policy="fixed",
                 iter_val=0, **kwargs):
        if not param_init_net.BlobIsDefined(_OPTIMIZER_ITERATION_NAME):
            # Add training operators.
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                iteration = param_init_net.ConstantFill(
                    [], _OPTIMIZER_ITERATION_NAME, shape=[1],
                    value=iter_val,
                    dtype=core.DataType.INT32)
                iter_mutex = param_init_net.CreateMutex(
                    [], ["iteration_mutex"])
                net.AtomicIter([iter_mutex, iteration], [iteration])
        else:
            iteration = param_init_net.GetBlobRef(_OPTIMIZER_ITERATION_NAME)

        # There is one interesting thing here: since we are minimizing, we are
        # doing "descent" so the learning rate is set to be negative.
        lr = net.LearningRate(
            [iteration],
            learning_rate_blob,
            base_lr=-base_learning_rate,
            policy=policy,
            **kwargs
        )
        return lr, iteration
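    # A brief sketch of the sharing behavior above (parameter names are
    # hypothetical): when two parameters "w" and "b" each call build_lr, only
    # the first call creates the "optimizer_iteration" blob and the single
    # AtomicIter op; the second call reuses the existing blob, so the counter
    # advances by exactly one per iteration rather than once per parameter.
    #
    #     lr_w, it = Optimizer.build_lr(net, param_init_net, 0.01, "w_lr")
    #     lr_b, it = Optimizer.build_lr(net, param_init_net, 0.01, "b_lr")
    #     # net now contains exactly one AtomicIter operator.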
    @staticmethod
    def dedup(net, sparse_dedup_aggregator, grad):
        assert isinstance(grad, core.GradientSlice)
        if sparse_dedup_aggregator:
            return net.DeduplicateGradientSlices(
                grad, aggregator=sparse_dedup_aggregator)
        else:
            return grad

    def get_auxiliary_parameters(self):
        """Returns the auxiliary parameters of this optimizer.

        Returns:
            aux_params: An AuxParams namedtuple with two fields.

            aux_params.local is a list of blobs, one per local auxiliary
            parameter. A local auxiliary parameter is kept in parallel with
            each learning-rate parameter. Take adagrad as an example: the
            local auxiliary parameter is the squared-sum blob, because every
            learning rate has a squared sum associated with it.

            aux_params.shared is also a list of blobs, one per shared
            auxiliary parameter. A shared auxiliary parameter is shared
            across all the learning-rate parameters. Take adam as an
            example: the iteration blob is shared, because all the learning
            rates use the same iteration counter.
        """
        return self._aux_params
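# A minimal sketch of the local/shared split, using the AdamOptimizer defined
# below and hypothetical blob names "w" / "w_grad":
#
#     adam = AdamOptimizer(alpha=0.001)
#     adam(net, param_init_net, w, w_grad)
#     aux = adam.get_auxiliary_parameters()
#     # aux.local holds w_first_moment and w_second_moment;
#     # aux.shared holds the shared optimizer_iteration blob.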
class SgdOptimizer(Optimizer):
    def __init__(self, base_learning_rate=0.01, policy='fixed',
                 momentum=0.0, **kwargs):
        super(SgdOptimizer, self).__init__()
        self.base_learning_rate = base_learning_rate
        self.policy = policy
        self.momentum = momentum
        self.init_kwargs = kwargs

    def __call__(self, net, param_init_net, param, grad):
        if self.base_learning_rate <= 0:
            return

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.base_learning_rate,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy,
            **(self.init_kwargs)
        )

        ONE = param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
        self._aux_params.shared.append(ONE)

        if self.momentum > 0:
            momentum_data = param_init_net.ConstantFill(
                param, str(param) + "_momentum", value=0.)
            self._aux_params.local.append(momentum_data)

        if isinstance(grad, core.GradientSlice):
            assert self.momentum == 0., "Doesn't support momentum for sparse"
            net.ScatterWeightedSum(
                [param, ONE, grad.indices, grad.values, lr],
                param
            )
        else:
            if self.momentum > 0.:
                net.MomentumSGD(
                    [grad, momentum_data, lr], [grad, momentum_data],
                    momentum=self.momentum,
                    nesterov=1)
                coeff = ONE
            else:
                coeff = lr

            net.WeightedSum(
                [param, ONE, grad, coeff],
                param
            )


class AdagradOptimizer(Optimizer):
    def __init__(self, alpha=0.01, epsilon=1e-4, policy="fixed",
                 sparse_dedup_aggregator=None, engine='', **kwargs):
        super(AdagradOptimizer, self).__init__()
        self.alpha = alpha
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.init_kwargs = kwargs

    def __call__(self, net, param_init_net, param, grad):
        if self.alpha <= 0:
            return

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy,
            **(self.init_kwargs)
        )

        param_squared_sum = param_init_net.ConstantFill(
            [param],
            str(param) + "_squared_sum",
            value=0.0
        )
        self._aux_params.local.append(param_squared_sum)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseAdagrad(
                [param, param_squared_sum, grad.indices, grad.values, lr],
                [param, param_squared_sum],
                epsilon=self.epsilon,
                engine=self.engine
            )
        else:
            net.Adagrad(
                [param, param_squared_sum, grad, lr],
                [param, param_squared_sum],
                epsilon=self.epsilon,
                engine=self.engine
            )


class FtrlOptimizer(Optimizer):
    def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
                 sparse_dedup_aggregator=None, engine=''):
        super(FtrlOptimizer, self).__init__()
        self.alpha = alpha
        self.beta = beta
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine

    def __call__(self, net, param_init_net, param, grad):
        if self.alpha <= 0:
            return

        nz = param_init_net.ConstantFill(
            [param],
            str(param) + "_ftrl_nz",
            extra_shape=[2],
            value=0.0
        )
        self._aux_params.local.append(nz)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseFtrl(
                [param, nz, grad.indices, grad.values],
                [param, nz],
                engine=self.engine,
                alpha=self.alpha,
                beta=self.beta,
                lambda1=self.lambda1,
                lambda2=self.lambda2
            )
        else:
            net.Ftrl(
                [param, nz, grad],
                [param, nz],
                engine=self.engine,
                alpha=self.alpha,
                beta=self.beta,
                lambda1=self.lambda1,
                lambda2=self.lambda2
            )


class AdamOptimizer(Optimizer):
    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                 policy='fixed', sparse_dedup_aggregator=None,
                 engine='', **kwargs):
        super(AdamOptimizer, self).__init__()
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.init_kwargs = kwargs

    def __call__(self, net, param_init_net, param, grad):
        if self.alpha <= 0:
            return

        lr, iteration = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy,
            **(self.init_kwargs)
        )

        m1 = param_init_net.ConstantFill(
            [param],
            param + "_first_moment",
            value=0.0
        )
        m2 = param_init_net.ConstantFill(
            [param],
            param + "_second_moment",
            value=0.0
        )
        self._aux_params.shared.append(iteration)
        self._aux_params.local.append(m1)
        self._aux_params.local.append(m2)

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseAdam(
                [param, m1, m2, grad.indices, grad.values, lr, iteration],
                [param, m1, m2],
                beta1=self.beta1,
                beta2=self.beta2,
                epsilon=self.epsilon
            )
        else:
            net.Adam(
                [param, m1, m2, grad, lr, iteration],
                [param, m1, m2],
                beta1=self.beta1,
                beta2=self.beta2,
                epsilon=self.epsilon)


def build_sgd(model, base_learning_rate, **kwargs):
    sgd_optimizer = SgdOptimizer(base_learning_rate, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        sgd_optimizer(model.net, model.param_init_net, param, grad)
    return sgd_optimizer


def build_ftrl(model, engine="SIMD", **kwargs):
    if engine == "SIMD":
        assert core.IsOperator('Ftrl_ENGINE_SIMD')
        assert core.IsOperator('SparseFtrl_ENGINE_SIMD')
    ftrl_optimizer = FtrlOptimizer(engine=engine, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        ftrl_optimizer(model.net, model.param_init_net, param, grad)
    return ftrl_optimizer


def build_adagrad(model, base_learning_rate, parameters=None, **kwargs):
    adagrad_optimizer = AdagradOptimizer(alpha=base_learning_rate, **kwargs)
    param_to_grad = model.GetOptimizationPairs(parameters)
    for param, grad in param_to_grad.items():
        adagrad_optimizer(model.net, model.param_init_net, param, grad)
    return adagrad_optimizer


def build_adam(model, base_learning_rate, **kwargs):
    adam_optimizer = AdamOptimizer(alpha=base_learning_rate, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        adam_optimizer(model.net, model.param_init_net, param, grad)
    return adam_optimizer
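# A minimal end-to-end sketch of the helpers above ("model" is assumed to be a
# model helper exposing net, param_init_net and GetOptimizationPairs()):
#
#     from caffe2.python import optimizer
#
#     sgd = optimizer.build_sgd(model, base_learning_rate=0.1)
#     print(sgd.get_auxiliary_parameters().shared)  # contains the "ONE" blob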