pytorch/caffe2/python/optimizer.py
Huazhong Ning 83437853ad refactor and modularize optimizers
Summary:
The current optimizer code in c2/python has the following issues:
(1) the optimizers in sgd.py cannot be configured per param blob;
(2) sgd.py is a poor file name; optimizer.py is a better one;
(3) layer_model_helper.py has a separate set of optimizer code (which does support per-param-blob optimizers).

This diff does the following:
(1) creates optimizer objects that can be configured per param blob and remain compatible with the existing optimizer code (see the usage sketch below);
(2) makes the new optimizer code much more modular;
(3) moves the optimizer code to a better-named file (optimizer.py);
(4) replaces the optimizer imports in the existing code.
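
A minimal usage sketch (illustrative, not part of this diff): model is assumed to be a model helper that exposes net, param_init_net, and GetOptimizationPairs(), which is the interface the build_* helpers in optimizer.py rely on.

    from caffe2.python import optimizer

    # Use one optimizer for every parameter via a build_* helper; extra
    # keyword arguments are forwarded to the LearningRate operator.
    optimizer.build_sgd(model, base_learning_rate=0.1)

    # Or configure specific param blobs by calling an Optimizer object
    # directly on each (param, grad) pair.
    adagrad = optimizer.AdagradOptimizer(alpha=0.01, epsilon=1e-4)
    for param, grad in model.GetOptimizationPairs().items():
        adagrad(model.net, model.param_init_net, param, grad)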

Will do in follow-up diffs:
(1) optimizers with structured parameters for dper2;
(2) remove the optimizer code from layer_model_helper.py.

Reviewed By: salexspb

Differential Revision: D4609013

fbshipit-source-id: 2e2d6dfa8685d10498f89069157453d9feca3f27
2017-03-07 18:46:47 -08:00

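# Per-parameter-blob optimizers for Caffe2.
#
# Each Optimizer subclass below is a callable: given (net, param_init_net,
# param, grad) it adds the update operators for that single parameter blob,
# which is what makes per-param-blob configuration possible. The build_*
# helpers at the bottom of the file apply one optimizer to every
# (param, grad) pair returned by model.GetOptimizationPairs().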

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core
from caffe2.proto import caffe2_pb2

class Optimizer(object):
    def __init__(self):
        pass

    def __call__(self, net, param_init_net, param, grad):
        raise NotImplementedError()

    @staticmethod
    def build_lr(net, param_init_net, base_learning_rate,
                 learning_rate_blob="lr", policy="fixed",
                 iter_val=0, **kwargs):
        # Add training operators.
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            iterator = param_init_net.ConstantFill(
                [], "iterator", shape=[1],
                value=iter_val,
                dtype=core.DataType.INT32)
            net.Iter(iterator, iterator)

        # There is one interesting thing here: since we are minimizing, we are
        # doing "descent" so the learning rate is set to be negative.
        lr = net.LearningRate(
            [iterator],
            learning_rate_blob,
            base_lr=-base_learning_rate,
            policy=policy,
            **kwargs
        )
        return lr, iterator

    @staticmethod
    def dedup(net, sparse_dedup_aggregator, grad):
        assert (isinstance(grad, core.GradientSlice))
        if sparse_dedup_aggregator:
            return net.DeduplicateGradientSlices(
                grad, aggregator=sparse_dedup_aggregator)
        else:
            return grad

class SgdOptimizer(Optimizer):
    def __init__(self, base_learning_rate=0.01, policy='fixed',
                 momentum=0.0, **kwargs):
        self.base_learning_rate = base_learning_rate
        self.policy = policy
        self.momentum = momentum
        self.init_kwargs = kwargs

    def __call__(self, net, param_init_net, param, grad):
        if self.base_learning_rate <= 0:
            return

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.base_learning_rate,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy,
            **(self.init_kwargs)
        )

        ONE = param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)

        if self.momentum > 0:
            momentum_data = param_init_net.ConstantFill(
                param, str(param) + "_momentum", value=0.)

        if isinstance(grad, core.GradientSlice):
            assert self.momentum == 0., "Doesn't support momentum for sparse"
            net.ScatterWeightedSum(
                [param, ONE, grad.indices, grad.values, lr],
                param
            )
        else:
            if self.momentum > 0.:
                net.MomentumSGD(
                    [grad, momentum_data, lr], [grad, momentum_data],
                    momentum=self.momentum,
                    nesterov=1)
                coeff = ONE
            else:
                coeff = lr

            net.WeightedSum(
                [param, ONE, grad, coeff],
                param
            )

class AdagradOptimizer(Optimizer):
    def __init__(self, alpha=0.01, epsilon=1e-4, policy="fixed",
                 sparse_dedup_aggregator=None, engine='', **kwargs):
        self.alpha = alpha
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.init_kwargs = kwargs

    def __call__(self, net, param_init_net, param, grad):
        if self.alpha <= 0:
            return

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy,
            **(self.init_kwargs)
        )

        param_square_sum = param_init_net.ConstantFill(
            [param],
            str(param) + "_square_sum",
            value=0.0
        )

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseAdagrad(
                [param, param_square_sum, grad.indices, grad.values, lr],
                [param, param_square_sum],
                epsilon=self.epsilon,
                engine=self.engine
            )
        else:
            net.Adagrad(
                [param, param_square_sum, grad, lr],
                [param, param_square_sum],
                epsilon=self.epsilon,
                engine=self.engine
            )

class FtrlOptimizer(Optimizer):
    def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
                 sparse_dedup_aggregator=None, engine=''):
        self.alpha = alpha
        self.beta = beta
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine

    def __call__(self, net, param_init_net, param, grad):
        if self.alpha <= 0:
            return

        nz = param_init_net.ConstantFill(
            [param],
            str(param) + "_ftrl_nz",
            extra_shape=[2],
            value=0.0
        )
        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseFtrl(
                [param, nz, grad.indices, grad.values],
                [param, nz],
                engine=self.engine,
                alpha=self.alpha,
                beta=self.beta,
                lambda1=self.lambda1,
                lambda2=self.lambda2
            )
        else:
            net.Ftrl(
                [param, nz, grad],
                [param, nz],
                engine=self.engine,
                alpha=self.alpha,
                beta=self.beta,
                lambda1=self.lambda1,
                lambda2=self.lambda2
            )

class AdamOptimizer(Optimizer):
    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                 policy='fixed', sparse_dedup_aggregator=None,
                 engine='', **kwargs):
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.init_kwargs = kwargs

    def __call__(self, net, param_init_net, param, grad):
        if self.alpha <= 0:
            return

        lr, iterator = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy,
            **(self.init_kwargs)
        )

        m1 = param_init_net.ConstantFill(
            [param],
            param + "_first_moment",
            value=0.0
        )
        m2 = param_init_net.ConstantFill(
            [param],
            param + "_second_moment",
            value=0.0
        )

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseAdam(
                [param, m1, m2, grad.indices, grad.values, lr, iterator],
                [param, m1, m2],
                beta1=self.beta1,
                beta2=self.beta2,
                epsilon=self.epsilon
            )
        else:
            net.Adam(
                [param, m1, m2, grad, lr, iterator],
                [param, m1, m2],
                beta1=self.beta1,
                beta2=self.beta2,
                epsilon=self.epsilon)

def build_sgd(model, base_learning_rate, **kwargs):
    sgd_optimizer = SgdOptimizer(base_learning_rate, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        sgd_optimizer(model.net, model.param_init_net, param, grad)


def build_ftrl(model, engine="SIMD", **kwargs):
    if engine == "SIMD":
        assert core.IsOperator('Ftrl_ENGINE_SIMD')
        assert core.IsOperator('SparseFtrl_ENGINE_SIMD')
    ftrl_optimizer = FtrlOptimizer(engine=engine, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        ftrl_optimizer(model.net, model.param_init_net, param, grad)


def build_adagrad(model, base_learning_rate, parameters=None, **kwargs):
    adagrad_optimizer = AdagradOptimizer(alpha=base_learning_rate, **kwargs)
    param_to_grad = model.GetOptimizationPairs(parameters)
    for param, grad in param_to_grad.items():
        adagrad_optimizer(model.net, model.param_init_net, param, grad)


def build_adam(model, base_learning_rate, **kwargs):
    adam_optimizer = AdamOptimizer(alpha=base_learning_rate, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        adam_optimizer(model.net, model.param_init_net, param, grad)