Summary:
The current optimizer code in c2/python has the following issues:
(1) the optimizers in sgd.py cannot be configured per param blob;
(2) sgd.py is a poor file name; optimizer.py is a better one;
(3) layer_model_helper.py contains a second set of optimizer code (which does support per-param-blob optimizers).

This diff:
(1) creates optimizer objects that can be configured per param blob and remain compatible with the existing optimizer code;
(2) makes the new optimizer code much more modular;
(3) moves the optimizer code to a better-named file (optimizer.py);
(4) replaces the optimizer imports in the existing code.

To do in next diffs:
(1) optimizers with structured parameters for dper2;
(2) remove the optimizer code from layer_model_helper.py.

Reviewed By: salexspb
Differential Revision: D4609013
fbshipit-source-id: 2e2d6dfa8685d10498f89069157453d9feca3f27
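As a rough illustration of the per-param-blob configuration this diff enables (a sketch, not part of the file below; it assumes the module lands at caffe2.python.optimizer as described, that `model` is a Caffe2 model helper exposing `net` and `param_init_net`, and that `fc_w`/`fc_w_grad` and `emb_w`/`emb_w_grad` are hypothetical dense and sparse parameter/gradient blobs):

from caffe2.python.optimizer import SgdOptimizer, AdagradOptimizer

# Each parameter blob gets its own optimizer instance and settings.
dense_opt = SgdOptimizer(base_learning_rate=0.1, momentum=0.9)
sparse_opt = AdagradOptimizer(alpha=0.01, epsilon=1e-4)

# Calling an optimizer adds its update operators for that one parameter.
dense_opt(model.net, model.param_init_net, fc_w, fc_w_grad)
sparse_opt(model.net, model.param_init_net, emb_w, emb_w_grad)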
268 lines · 8.6 KiB · Python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from caffe2.python import core
from caffe2.proto import caffe2_pb2


class Optimizer(object):
    """Base class for per-parameter-blob optimizers.

    Subclasses implement __call__(net, param_init_net, param, grad) to add
    the update operators for a single parameter blob.
    """

    def __init__(self):
        pass

    def __call__(self, net, param_init_net, param, grad):
        raise NotImplementedError()

    @staticmethod
    def build_lr(net, param_init_net, base_learning_rate,
                 learning_rate_blob="lr", policy="fixed",
                 iter_val=0, **kwargs):
        # Add training operators.
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            iterator = param_init_net.ConstantFill(
                [], "iterator", shape=[1],
                value=iter_val,
                dtype=core.DataType.INT32)

        net.Iter(iterator, iterator)

        # There is one interesting thing here: since we are minimizing, we are
        # doing "descent" so the learning rate is set to be negative.
        lr = net.LearningRate(
            [iterator],
            learning_rate_blob,
            base_lr=-base_learning_rate,
            policy=policy,
            **kwargs
        )
        return lr, iterator

    @staticmethod
    def dedup(net, sparse_dedup_aggregator, grad):
        assert isinstance(grad, core.GradientSlice)
        if sparse_dedup_aggregator:
            return net.DeduplicateGradientSlices(
                grad, aggregator=sparse_dedup_aggregator)
        else:
            return grad


class SgdOptimizer(Optimizer):
    """Plain SGD with optional momentum.

    Dense gradients are applied with WeightedSum (the gradient is first run
    through MomentumSGD when momentum > 0); sparse gradients (GradientSlice)
    use ScatterWeightedSum and do not support momentum.
    """

    def __init__(self, base_learning_rate=0.01, policy='fixed',
                 momentum=0.0, **kwargs):
        self.base_learning_rate = base_learning_rate
        self.policy = policy
        self.momentum = momentum
        self.init_kwargs = kwargs

    def __call__(self, net, param_init_net, param, grad):
        if self.base_learning_rate <= 0:
            return

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.base_learning_rate,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy,
            **(self.init_kwargs)
        )

        ONE = param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)

        if self.momentum > 0:
            momentum_data = param_init_net.ConstantFill(
                param, str(param) + "_momentum", value=0.)

        if isinstance(grad, core.GradientSlice):
            assert self.momentum == 0., "Doesn't support momentum for sparse"
            net.ScatterWeightedSum(
                [param, ONE, grad.indices, grad.values, lr],
                param
            )
        else:
            if self.momentum > 0.:
                net.MomentumSGD(
                    [grad, momentum_data, lr], [grad, momentum_data],
                    momentum=self.momentum,
                    nesterov=1)
                coeff = ONE
            else:
                coeff = lr

            net.WeightedSum(
                [param, ONE, grad, coeff],
                param
            )


class AdagradOptimizer(Optimizer):
    def __init__(self, alpha=0.01, epsilon=1e-4, policy="fixed",
                 sparse_dedup_aggregator=None, engine='', **kwargs):
        self.alpha = alpha
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.init_kwargs = kwargs

    def __call__(self, net, param_init_net, param, grad):
        if self.alpha <= 0:
            return

        lr, _ = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy,
            **(self.init_kwargs)
        )

        param_square_sum = param_init_net.ConstantFill(
            [param],
            str(param) + "_square_sum",
            value=0.0
        )

        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseAdagrad(
                [param, param_square_sum, grad.indices, grad.values, lr],
                [param, param_square_sum],
                epsilon=self.epsilon,
                engine=self.engine
            )
        else:
            net.Adagrad(
                [param, param_square_sum, grad, lr],
                [param, param_square_sum],
                epsilon=self.epsilon,
                engine=self.engine
            )


class FtrlOptimizer(Optimizer):
    def __init__(self, alpha=0.01, beta=1e-4, lambda1=0, lambda2=0,
                 sparse_dedup_aggregator=None, engine=''):
        self.alpha = alpha
        self.beta = beta
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine

    def __call__(self, net, param_init_net, param, grad):
        if self.alpha <= 0:
            return

        nz = param_init_net.ConstantFill(
            [param],
            str(param) + "_ftrl_nz",
            extra_shape=[2],
            value=0.0
        )
        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseFtrl(
                [param, nz, grad.indices, grad.values],
                [param, nz],
                engine=self.engine,
                alpha=self.alpha,
                beta=self.beta,
                lambda1=self.lambda1,
                lambda2=self.lambda2
            )
        else:
            net.Ftrl(
                [param, nz, grad],
                [param, nz],
                engine=self.engine,
                alpha=self.alpha,
                beta=self.beta,
                lambda1=self.lambda1,
                lambda2=self.lambda2
            )


class AdamOptimizer(Optimizer):
    def __init__(self, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                 policy='fixed', sparse_dedup_aggregator=None,
                 engine='', **kwargs):
        self.alpha = alpha
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.policy = policy
        self.sparse_dedup_aggregator = sparse_dedup_aggregator
        self.engine = engine
        self.init_kwargs = kwargs

    def __call__(self, net, param_init_net, param, grad):
        if self.alpha <= 0:
            return

        # Unlike the other optimizers, Adam also consumes the iteration
        # counter produced by build_lr.
        lr, iterator = self.build_lr(
            net, param_init_net,
            base_learning_rate=self.alpha,
            learning_rate_blob=str(param) + "_lr",
            policy=self.policy,
            **(self.init_kwargs)
        )

        m1 = param_init_net.ConstantFill(
            [param],
            param + "_first_moment",
            value=0.0
        )
        m2 = param_init_net.ConstantFill(
            [param],
            param + "_second_moment",
            value=0.0
        )
        if isinstance(grad, core.GradientSlice):
            grad = self.dedup(net, self.sparse_dedup_aggregator, grad)
            net.SparseAdam(
                [param, m1, m2, grad.indices, grad.values, lr, iterator],
                [param, m1, m2],
                beta1=self.beta1,
                beta2=self.beta2,
                epsilon=self.epsilon
            )
        else:
            net.Adam(
                [param, m1, m2, grad, lr, iterator],
                [param, m1, m2],
                beta1=self.beta1,
                beta2=self.beta2,
                epsilon=self.epsilon)


def build_sgd(model, base_learning_rate, **kwargs):
    sgd_optimizer = SgdOptimizer(base_learning_rate, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        sgd_optimizer(model.net, model.param_init_net, param, grad)


def build_ftrl(model, engine="SIMD", **kwargs):
    if engine == "SIMD":
        # The SIMD engine is only usable if the SIMD variants of the Ftrl
        # operators were built into this binary.
        assert core.IsOperator('Ftrl_ENGINE_SIMD')
        assert core.IsOperator('SparseFtrl_ENGINE_SIMD')
    ftrl_optimizer = FtrlOptimizer(engine=engine, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        ftrl_optimizer(model.net, model.param_init_net, param, grad)


def build_adagrad(model, base_learning_rate, parameters=None, **kwargs):
    adagrad_optimizer = AdagradOptimizer(alpha=base_learning_rate, **kwargs)
    param_to_grad = model.GetOptimizationPairs(parameters)

    for param, grad in param_to_grad.items():
        adagrad_optimizer(model.net, model.param_init_net, param, grad)


def build_adam(model, base_learning_rate, **kwargs):
    adam_optimizer = AdamOptimizer(alpha=base_learning_rate, **kwargs)
    for param, grad in model.GetOptimizationPairs().items():
        adam_optimizer(model.net, model.param_init_net, param, grad)
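For the common case of applying one optimizer to all parameters, the build_* helpers above loop over model.GetOptimizationPairs() in a single call. A minimal usage sketch, again assuming a model helper named `model` that provides GetOptimizationPairs(), plus a hypothetical `sparse_params` list for the Adagrad variant:

from caffe2.python import optimizer

# Attach an SGD update to every (param, grad) pair the model reports.
optimizer.build_sgd(model, base_learning_rate=0.01)

# Or, instead, Adagrad restricted to a subset of parameters.
optimizer.build_adagrad(model, base_learning_rate=0.01,
                        parameters=sparse_params, epsilon=1e-4)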