Summary: Move the open source version of build_ftrl to the open source directory. build_ftrl can use several engines, and the SIMD engine is fb-specific, so we keep a build_ftrl in the fb/optimizers/sgd.py file. If the caller only uses the open source engine, it can import the open source build_ftrl; if the caller may use the SIMD engine, it needs to import the fb-specific build_ftrl. Also move the tests to the python directory.

Reviewed By: salexspb

Differential Revision: D4560384

fbshipit-source-id: 84fc915d3bbe42fd19503ef132d3277088f6fab3
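As a caller-side illustration of the split described above, a minimal sketch follows. The module paths are assumptions inferred from the summary (the open source builder in caffe2.python.sgd, the fb-specific one under fb/optimizers/sgd.py), and using an empty engine string as the non-SIMD fallback is likewise an assumption, not something this diff specifies.

# Hypothetical sketch only; module paths and engine values are assumptions.
# `model` is assumed to be a model helper exposing GetOptimizationPairs(),
# as the builders in this file expect.

# A caller that only needs the open source engine imports the open source builder:
from caffe2.python.sgd import build_ftrl
build_ftrl(model, engine="")  # fall back to the default (non-SIMD) engine

# A caller that may use the fb-specific SIMD engine imports the fb builder instead:
#   from caffe2.fb.optimizers.sgd import build_ftrl
#   build_ftrl(model, engine="SIMD")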
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from caffe2.python import core
from caffe2.proto import caffe2_pb2


def _build_lr(model, base_learning_rate, policy="fixed", iter_val=0,
              **other_lr_params):

    # Add training operators.
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
        ITER = model.param_init_net.ConstantFill([], "ITER", shape=[1],
                                                 value=iter_val,
                                                 dtype=core.DataType.INT32)

        model.net.Iter(ITER, ITER)

    # There is one interesting thing here: since we are minimizing, we are
    # doing "descent" so the learning rate is set to be negative.
    LR = model.net.LearningRate(
        [ITER],
        "LR",
        base_lr=-base_learning_rate,
        policy=policy,
        **other_lr_params
    )
    return LR, ITER


def _dedup(model, dedup_indices, grad):
    assert (isinstance(grad, core.GradientSlice))
    # TODO(dzhulgakov): find a better place to do deduplication
    if dedup_indices:
        return model.net.DeduplicateGradientSlices(grad)
    else:
        return grad


def build_sgd(model, base_learning_rate, policy="fixed", **other_lr_params):
    LR, _ = _build_lr(model, base_learning_rate, policy, **other_lr_params)

    ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
    for param, grad in model.GetOptimizationPairs().items():
        if isinstance(grad, core.GradientSlice):
            model.ScatterWeightedSum(
                [param, ONE, grad.indices, grad.values, LR], param
            )
        else:
            model.WeightedSum([param, ONE, grad, LR], param)


def build_ftrl(model, dedup_indices=False, engine="SIMD", **params):
    if engine == "SIMD":
        assert core.IsOperator('Ftrl_ENGINE_SIMD')
        assert core.IsOperator('SparseFtrl_ENGINE_SIMD')
    for param, grad in model.GetOptimizationPairs().items():
        # allocate additional args of the same shape as main weights
        nz = model.param_init_net.ConstantFill(
            [param],
            param + "_ftrl_nz",
            extra_shape=[2],
            value=0.0
        )
        if isinstance(grad, core.GradientSlice):
            g = _dedup(model, dedup_indices, grad)
            model.SparseFtrl([param, nz, g.indices, g.values],
                             [param, nz], engine=engine, **params)
        else:
            model.Ftrl([param, nz, grad], [param, nz], engine=engine, **params)


def build_adagrad(model, base_learning_rate, dedup_indices=False,
                  parameters=None, **params):
    LR, _ = _build_lr(model, base_learning_rate, policy="fixed")
    param_to_grad = model.GetOptimizationPairs(parameters)

    for param, grad in param_to_grad.items():
        # allocate additional args of the same shape as main weights
        moment = model.param_init_net.ConstantFill(
            [param],
            param + "_square_sum",
            value=0.0
        )
        if isinstance(grad, core.GradientSlice):
            g = _dedup(model, dedup_indices, grad)
            model.SparseAdagrad(
                [param, moment, g.indices, g.values, LR], [param, moment],
                **params
            )

        else:
            model.Adagrad([param, moment, grad, LR], [param, moment], **params)


def build_adam(model, base_learning_rate, dedup_indices=False, iter_val=0,
               **params):
    LR, ITER = _build_lr(model, base_learning_rate, policy="fixed",
                         iter_val=iter_val)
    for param, grad in model.GetOptimizationPairs().items():
        # allocate additional args of the same shape as main weights
        # TODO(nvivek): Fuse input moments if perf critical.
        # Currently keeping it separate to keep the math cleaner
        m1 = model.param_init_net.ConstantFill(
            [param],
            param + "_first_moment",
            value=0.0
        )
        m2 = model.param_init_net.ConstantFill(
            [param],
            param + "_second_moment",
            value=0.0
        )
        if isinstance(grad, core.GradientSlice):
            g = _dedup(model, dedup_indices, grad)
            model.SparseAdam(
                [param, m1, m2, g.indices, g.values, LR, ITER], [param, m1, m2],
                **params
            )

        else:
            model.Adam([param, m1, m2, grad, LR, ITER], [param, m1, m2],
                       **params)
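For reference, a rough usage sketch of the builders defined above. It assumes `model` is a model helper exposing param_init_net, net, and GetOptimizationPairs() (as this module uses) with gradients already added to the net; the parameter names and schedule values are purely illustrative.

# Hypothetical usage sketch, not part of this file.
# `model` is assumed to be a model helper with param_init_net, net, and
# GetOptimizationPairs(), with gradients already added to the net.

# Plain SGD; extra kwargs (stepsize, gamma) are forwarded to the
# LearningRate operator through **other_lr_params.
build_sgd(model, base_learning_rate=0.1, policy="step", stepsize=1000, gamma=0.999)

# Alternatively, AdaGrad on a subset of parameters (names illustrative),
# deduplicating sparse gradient slices before the update:
# build_adagrad(model, base_learning_rate=0.01, dedup_indices=True,
#               parameters=["fc_w", "fc_b"], epsilon=1e-4)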