from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from types import FunctionType
from functools import wraps
import six

from caffe2.python import cnn, dyndep, scope, workspace, core
from caffe2.proto import caffe2_pb2

dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nccl:nccl_ops")
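

# CNNModelHelper methods listed here are wrapped by _GPUDataParallelMetaClass
# below so that a single call is replicated once per GPU.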
DATAPARALLEL_OPS = [
    "Conv",
    "ConvTranspose",
    "GroupConv",
    "FC",
    "FC_Decomp",
    "FC_Prune",
    "FC_Sparse",
    "LRN",
    "Dropout",
    "MaxPool",
    "AveragePool",
    "Concat",
    "DepthConcat",
    "Relu",
    "Transpose",
    "SpatialBN",
    "Accuracy",
    "Adam",
    "AveragedLoss",
    "Cast",
    "LabelCrossEntropy",
    "LearningRate",
    "Print",
    "Scale",
    "Snapshot",
    "Softmax",
    "StopGradient",
    "Summarize",
    "Sum",
    "Tanh",
    "WeightedSum",
    "SquaredL2Distance",
]

class _GPUDataParallelMetaClass(type):
    """A metaclass that patches methods in order to distribute them over
    multiple GPUs.
    """
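
    # For illustration: a wrapped op such as Relu runs once per device under
    # a "gpu_<id>" name scope and CUDA device scope, and returns a dict keyed
    # by GPU id, e.g. with devices [0, 1]:
    #   model.Relu("data", "relu")  ->  {0: "gpu_0/relu", 1: "gpu_1/relu"}
    # Inputs and outputs may themselves be dicts keyed by GPU id to supply
    # per-device blobs (see self_or_item below).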

    _devices = []

    @staticmethod
    def _data_parallel_wrapper(op):
        @wraps(op)
        def wrapped(cls, blob_in, blob_out, *args, **kwargs):
            # Helpers to extract a device specific blob or a global blob
            def self_or_item(d, key):
                if isinstance(d, dict):
                    assert key in d
                    return d[key]
                return d

            def get_input(gpu_id):
                if isinstance(blob_in, list):
                    return [self_or_item(blob, gpu_id) for blob in blob_in]
                return self_or_item(blob_in, gpu_id)

            def get_output(gpu_id):
                return self_or_item(blob_out, gpu_id)

            # If we have an explicit device scope, we do not parallelize
            if cls.explicit_scope():
                return op(
                    cls,
                    blob_in,
                    blob_out,
                    *args,
                    **kwargs)

            devices = _GPUDataParallelMetaClass._devices
            results = {}
            for gpu_id in devices:
                with core.NameScope("gpu_{}".format(gpu_id)):
                    device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
                    with core.DeviceScope(device):
                        result = op(
                            cls,
                            get_input(gpu_id),
                            get_output(gpu_id),
                            *args,
                            **kwargs)
                        results[gpu_id] = result
            return results

        return wrapped

    def __new__(meta, classname, bases, class_dict):
        assert len(bases) == 1, "Expects only one base class"
        base = bases[0]
        assert base is cnn.CNNModelHelper, "Base class should be CNNModelHelper"
        new_class_dict = {}
        for name, attr in base.__dict__.items():
            if name not in DATAPARALLEL_OPS:
                continue
            attr = _GPUDataParallelMetaClass._data_parallel_wrapper(attr)
            new_class_dict[name] = attr
        for name, attr in class_dict.items():
            if name in new_class_dict:
                continue
            if isinstance(attr, FunctionType):
                if name in DATAPARALLEL_OPS:
                    new_class_dict[name] = \
                        _GPUDataParallelMetaClass._data_parallel_wrapper(attr)
                else:
                    new_class_dict[name] = attr
        return super(_GPUDataParallelMetaClass, meta).__new__(
            meta, classname, bases, new_class_dict)


@six.add_metaclass(_GPUDataParallelMetaClass)
class GPUDataParallelModel(cnn.CNNModelHelper):
    """A helper class that extends CNNModelHelper to support multi-GPU
    data-parallel training.
    """
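
    # Rough usage sketch (illustrative only; the model-building helper and
    # blob names are hypothetical, not part of this module):
    #
    #   model = GPUDataParallelModel(devices=[0, 1], name="example")
    #   loss = build_forward_and_loss(model)   # hypothetical user function
    #   model.AddGradientOperators([loss])
    #   model.SGD(base_lr=-0.1)                # negative LR, see _BuildLR
    #   model.FinalizeSetup()
    #   workspace.RunNetOnce(model.param_init_net)
    #   workspace.CreateNet(model.net)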

    def __init__(self, devices, *args, **kwargs):
        assert len(devices) >= 1, "Should have at least 1 GPU device"
        assert len(devices) <= workspace.NumCudaDevices(), \
            "Requested devices {} exceed the number of available GPUs {}".\
            format(devices, workspace.NumCudaDevices())
        _GPUDataParallelMetaClass._devices = devices
        self._devices = devices
        self._explicit_scope = False
        self._gradient_reduce_all_added = False
        self._mpi_comm = None
        super(GPUDataParallelModel, self).__init__(*args, **kwargs)

    def explicit_scope(self):
        return self._explicit_scope

    def _call(self, name, *args, **kwargs):
        return super(GPUDataParallelModel, self).__getattr__(
            name)(*args, **kwargs)

    # TODO(denisy): try out decorators to avoid this code below
    def Accuracy(self, *args, **kwargs):
        return self._call("Accuracy", *args, **kwargs)

    def Adam(self, *args, **kwargs):
        return self._call("Adam", *args, **kwargs)

    def AveragedLoss(self, *args, **kwargs):
        return self._call("AveragedLoss", *args, **kwargs)

    def Cast(self, *args, **kwargs):
        return self._call("Cast", *args, **kwargs)

    def LabelCrossEntropy(self, *args, **kwargs):
        return self._call("LabelCrossEntropy", *args, **kwargs)

    def LearningRate(self, *args, **kwargs):
        return self._call("LearningRate", *args, **kwargs)

    def Print(self, *args, **kwargs):
        return self._call("Print", *args, **kwargs)

    def Scale(self, *args, **kwargs):
        return self._call("Scale", *args, **kwargs)

    def Snapshot(self, *args, **kwargs):
        return self._call("Snapshot", *args, **kwargs)

    def Softmax(self, *args, **kwargs):
        return self._call("Softmax", *args, **kwargs)

    def StopGradient(self, *args, **kwargs):
        return self._call("StopGradient", *args, **kwargs)

    def Sum(self, *args, **kwargs):
        return self._call("Sum", *args, **kwargs)

    def Summarize(self, *args, **kwargs):
        return self._call("Summarize", *args, **kwargs)

    def Tanh(self, *args, **kwargs):
        return self._call("Tanh", *args, **kwargs)

    def WeightedSum(self, *args, **kwargs):
        return self._call("WeightedSum", *args, **kwargs)

    def SquaredL2Distance(self, *args, **kwargs):
        return self._call("SquaredL2Distance", *args, **kwargs)

    def SetMPIComm(self, mpi_comm):
        self._mpi_comm = mpi_comm

    def FinalizeSetup(self):
        self.param_init_net.RunAllOnGPU()
        self.RunAllOnGPU()

        # If MPI is enabled, broadcast params from the master host
        if self._mpi_comm is not None:
            self._AddMPIParameterSync()

        # Set up the sync of initial params across local GPUs
        self._SyncInitialParams()

    def AddGradientOperators(self, params, *args, **kwargs):
        def create_grad(param):
            return self.ConstantFill(param, str(param) + "_grad", value=1.0)

        param_grad = {}
        # Explicitly need to create gradients on each GPU
        for param in params:
            if not isinstance(param, dict):
                grad = create_grad(param)
                param_grad[str(param)] = str(grad)
            else:
                for gpu_id in self._devices:
                    device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
                    with core.DeviceScope(device):
                        assert gpu_id in param
                        p = param[gpu_id]
                        g = create_grad(p)
                        param_grad[str(p)] = str(g)

        return super(GPUDataParallelModel, self).AddGradientOperators(
            param_grad, *args, **kwargs)

    def AddWeightDecay(self, weight_decay):
        if weight_decay == 0.0:
            return

        assert weight_decay > 0.0

        self._explicit_scope = True
        assert self._gradient_reduce_all_added, \
            "Weight decay must be done after gradient sync between gpus"

        for gpu_id in self._devices:
            with core.NameScope("gpu_{}".format(gpu_id)):
                device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
                with core.DeviceScope(device):
                    wd = self.param_init_net.ConstantFill([], 'wd', shape=[1],
                                                          value=weight_decay)
                    ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1],
                                                           value=1.0)
                    # Only update parameters that belong to the current GPU
                    params = self._CurrentScopeParams()

                    # Take only the params that are weights
                    print("Adding weight decay for gpu {}.".format(gpu_id))

                    gpu_weights = [p for p in params if p in self.weights]
                    for w in gpu_weights:
                        # Equivalent to: grad = grad + weight_decay * w
                        grad = self.param_to_grad[w]
                        self.net.WeightedSum([grad, ONE, w, wd], grad)

        self._explicit_scope = False

    def _Broadcast(self, net, param):
        # TODO(akyrola): replace with NCCLBroadcast when it's working
        # Copy params from gpu_0 to the other GPUs
        for gpu_idx in self._devices[1:]:
            device_opt = core.DeviceOption(caffe2_pb2.CUDA, gpu_idx)
            with core.DeviceScope(device_opt):
                net.Copy(
                    "gpu_{}/{}".format(self._devices[0], param),
                    "gpu_{}/{}".format(gpu_idx, param)
                )

    def _SyncInitialParams(self):
        unique_param_names = set(
            stripParamName(p)
            for p in self.params
        )

        self._explicit_scope = True
        for param in unique_param_names:
            self._Broadcast(self.param_init_net, param)

        self._explicit_scope = False

    def _AddMPIParameterSync(self):
        # Sync from the master host
        unique_param_names = set(
            stripParamName(p)
            for p in self.params
        )

        self._explicit_scope = True

        # Should this be done in GPU 0 scope?
        for param_name in unique_param_names:
            param = "gpu_{}/{}".format(self._devices[0], param_name)
            self.param_init_net.Broadcast(
                inputs=[self._mpi_comm, param],
                outputs=[param],
                engine='MPI'
            )
        self._explicit_scope = False

    def _AllReduceGradients(self):
        self._gradient_reduce_all_added = True

        if self._mpi_comm is None:
            self._AllReduceGradientsSingleHost()
        else:
            self._AllReduceGradientsWithMPI()

    def _AllReduceGradientsWithMPI(self):
        self._explicit_scope = True
        unique_grads_names = set(
            stripParamName(grad)
            for grad in self.param_to_grad.values()
        )

        # Step 1: sum gradients from local GPUs to the master GPU
        last_out = None
        master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, self._devices[0])

        # Note: sorted order to ensure each host puts the operators in
        # the same order.
        for grad_name in sorted(unique_grads_names):
            grads_group = [
                grad
                for grad in self.param_to_grad.values()
                if stripParamName(grad) == grad_name
            ]
            master_grad = "gpu_{}/{}".format(self._devices[0], grad_name)
            assert master_grad in grads_group

            # Remark: NCCLReduce does not support in-place modifications,
            # so we need a temporary gradient blob
            reduced_grad = "gpu_{}/{}_red".format(
                self._devices[0],
                grad_name
            )

            with core.DeviceScope(master_device_opt):
                self.ConstantFill(master_grad, reduced_grad, value=0.0)
                self.net.NCCLReduce(grads_group, reduced_grad)

                # Step 2: allreduce over MPI to all hosts, between master GPUs
                self.net.Allreduce(
                    inputs=[self._mpi_comm, reduced_grad],
                    outputs=[master_grad],
                    engine='MPI',
                    control_input=None if last_out is None else [last_out],
                )
                last_out = master_grad

            # Step 3: broadcast locally
            self._Broadcast(self.net, grad_name)

        self._explicit_scope = False

    def _AllReduceGradientsSingleHost(self):
        """Performs NCCL AllReduce to distribute gradients to all the GPUs."""
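
        # After this runs, each "gpu_<id>/<param>_grad" blob is expected to
        # hold the sum of that gradient over the local GPUs (NCCL allreduce
        # sums by default); _BuildSGD scales the learning rate down to match.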

        if len(self._devices) == 1:
            return

        # Take only params that have a gradient associated with them.
        unique_grads_names = set(
            stripParamName(grad)
            for grad in self.param_to_grad.values()
        )

        # Now we need to Allreduce gradients on all the GPUs.
        # Pick GPU #0 as a master GPU.
        self._explicit_scope = True
        master_device_opt = core.DeviceOption(caffe2_pb2.CUDA, self._devices[0])
        with core.DeviceScope(master_device_opt):
            # Group the grads by name for the reduction.
            for grad_name in unique_grads_names:
                grads_group = [
                    grad
                    for grad in self.param_to_grad.values()
                    if stripParamName(grad) == grad_name
                ]
                assert len(grads_group) == len(self._devices), \
                    "Each GPU from {} should have a copy of {}.".format(
                        self._devices, grad_name)
                self.NCCLAllreduce(grads_group, grads_group)
        self._explicit_scope = False

    def _BuildLR(self, base_lr, policy="fixed", **other_lr_params):
        """A helper to create the learning rate blob."""
        ITER = self.Iter("ITER")
        # There is one interesting thing here: since we are minimizing, we are
        # doing "descent", so the learning rate is set to be negative.
        LR = self.net.LearningRate(
            [ITER],
            "LR",
            base_lr=base_lr,
            policy=policy,
            **other_lr_params
        )
        return LR

    def _BuildSGD(self, params, base_lr, policy="fixed", **other_lr_params):
        """A helper to construct gradient update for SGD."""
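        # Note: base_lr is divided by the number of devices, which compensates
        # for the allreduced gradients being sums over GPUs rather than means.
        # With ONE == 1 and a negative LR (see _BuildLR), WeightedSum computes
        #   param <- 1 * param + LR * grad
        # on each device, i.e. plain gradient descent.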
        base_lr = base_lr / len(self._devices)
        LR = self._BuildLR(base_lr, policy, **other_lr_params)
        ONE = self.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0)
        for param in params:
            grad = self.param_to_grad[param]
            if isinstance(grad, core.GradientSlice):
                self.ScatterWeightedSum(
                    [param, ONE, grad.indices, grad.values, LR], param
                )
            else:
                self.WeightedSum([param, ONE, grad, LR], param)

    def _CurrentScopeParams(self):
        return [
            param
            for param in self.param_to_grad.keys()
            if str(param).startswith(scope.NAMESCOPE)
        ]

    def SGD(self, base_lr, policy="fixed", **other_lr_params):
        """Adds an SGD optimizer to the model."""
        self._AllReduceGradients()

        # Create the parameter update operators.
        self._explicit_scope = True
        for gpu_id in self._devices:
            with core.NameScope("gpu_{}".format(gpu_id)):
                device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
                with core.DeviceScope(device):
                    # Only update parameters that belong to the current GPU
                    params = self._CurrentScopeParams()

                    # Add the optimizer update operators
                    self._BuildSGD(params, base_lr, policy, **other_lr_params)
        self._explicit_scope = False

    def CustomSGD(
        self,
        paramup_build_fn,
        base_lr,
        lr_policy,
        weight_decay,
        **other_lr_pars
    ):
        """Custom parameter update: delegates to a user-provided function."""
        self._AllReduceGradients()

        self.AddWeightDecay(weight_decay)

        # Run the parameter update on each GPU
        self._explicit_scope = True
        for gpu_id in self._devices:
            with core.NameScope("gpu_{}".format(gpu_id)):
                device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
                with core.DeviceScope(device):
                    LR = self._BuildLR(base_lr, lr_policy, **other_lr_pars)

                    params = self._CurrentScopeParams()
                    paramup_build_fn(self, params, LR)
        self._explicit_scope = False

    def ExecOnEachDevice(self, fn, *args, **kwargs):
        self._explicit_scope = True
        for gpu_id in self._devices:
            with core.NameScope("gpu_{}".format(gpu_id)):
                device = core.DeviceOption(caffe2_pb2.CUDA, gpu_id)
                with core.DeviceScope(device):
                    fn(self, *args, **kwargs)

        self._explicit_scope = False


# A helper function to extract a parameter's name
def stripParamName(param):
    # Format is "a/b/c/d" -> d
    name = str(param)
    sep = scope._NAMESCOPE_SEPARATOR
    return name[name.rindex(sep) + 1:]
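
# Example (illustrative): stripParamName("gpu_0/fc1_w") -> "fc1_w", assuming
# "/" is the namescope separator defined in caffe2.python.scope.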


def SetupMPICluster(num_replicas, role, job_path):
    from caffe2.python import mpi
    print("Initializing MPI ops library")
    dyndep.InitOpsLibrary('@/caffe2/caffe2/mpi:mpi_ops')
    print("Setting up peers")
    mpi.SetupPeers(
        replicas=int(num_replicas),
        role=role,
        job_path=job_path
    )
    print("Creating mpi_init net")
    mpi_init_net = core.Net('mpi_init')
    print("Creating common world")
    mpi_comm = mpi_init_net.CreateCommonWorld(
        inputs=[],
        outputs=['comm_world'],
        engine='MPI'
    )
    print("Running mpi_init net")
    workspace.RunNetOnce(mpi_init_net)
    print("Finished MPI setup")
    return mpi_comm
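
# Rough multi-host usage sketch (illustrative; the replica count, role and
# job path are placeholders, not values defined in this module):
#   mpi_comm = SetupMPICluster(num_replicas=2, role=..., job_path=...)
#   model = GPUDataParallelModel(devices=[0, 1], name="example")
#   model.SetMPIComm(mpi_comm)
#   ...build the model, AddGradientOperators, SGD, then FinalizeSetup()...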