pytorch/caffe2/python/data_parallel_model_utils.py
## @package data_parallel_model_utils
# Module caffe2.python.data_parallel_model_utils
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from future.utils import viewitems, viewkeys, viewvalues

import logging

from caffe2.python import core
from caffe2.python.data_parallel_model import stripBlobName

log = logging.getLogger("data_parallel_model_utils")
log.setLevel(logging.INFO)


def GetActivationBlobs(model):
    # Hacky way to get activations, think of a better way
    activations = []
    first_gpu_prefix = "{}_{}/".format(model._device_prefix, model._devices[0])

    all_inputs = set()
    for op in model.net.Proto().op:
        for inp in op.input:
            all_inputs.add(inp)

    params = set(model.GetParams(''))

    for op in model.net.Proto().op:
        for b in op.output:
            if b.startswith(first_gpu_prefix) and not b.endswith("_grad"):
                if b in all_inputs and b not in params and b + "_grad" in all_inputs:
                    activations.append(stripBlobName(b))
    return activations
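
# Note: GetActivationBlobs returns unscoped blob names (stripBlobName removes
# the "gpu_0/"-style device prefix), e.g. ["conv1", "pool1", "fc3"] -- these
# example names are hypothetical and depend on the model being inspected.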


def _ShiftActivationDevices(model, activations, from_device, to_device):
    prefix = "{}_{}/".format(model._device_prefix, from_device)
    activations = set([prefix + a for a in activations])
    all_activations = set([prefix + a for a in GetActivationBlobs(model)])
    ops = list(op for op in model.net.Proto().op if
               op.device_option.cuda_gpu_id == from_device)
    device_mapping = {a: to_device for a in activations}
    device_mapping.update({b: from_device for b in all_activations if
                           b not in activations})

    # Assign each blob to a device in a label-propagation manner: activation
    # blobs override other blobs, and if an op touches several activations,
    # its output activations determine the op's device.
    for op in ops:
        op_device = None
        for b in list(op.input) + list(op.output):
            if b in device_mapping:
                if b in all_activations or op_device is None:
                    op_device = device_mapping[b]
        if op_device is None:
            op_device = op.device_option.cuda_gpu_id
        for b in list(op.input) + list(op.output):
            if b not in device_mapping and b.startswith(prefix):
                device_mapping[b] = op_device
        op.device_option.cuda_gpu_id = op_device

    # Change param_init_net accordingly
    for op in model.param_init_net.Proto().op:
        if op.output[0] in device_mapping:
            op.device_option.cuda_gpu_id = device_mapping[op.output[0]]


def ShiftActivationDevices(model, activations, shifts):
    '''
    Function to enable simple model parallelism for data_parallel_model
    models. 'shifts' is a dictionary from_gpu -> to_gpu, and 'activations' is
    a list of activation blobs (without the gpu_x/ prefix -- use
    GetActivationBlobs()). Operators handling these activations are shifted
    to the gpu declared in 'shifts'. Related operators, such as gradient
    operators, are moved as well, and appropriate copy-ops are inserted.
    This allows shifting memory usage from one gpu to another, enabling
    bigger models to be trained.
    '''
    # A device must not appear both as a source and as a destination of a shift.
    assert set(viewvalues(shifts)).intersection(set(viewkeys(shifts))) == set()
    for from_device, to_device in viewitems(shifts):
        log.info(
            "Shifting {} activations from {} --> {}".
            format(len(activations), from_device, to_device)
        )
        _ShiftActivationDevices(model, activations, from_device, to_device)

    param_init_net, blob_to_device = core.InjectCrossDeviceCopies(model.param_init_net)
    net, _blob_to_device = core.InjectCrossDeviceCopies(model.net, blob_to_device)
    model.param_init_net = param_init_net
    model.net = net
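

# Example usage, as a minimal sketch: "train_model" is a hypothetical model
# built with data_parallel_model (e.g. Parallelize_GPU over gpus [0, 1]), and
# the gpu ids in the 'shifts' dictionary are placeholders.
#
#   activations = GetActivationBlobs(train_model)
#   # Move these activations (and the ops producing/consuming them, including
#   # their gradient ops) from gpu 0 to gpu 1; the required cross-device
#   # copy-ops are inserted by ShiftActivationDevices itself.
#   ShiftActivationDevices(train_model, activations, {0: 1})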