Summary: (Work in progress). This diff allows shifting activations to other GPUs in case the model does not fit into memory. To see the API, check the code in data_parallel_model_test, which tests shifting two activations from gpus 0 and 1 to gpu 4, and from gpus 2 and 3 to gpu 5. I will need to further test on ResNets, and probably add copy operations to handle device change points.

Reviewed By: asaadaldien

Differential Revision: D5591674

fbshipit-source-id: eb12d23651a56d64fa4db91090c6474218705270
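A minimal sketch of the call pattern described above, based on the functions defined in this module; the `model` variable is assumed to have been built with data_parallel_model over gpus 0-5, as in the referenced test:

    from caffe2.python import data_parallel_model_utils

    # Collect activation blob names (without the gpu_x/ prefix).
    activations = data_parallel_model_utils.GetActivationBlobs(model)

    # Shift activations from gpus 0 and 1 to gpu 4, and from gpus 2 and 3 to gpu 5.
    data_parallel_model_utils.ShiftActivationDevices(
        model, activations, shifts={0: 4, 1: 4, 2: 5, 3: 5}
    )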
110 lines
4.3 KiB
Python
# Copyright (c) 2016-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################

## @package data_parallel_model_utils
# Module caffe2.python.data_parallel_model_utils
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from future.utils import viewitems, viewkeys, viewvalues

import logging

from caffe2.python import core
from caffe2.python.data_parallel_model import stripBlobName

log = logging.getLogger("data_parallel_model_utils")
log.setLevel(logging.INFO)


def GetActivationBlobs(model):
    # Hacky way to get activations, think of a better way
    activations = []
    first_gpu_prefix = "{}_{}/".format(model._device_prefix, model._devices[0])

    all_inputs = set()
    for op in model.net.Proto().op:
        for inp in op.input:
            all_inputs.add(inp)

    params = set(model.GetParams(''))

    for op in model.net.Proto().op:
        for b in op.output:
            if b.startswith(first_gpu_prefix) and not b.endswith("_grad"):
                if b in all_inputs and b not in params and b + "_grad" in all_inputs:
                    activations.append(stripBlobName(b))
    return activations


def _ShiftActivationDevices(model, activations, from_device, to_device):
    prefix = "{}_{}/".format(model._device_prefix, from_device)
    activations = set([prefix + a for a in activations])
    all_activations = set([prefix + a for a in GetActivationBlobs(model)])
    ops = list(op for op in model.net.Proto().op if
               op.device_option.cuda_gpu_id == from_device)
    device_mapping = {a: to_device for a in activations}
    device_mapping.update({b: from_device for b in all_activations if
                           b not in activations})

    # Assign each blob to a device in a label-propagation manner: activation
    # blobs take precedence over other mapped blobs, and if an op touches
    # several activations, its output activations determine the device.
    for op in ops:
        op_device = None
        for b in list(op.input) + list(op.output):
            if b in device_mapping:
                if b in all_activations or op_device is None:
                    op_device = device_mapping[b]
        if op_device is None:
            op_device = op.device_option.cuda_gpu_id
        for b in list(op.input) + list(op.output):
            if b not in device_mapping and b.startswith(prefix):
                device_mapping[b] = op_device
        op.device_option.cuda_gpu_id = op_device

    # Change param_init_net accordingly
    for op in model.param_init_net.Proto().op:
        if op.output[0] in device_mapping:
            op.device_option.cuda_gpu_id = device_mapping[op.output[0]]


def ShiftActivationDevices(model, activations, shifts):
    '''
    Function to enable simple model-parallelism for data_parallel_model
    models. 'shifts' is a dictionary from_gpu -> to_gpu, and 'activations' is
    a list of activation blobs (without the gpu_x/ prefix -- use
    GetActivationBlobs()).

    Operators handling these activations are shifted to the gpu declared in
    'shifts'. Related operators, such as gradient operators, are moved as
    well, and appropriate copy-ops are inserted.

    This allows shifting memory usage from one gpu to another, enabling bigger
    models to be trained.
    '''
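    # Illustrative example (not from the original source): with
    # activations = ['fc1', 'fc2'] (names without the "gpu_x/" prefix, as
    # returned by GetActivationBlobs) and shifts = {0: 4, 1: 4}, the operators
    # touching gpu_0/fc1 and gpu_0/fc2 are moved to gpu 4, and those touching
    # gpu_1/fc1 and gpu_1/fc2 are moved to gpu 4 as well. A gpu id must not
    # appear as both a key and a value in 'shifts'; the assert below checks this.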
    assert set(viewvalues(shifts)).intersection(set(viewkeys(shifts))) == set()
    for from_device, to_device in viewitems(shifts):
        log.info(
            "Shifting {} activations from {} --> {}".
            format(len(activations), from_device, to_device)
        )
        _ShiftActivationDevices(model, activations, from_device, to_device)

    param_init_net, blob_to_device = core.InjectCrossDeviceCopies(model.param_init_net)
    net, _blob_to_device = core.InjectCrossDeviceCopies(model.net, blob_to_device)
    model.param_init_net = param_init_net
    model.net = net