# pytorch/caffe2/python/core.py
import atexit
import sys
try:
from .libcaffe2_python import *
except ImportError as e:
print(str(e))
print('Pycaffe is not available. Exiting.')
sys.exit(1)
# libcaffe2_python contains a global Workspace that we need to properly delete
# when exiting. Otherwise, cudart will cause segfaults sometimes.
atexit.register(OnModuleExit)
from caffe2.proto import caffe2_pb2
from collections import Counter, defaultdict
from caffe2.python import utils, workspace
_REGISTERED_OPERATORS = set(
s.decode() for s in workspace.RegisteredOperators())
def IsOperator(op_type):
return (op_type in _REGISTERED_OPERATORS)
def GetGradientName(name):
"""The function that returns the gradient name for a blob."""
return name + '_grad'
def IsGradientName(name):
return name.endswith('_grad')
def GetOriginalName(name):
"""THe function that returns the original name for a gradient blob."""
if not name.endswith('_grad'):
raise RuntimeError('The blob ' + name + ' is not a legal gradient name.')
return name[:-5]
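# A small illustrative sketch (not part of the library) of the naming
# convention implemented by the three helpers above; the blob name 'fc1_w' is
# just an assumed example:
#
#   GetGradientName('fc1_w')       -> 'fc1_w_grad'
#   IsGradientName('fc1_w_grad')   -> True
#   GetOriginalName('fc1_w_grad')  -> 'fc1_w'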
class BlobReference(object):
"""A wrapper around a blob in a net.
BlobReference gives us a way to refer to the network that the blob is
generated from. Note that blobs are, essentially, just strings in the current
workspace.
"""
def __init__(self, name, net):
self._name = name
self._from_net = net
        # meta allows helper functions to attach whatever meta-information
        # they need to this blob.
self.meta = {}
def __str__(self):
return self._name
def __add__(self, other):
if not isinstance(other, str):
raise RuntimeError('Cannot add BlobReference to a non-string.')
return self._name + other
def Net(self):
return self._from_net
def Grad(self):
return GetGradientName(self._name)
def __getattr__(self, op_type):
"""A wrapper allowing one to initiate operators from a blob reference.
Example: for a blob reference b that comes from network n, doing
b.Relu(...)
is equivalent to doing
            n.Relu([b], ...)
"""
if not IsOperator(op_type):
raise RuntimeError(
'Method ' + op_type + ' is not a registered operator.')
        def _CreateAndAddToNet(inputs=None, *args, **kwargs):
            """Internal function that routes the operator generation to the
            network's __getattr__ function.
            """
            if inputs is None:
                inputs = []
            elif isinstance(inputs, (BlobReference, str)):
                inputs = [inputs]
            # Copy the inputs so we do not mutate the caller's list, and add
            # self as the first input.
            inputs = [self] + list(inputs)
return self._from_net.__getattr__(op_type)(inputs, *args, **kwargs)
return _CreateAndAddToNet
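# A hedged usage sketch of the blob-level operator sugar above (illustrative
# only; the 'ConstantFill' and 'Relu' operator types, their arguments, and the
# blob names are assumptions about what is registered in a given build):
#
#   net = Net('example')
#   hidden = net.ConstantFill([], 'hidden', shape=[1], value=1.0)
#   out = hidden.Relu([], 'out')   # equivalent to net.Relu([hidden], 'out')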
def CreateOperator(operator_type):
"""A function wrapper that allows one to create operators based on the
operator type. The type should be a string corresponding to an operator
registered with Caffe2.
"""
def ReallyCreate(inputs, outputs, name='', device_option=None,
arg=None, engine=None, **kwargs):
operator = caffe2_pb2.OperatorDef()
operator.type = operator_type
operator.name = name
if type(inputs) is str or type(inputs) is BlobReference:
inputs = [inputs]
elif type(inputs) is not list:
raise ValueError("Unknown input format: %s of type %s."
% (str(inputs), type(inputs)))
if type(outputs) is str or type(outputs) is BlobReference:
outputs = [outputs]
elif type(outputs) is not list:
raise ValueError("Unknown output format: %s of type %s."
% (str(outputs), type(outputs)))
operator.input.extend([str(i) for i in inputs])
operator.output.extend([str(o) for o in outputs])
if device_option is not None:
operator.device_option.CopyFrom(device_option)
if engine is not None:
operator.engine = engine
        # The random seed lives in the device option, so it needs special
        # handling here.
if 'random_seed' in kwargs:
operator.device_option.random_seed = kwargs['random_seed']
del kwargs['random_seed']
# Add given arguments that do not need parsing
if arg is not None:
operator.arg.extend(arg)
# Add all other arguments
for key, value in kwargs.items():
operator.arg.add().CopyFrom(utils.MakeArgument(key, value))
return operator
return ReallyCreate
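# A minimal sketch of how the CreateOperator wrapper above is used
# (illustrative only; the 'Relu' operator type and the 'X'/'Y' blob names are
# assumptions):
#
#   op = CreateOperator('Relu')(['X'], ['Y'], name='my_relu')
#   # op is a caffe2_pb2.OperatorDef with type 'Relu', input ['X'] and
#   # output ['Y'].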
class GradientRegistry(object):
"""GradientRegistry holds the mapping from operators to their gradients."""
gradient_registry_ = {}
@classmethod
def RegisterGradient(cls, op_type):
"""A decorator for registering gradient mappings."""
def Wrapper(func):
cls.gradient_registry_[op_type] = func
return func
return Wrapper
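    # A hedged sketch of registering a Python-side gradient with the decorator
    # above (illustrative only; 'MyOp' and 'MyOpGradient' are hypothetical
    # operator types, and the input/output layout is an assumption):
    #
    #   @GradientRegistry.RegisterGradient('MyOp')
    #   def AddMyOpGradient(op):
    #       return CreateOperator('MyOpGradient')(
    #           list(op.input) + [GetGradientName(op.output[0])],
    #           [GetGradientName(op.input[0])])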
@classmethod
def GetGradientDefsCC(cls, op_def):
grad_defs_str = cc_GetGradientDefs(op_def.SerializeToString())
grad_defs = []
for grad_def_str in grad_defs_str:
            grad_def = caffe2_pb2.OperatorDef()
grad_def.ParseFromString(grad_def_str)
grad_defs.append(grad_def)
return grad_defs
@classmethod
def GetGradientDefs(cls, op):
try:
gradient_ops = cls.GetGradientDefsCC(op)
        except Exception:
# Not supported in C++; will try python registration next.
try:
gradient_ops = cls.gradient_registry_[op.type](op)
except KeyError as err:
raise KeyError('No gradient registered for op: %s' % op.type)
if gradient_ops is None:
return []
if type(gradient_ops) is not list:
gradient_ops = [gradient_ops]
if op.HasField("device_option"):
for gradient_op in gradient_ops:
gradient_op.device_option.CopyFrom(op.device_option)
if op.HasField("engine"):
for gradient_op in gradient_ops:
gradient_op.engine = op.engine
return gradient_ops
@classmethod
    def GetBackwardPass(cls, operators, external_gradients=()):
# (1) "Play" the forward pass of the network, so we know the version of any
# tensors that are being written multiple times.
# After running this, we will have:
# a) fwd_metadata: a list of [op, input_versions, output_versions]
# recording the input and the output version of the operator.
        # b) versioned_input_count: a dictionary specifying, for each blob and
        #    each of its versions, how many times it is used as input by
        #    another op.
        # c) current_versions: the current versions of the tensors we have in
        #    the workspace. This is useful because if a gradient tries to
        #    access an earlier version of a blob, we know that it is no longer
        #    there, and thus it should not be referred to at all.
current_versions = defaultdict(int)
versioned_input_count = defaultdict(lambda: defaultdict(int))
fwd_metadata = []
for op in operators:
# For input, they are the current version in the dict.
input_versions = dict()
for s in op.input:
input_versions[s] = current_versions[s]
versioned_input_count[s][current_versions[s]] += 1
# For output, they are the current version plus one. If this is a newly
# created blob, its version starts with zero.
output_versions = dict()
for s in op.output:
if s in current_versions:
current_versions[s] += 1
output_versions[s] = current_versions[s]
fwd_metadata.append([op, input_versions, output_versions])
        # (2) Now, after the virtual play above, we play the operators
        # backwards, creating the gradients along the path. Note that although
        # we are playing it backwards, any value that gets overwritten cannot
        # be recovered, and any reference to a blob that has already been
        # overwritten will trigger an error.
all_gradient_ops = []
current_gradient_versions = dict(
(s, current_versions[GetOriginalName(s)]) for s in external_gradients)
versioned_gradient_count = defaultdict(lambda: defaultdict(int))
for forward_op, current_fwd_metadata in zip(operators[::-1], fwd_metadata[::-1]):
gradient_ops = cls.GetGradientDefs(forward_op)
# Now, the constraints for the inputs of the gradient operators are:
#
# (1) for inputs:
# (1a) If it is a gradient name, it should match the version of the
# corresponding output.
# (1b) If it is an output name, the current version should match the
# version when the operator was run.
# (1c) If it is an input name, the current version should match the
# version when the operator was run.
# (1d) If it is none of the above, it should be a blob that is generated
# locally by one of the previous gradient operators.
#
# (2) for outputs:
            # (2a) If it is a gradient name, it must be the gradient name of an
            # input blob, and we will mark the gradient as corresponding to the
            # version of that input.
# (2b) If it is anything else it is OK - we will simply "play" the op to
# update the current versions of blobs.
locally_generated_blobs = []
multiuse_input_ready_to_sum = []
for grad_op in gradient_ops:
for s in grad_op.input:
if IsGradientName(s):
if s not in current_gradient_versions:
raise RuntimeError(
'Input gradient name "%s" is referred to but '
'is never generated.' % s)
# This is a gradient name. We will need to check if this gradient is
# produced already, and if this is the gradient we want.
original_name = GetOriginalName(s)
if original_name not in current_fwd_metadata[2]:
raise RuntimeError(
'Input gradient name "%s" is not the gradient '
'of any of the op\'s output.' % s)
if (current_fwd_metadata[2][original_name] !=
current_gradient_versions[s]):
                            raise RuntimeError(
                                'Gradient name "%s" is expected to correspond '
                                'to version %d of "%s", but currently we have '
                                'version %d.' %
                                (s, current_fwd_metadata[2][original_name],
                                 original_name, current_gradient_versions[s]))
elif s in current_fwd_metadata[2]:
if (current_versions[s] != current_fwd_metadata[2][s]):
raise RuntimeError(
'Gradient operator needs output "%s" at version '
'%d, but currently we have version %d.' %
(s, current_fwd_metadata[2][s], current_versions[s]))
elif s in current_fwd_metadata[1]:
if (current_versions[s] != current_fwd_metadata[1][s]):
raise RuntimeError(
'Gradient operator needs input "%s" at version '
'%d, but currently we have version %d.' %
(s, current_fwd_metadata[1][s], current_versions[s]))
                    elif s not in locally_generated_blobs:
                        raise RuntimeError(
                            'Blob name "%s" not in the scope of operator: %s\n'
                            'and is not generated by any of the local gradient '
                            'operators.' %
                            (s, str(current_fwd_metadata[0])))
for idx, s in enumerate(grad_op.output):
if IsGradientName(s):
original_name = GetOriginalName(s)
if original_name not in current_fwd_metadata[1]:
raise RuntimeError(
'Output gradient name "%s" is not the '
'gradient of any of the op\'s input name.' % s)
# Set the current gradient version.
version = current_fwd_metadata[1][original_name]
current_gradient_versions[s] = version
                        # Now we should also check if the gradient we produce
                        # is the gradient of a multi-use input, in which case
                        # we will automatically add split nodes.
                        # TODO: Instead of adding split nodes, we can also
                        # choose to sequentially compute and accumulate
                        # gradients. Maybe implement that in the future.
if versioned_input_count[original_name][version] > 1:
grad_op.output[idx] = '_%s_autosplit_%d' % (
s, versioned_gradient_count[s][version])
versioned_gradient_count[s][version] += 1
assert (versioned_gradient_count[s][version] <=
versioned_input_count[original_name][version])
if (versioned_gradient_count[s][version] ==
versioned_input_count[original_name][version]):
# We have calculated all the autosplit gradients. Will need to
# add a sum after this gradient computation.
multiuse_input_ready_to_sum.append(
(s, versioned_gradient_count[s][version], grad_op))
else:
locally_generated_blobs.append(s)
            # If some of the multi-use inputs are ready to be summed, we do so.
for s, count, source_op in multiuse_input_ready_to_sum:
additional_sum_op = CreateOperator('Sum')(
['_%s_autosplit_%d' % (s, i) for i in range(count)], [s])
if source_op.HasField('device_option'):
additional_sum_op.device_option.CopyFrom(source_op.device_option)
gradient_ops.append(additional_sum_op)
# Now, for bookkeeping purposes, we will need to "play" the gradient
# operators. The reason is that the gradient operators may (although in
# most cases they shouldn't) change some of the existing blobs, in which
# case this explicit bookkeeping is going to catch them.
for op in gradient_ops:
for s in op.output:
if s in current_versions:
current_versions[s] += 1
output_versions[s] = current_versions[s]
all_gradient_ops += gradient_ops
# After we have done computation for each op, we now have the gradient
# operators ready.
return all_gradient_ops
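# An illustrative sketch (not library code) of the auto-split handling in
# GetBackwardPass above, assuming a forward blob 'x' that is consumed by two
# operators:
#
#   the gradient of the first consumer writes  '_x_grad_autosplit_0'
#   the gradient of the second consumer writes '_x_grad_autosplit_1'
#   an appended Sum operator then accumulates them:
#       x_grad = Sum(_x_grad_autosplit_0, _x_grad_autosplit_1)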
class Net(object):
operator_registry_ = {}
def __init__(self, name):
if type(name) is caffe2_pb2.NetDef:
            # We are initializing a network from a NetDef. In this case, we
            # will initialize our network with the given NetDef.
self._net = caffe2_pb2.NetDef()
self._net.CopyFrom(name)
# Set the next name index properly.
existing_names = set(
sum([list(op.input) for op in self._net.op], []) +
sum([list(op.output) for op in self._net.op], []))
prefix_len = len(self._net.name + '_blob_')
            autogen_indices = [int(b[prefix_len:]) for b in existing_names
                               if b.startswith(self._net.name + '_blob_')]
if len(autogen_indices):
self._next_name_index = max(autogen_indices) + 1
else:
self._next_name_index = 0
else:
self._net = caffe2_pb2.NetDef()
self._net.name = name
self._next_name_index = 0
def __str__(self):
return self._net.name
def Proto(self):
return self._net
def NextName(self):
"""Returns the next name to be used, if you do not want to explicitly
name your blob."""
output_name = self._net.name + '_blob_' + str(self._next_name_index)
self._next_name_index += 1
return str(output_name)
    def AddGradientOperators(self, skip=0, external_gradients=None):
        """Add the gradient operators for the operators in the net.

        Inputs:
          skip: skips the first n operators. This is provided mainly because
            many nets use their first few operators for things like data
            loading, which do not need gradients.
          external_gradients: a list of gradient blob names that are provided
            externally (for example, the gradient of the loss blob).
        Currently, this is hard-coded to float operators if there are branches
        (i.e. a blob is used as input to multiple operators), because the Sum
        operator inserted to accumulate the auto-split gradients only supports
        float. Supporting other data types is a todo item.
        """
        if external_gradients is None:
            external_gradients = []
        grad_ops = GradientRegistry.GetBackwardPass(
            self._net.op[skip:], external_gradients)
        self._net.op.extend(grad_ops)
        return
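    # A hedged usage sketch for AddGradientOperators (illustrative only; the
    # 'FC' and 'AveragedLoss' operator types, the blob names, and the
    # availability of their gradients are assumptions about a given build):
    #
    #   net = Net('train')
    #   pred = net.FC(['data', 'w', 'b'], 'pred')
    #   loss = pred.AveragedLoss([], 'loss')
    #   net.AddGradientOperators(
    #       external_gradients=[GetGradientName('loss')])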
def RunAllOnGPU(self, gpu_id=0, use_cudnn=False):
"""A convenient function to run everything on the GPU."""
device_option = caffe2_pb2.DeviceOption()
device_option.device_type = caffe2_pb2.CUDA
device_option.cuda_gpu_id = gpu_id
self._net.device_option.CopyFrom(device_option)
if use_cudnn:
for op in self._net.op:
op.engine = "CUDNN"
def __getattr__(self, op_type):
if not IsOperator(op_type):
raise RuntimeError(
'Method ' + op_type + ' is not a registered operator.')
def _CreateAndAddToSelf(inputs, outputs=None, **kwargs):
            if outputs is None:
                # If we do not specify an output, we assume that this operator
                # produces a single output with an auto-generated name.
outputs = self.NextName()
elif type(outputs) is int:
# In this case, we will auto-fill the given number of outputs with
# auto-generated names.
outputs = [self.NextName() for i in range(outputs)]
op = CreateOperator(op_type)(inputs, outputs, **kwargs)
self._net.op.extend([op])
if len(op.output) == 0:
return
elif len(op.output) == 1:
return BlobReference(str(op.output[0]), self)
else:
return tuple(BlobReference(str(o), self) for o in op.output)
return _CreateAndAddToSelf
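# A small hedged sketch of the Net-level operator sugar above (illustrative
# only; the 'GaussianFill' and 'Split' operator types and their arguments are
# assumptions):
#
#   net = Net('example')
#   x = net.GaussianFill([], 'x', shape=[2, 4], mean=0.0, std=1.0)
#   # Passing an int for outputs asks for that many auto-named output blobs:
#   a, b = net.Split([x], 2, axis=1)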
class ExecutionStep(object):
    def __init__(self, name, nets=None, num_iter=None):
        self._step = caffe2_pb2.ExecutionStep()
        self._step.name = name
        if nets is not None:
            if type(nets) is Net:
                nets = [nets]
            self._step.network.extend([str(n) for n in nets])
        if num_iter is not None:
            self._step.num_iter = num_iter
def __str__(self):
return self._step.name
def Proto(self):
return self._step
def SetIter(self, num_iter):
self._step.num_iter = num_iter
def AddSubstep(self, substep):
self._step.substep.add().CopyFrom(substep)
def AddNet(self, net):
        self._step.network.append(str(net))
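# A hedged usage sketch for ExecutionStep (illustrative only; the names are
# assumptions):
#
#   train_net = Net('train')
#   step = ExecutionStep('train_step', train_net, num_iter=100)
#   # or built up incrementally:
#   step = ExecutionStep('train_step')
#   step.AddNet(train_net)
#   step.SetIter(100)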
class Plan(object):
def __init__(self, name):
self._plan = caffe2_pb2.PlanDef()
self._plan.name = name
def __str__(self):
return self._plan.name
def Proto(self):
return self._plan
def AddNets(self, nets):
for net in nets:
self._plan.network.add().CopyFrom(net.Proto())
def AddStep(self, step):
self._plan.execution_step.add().CopyFrom(step.Proto())
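# A hedged end-to-end sketch of assembling and running a Plan (illustrative
# only; the nets are left empty here, and workspace.RunPlan is assumed to be
# the runner exposed by caffe2.python.workspace):
#
#   init_net = Net('init')
#   train_net = Net('train')
#   plan = Plan('example_plan')
#   plan.AddNets([init_net, train_net])
#   plan.AddStep(ExecutionStep('init_step', init_net))
#   plan.AddStep(ExecutionStep('train_step', train_net, num_iter=100))
#   workspace.RunPlan(plan.Proto().SerializeToString())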