import atexit import sys try: from .libcaffe2_python import * except ImportError as e: print(str(e)) print('Pycaffe is not available. Exiting.') sys.exit(1) # libcaffe2_python contains a global Workspace that we need to properly delete # when exiting. Otherwise, cudart will cause segfaults sometimes. atexit.register(OnModuleExit) from caffe2.proto import caffe2_pb2 from collections import Counter, defaultdict from caffe2.python import utils, workspace _REGISTERED_OPERATORS = set( s.decode() for s in workspace.RegisteredOperators()) def IsOperator(op_type): return (op_type in _REGISTERED_OPERATORS) def GetGradientName(name): """The function that returns the gradient name for a blob.""" return name + '_grad' def IsGradientName(name): return name.endswith('_grad') def GetOriginalName(name): """THe function that returns the original name for a gradient blob.""" if not name.endswith('_grad'): raise RuntimeError('The blob ' + name + ' is not a legal gradient name.') return name[:-5] class BlobReference(object): """A wrapper around a blob in a net. BlobReference gives us a way to refer to the network that the blob is generated from. Note that blobs are, essentially, just strings in the current workspace. """ def __init__(self, name, net): self._name = name self._from_net = net # meta allows helper functions to put whatever metainformation needed there. self.meta = {} def __str__(self): return self._name def __add__(self, other): if not isinstance(other, str): raise RuntimeError('Cannot add BlobReference to a non-string.') return self._name + other def Net(self): return self._from_net def Grad(self): return GetGradientName(self._name) def __getattr__(self, op_type): """A wrapper allowing one to initiate operators from a blob reference. Example: for a blob reference b that comes from network n, doing b.Relu(...) is equivalent to doing net.Relu([b], ...) """ if not IsOperator(op_type): raise RuntimeError( 'Method ' + op_type + ' is not a registered operator.') def _CreateAndAddToNet(inputs=[], *args, **kwargs): """Internal function that routes the operator generation to the network's __getattr__ function. """ if isinstance(inputs, BlobReference) or isinstance(inputs, str): inputs = [inputs] # add self to the input list. inputs.insert(0, self) return self._from_net.__getattr__(op_type)(inputs, *args, **kwargs) return _CreateAndAddToNet def CreateOperator(operator_type): """A function wrapper that allows one to create operators based on the operator type. The type should be a string corresponding to an operator registered with Caffe2. """ def ReallyCreate(inputs, outputs, name='', device_option=None, arg=None, engine=None, **kwargs): operator = caffe2_pb2.OperatorDef() operator.type = operator_type operator.name = name if type(inputs) is str or type(inputs) is BlobReference: inputs = [inputs] elif type(inputs) is not list: raise ValueError("Unknown input format: %s of type %s." % (str(inputs), type(inputs))) if type(outputs) is str or type(outputs) is BlobReference: outputs = [outputs] elif type(outputs) is not list: raise ValueError("Unknown output format: %s of type %s." % (str(outputs), type(outputs))) operator.input.extend([str(i) for i in inputs]) operator.output.extend([str(o) for o in outputs]) if device_option is not None: operator.device_option.CopyFrom(device_option) if engine is not None: operator.engine = engine # random seed is defined in the device option, so we need to do special # care. if 'random_seed' in kwargs: operator.device_option.random_seed = kwargs['random_seed'] del kwargs['random_seed'] # Add given arguments that do not need parsing if arg is not None: operator.arg.extend(arg) # Add all other arguments for key, value in kwargs.items(): operator.arg.add().CopyFrom(utils.MakeArgument(key, value)) return operator return ReallyCreate class GradientRegistry(object): """GradientRegistry holds the mapping from operators to their gradients.""" gradient_registry_ = {} @classmethod def RegisterGradient(cls, op_type): """A decorator for registering gradient mappings.""" def Wrapper(func): cls.gradient_registry_[op_type] = func return func return Wrapper @classmethod def GetGradientDefsCC(cls, op_def): grad_defs_str = cc_GetGradientDefs(op_def.SerializeToString()) grad_defs = [] for grad_def_str in grad_defs_str: grad_def = caffe2_pb2.OperatorDef(); grad_def.ParseFromString(grad_def_str) grad_defs.append(grad_def) return grad_defs @classmethod def GetGradientDefs(cls, op): try: gradient_ops = cls.GetGradientDefsCC(op) except: # Not supported in C++; will try python registration next. try: gradient_ops = cls.gradient_registry_[op.type](op) except KeyError as err: raise KeyError('No gradient registered for op: %s' % op.type) if gradient_ops is None: return [] if type(gradient_ops) is not list: gradient_ops = [gradient_ops] if op.HasField("device_option"): for gradient_op in gradient_ops: gradient_op.device_option.CopyFrom(op.device_option) if op.HasField("engine"): for gradient_op in gradient_ops: gradient_op.engine = op.engine return gradient_ops @classmethod def GetBackwardPass(cls, operators, external_gradients=[]): # (1) "Play" the forward pass of the network, so we know the version of any # tensors that are being written multiple times. # After running this, we will have: # a) fwd_metadata: a list of [op, input_versions, output_versions] # recording the input and the output version of the operator. # b) versioned_input_count: a dictionary specifying for each blob and each # of its version, how many times it is used as input for another op. # c) current_versions: maintaining the current versions of the tensors we # are having in the workspace. This is useful because if a gradient is # trying to access an earlier version of a blob, we know that it is no # longer there, and thus it should not be referred to at all. current_versions = defaultdict(int) versioned_input_count = defaultdict(lambda: defaultdict(int)) fwd_metadata = [] for op in operators: # For input, they are the current version in the dict. input_versions = dict() for s in op.input: input_versions[s] = current_versions[s] versioned_input_count[s][current_versions[s]] += 1 # For output, they are the current version plus one. If this is a newly # created blob, its version starts with zero. output_versions = dict() for s in op.output: if s in current_versions: current_versions[s] += 1 output_versions[s] = current_versions[s] fwd_metadata.append([op, input_versions, output_versions]) # (2) Now, after having the virtual play above, we now play the operators # backwards, creating the gradients along the path. Note that although we # are playing it backwards, any value being overwritten can not be # recovered, and any reference to a blob already being overwritten would # trigger an error. all_gradient_ops = [] current_gradient_versions = dict( (s, current_versions[GetOriginalName(s)]) for s in external_gradients) versioned_gradient_count = defaultdict(lambda: defaultdict(int)) for forward_op, current_fwd_metadata in zip(operators[::-1], fwd_metadata[::-1]): gradient_ops = cls.GetGradientDefs(forward_op) # Now, the constraints for the inputs of the gradient operators are: # # (1) for inputs: # (1a) If it is a gradient name, it should match the version of the # corresponding output. # (1b) If it is an output name, the current version should match the # version when the operator was run. # (1c) If it is an input name, the current version should match the # version when the operator was run. # (1d) If it is none of the above, it should be a blob that is generated # locally by one of the previous gradient operators. # # (2) for outputs: # (2a) If it is a gradient name, it must be the gradient name of an input # blob, and we will mark the gradient as being corresponding to the # version of the input. # (2b) If it is anything else it is OK - we will simply "play" the op to # update the current versions of blobs. locally_generated_blobs = [] multiuse_input_ready_to_sum = [] for grad_op in gradient_ops: for s in grad_op.input: if IsGradientName(s): if s not in current_gradient_versions: raise RuntimeError( 'Input gradient name "%s" is referred to but ' 'is never generated.' % s) # This is a gradient name. We will need to check if this gradient is # produced already, and if this is the gradient we want. original_name = GetOriginalName(s) if original_name not in current_fwd_metadata[2]: raise RuntimeError( 'Input gradient name "%s" is not the gradient ' 'of any of the op\'s output.' % s) if (current_fwd_metadata[2][original_name] != current_gradient_versions[s]): raise RuntimeError( 'Gradient name "%s" is expected to correspond to ' 'version %d of "%s", but currently we have version %d.' % (s, current_fwd_metadata[2][s], original_name, current_gradient_versions[s])) elif s in current_fwd_metadata[2]: if (current_versions[s] != current_fwd_metadata[2][s]): raise RuntimeError( 'Gradient operator needs output "%s" at version ' '%d, but currently we have version %d.' % (s, current_fwd_metadata[2][s], current_versions[s])) elif s in current_fwd_metadata[1]: if (current_versions[s] != current_fwd_metadata[1][s]): raise RuntimeError( 'Gradient operator needs input "%s" at version ' '%d, but currently we have version %d.' % (s, current_fwd_metadata[1][s], current_versions[s])) else: if s not in locally_generated_blobs: if s not in locally_generated_blobs: raise RuntimeError( 'Blob name "%s" not in the scope of operator: %s\nand is ' 'not generated by any of the local gradient operators.' % (s, str(current_fwd_metadata[0]))) for idx, s in enumerate(grad_op.output): if IsGradientName(s): original_name = GetOriginalName(s) if original_name not in current_fwd_metadata[1]: raise RuntimeError( 'Output gradient name "%s" is not the ' 'gradient of any of the op\'s input name.' % s) # Set the current gradient version. version = current_fwd_metadata[1][original_name] current_gradient_versions[s] = version # Now we should also check if the gradient we product is a multi-use # input, in which case we will automatically add split nodes. # TODO: Instead of adding split nodes, we can also choose to # sequentially compute and accumulate gradients. Maybe implement # that in the future. if versioned_input_count[original_name][version] > 1: grad_op.output[idx] = '_%s_autosplit_%d' % ( s, versioned_gradient_count[s][version]) versioned_gradient_count[s][version] += 1 assert (versioned_gradient_count[s][version] <= versioned_input_count[original_name][version]) if (versioned_gradient_count[s][version] == versioned_input_count[original_name][version]): # We have calculated all the autosplit gradients. Will need to # add a sum after this gradient computation. multiuse_input_ready_to_sum.append( (s, versioned_gradient_count[s][version], grad_op)) else: locally_generated_blobs.append(s) # If some of the multi use inputs are ready to be summed, we will do so. for s, count, source_op in multiuse_input_ready_to_sum: additional_sum_op = CreateOperator('Sum')( ['_%s_autosplit_%d' % (s, i) for i in range(count)], [s]) if source_op.HasField('device_option'): additional_sum_op.device_option.CopyFrom(source_op.device_option) gradient_ops.append(additional_sum_op) # Now, for bookkeeping purposes, we will need to "play" the gradient # operators. The reason is that the gradient operators may (although in # most cases they shouldn't) change some of the existing blobs, in which # case this explicit bookkeeping is going to catch them. for op in gradient_ops: for s in op.output: if s in current_versions: current_versions[s] += 1 output_versions[s] = current_versions[s] all_gradient_ops += gradient_ops # After we have done computation for each op, we now have the gradient # operators ready. return all_gradient_ops class Net(object): operator_registry_ = {} def __init__(self, name): if type(name) is caffe2_pb2.NetDef: # We rae initializing a network by a NetDef. In this case, we will # initialize our network with the given netdef. self._net = caffe2_pb2.NetDef() self._net.CopyFrom(name) # Set the next name index properly. existing_names = set( sum([list(op.input) for op in self._net.op], []) + sum([list(op.output) for op in self._net.op], [])) prefix_len = len(self._net.name + '_blob_') autogen_indices = [int(name[prefix_len:]) for name in existing_names if name.startswith(self._net.name + '_blob_')] if len(autogen_indices): self._next_name_index = max(autogen_indices) + 1 else: self._next_name_index = 0 else: self._net = caffe2_pb2.NetDef() self._net.name = name self._next_name_index = 0 def __str__(self): return self._net.name def Proto(self): return self._net def NextName(self): """Returns the next name to be used, if you do not want to explicitly name your blob.""" output_name = self._net.name + '_blob_' + str(self._next_name_index) self._next_name_index += 1 return str(output_name) def AddGradientOperators(self, skip=0, external_gradients=[]): """Add the gradient for operators in the net. Inputs: skip: skips the first n operators. This is provided mainly because a lot of nets may use the first few operators for data generation like stuff which really do not need to have gradients. Currently, this is hard-coded for float operators if there are branches (i.e. a blob is used as input to multiple operators). This is because the inserted SplitOp is hard-coded for float (its gradient, SumOp, is float only). Supporting other formats is a todo item. """ grad_ops = GradientRegistry.GetBackwardPass(self._net.op[skip:]) self._net.op.extend(grad_ops) return def RunAllOnGPU(self, gpu_id=0, use_cudnn=False): """A convenient function to run everything on the GPU.""" device_option = caffe2_pb2.DeviceOption() device_option.device_type = caffe2_pb2.CUDA device_option.cuda_gpu_id = gpu_id self._net.device_option.CopyFrom(device_option) if use_cudnn: for op in self._net.op: op.engine = "CUDNN" def __getattr__(self, op_type): if not IsOperator(op_type): raise RuntimeError( 'Method ' + op_type + ' is not a registered operator.') def _CreateAndAddToSelf(inputs, outputs=None, **kwargs): if outputs is None: # If we do not specify an output, we will assume that this operator # produces one output in this case. outputs = self.NextName() elif type(outputs) is int: # In this case, we will auto-fill the given number of outputs with # auto-generated names. outputs = [self.NextName() for i in range(outputs)] op = CreateOperator(op_type)(inputs, outputs, **kwargs) self._net.op.extend([op]) if len(op.output) == 0: return elif len(op.output) == 1: return BlobReference(str(op.output[0]), self) else: return tuple(BlobReference(str(o), self) for o in op.output) return _CreateAndAddToSelf class ExecutionStep(object): def __init__(self, name): self._step = caffe2_pb2.ExecutionStep() self._step.name = name def __init__(self, name, nets, num_iter=None): self._step = caffe2_pb2.ExecutionStep() self._step.name = name if type(nets) is Net: nets = [nets] self._step.network.extend([str(n) for n in nets]) if num_iter is not None: self._step.num_iter = num_iter def __str__(self): return self._step.name def Proto(self): return self._step def SetIter(self, num_iter): self._step.num_iter = num_iter def AddSubstep(self, substep): self._step.substep.add().CopyFrom(substep) def AddNet(self, net): self._step.network.add(str(net)) class Plan(object): def __init__(self, name): self._plan = caffe2_pb2.PlanDef() self._plan.name = name def __str__(self): return self._plan.name def Proto(self): return self._plan def AddNets(self, nets): for net in nets: self._plan.network.add().CopyFrom(net.Proto()) def AddStep(self, step): self._plan.execution_step.add().CopyFrom(step.Proto())