mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/49980 From ``` ./python/libcst/libcst codemod remove_unused_imports.RemoveUnusedImportsWithGlean --no-format caffe2/ ``` Test Plan: Standard sandcastle tests Reviewed By: xush6528 Differential Revision: D25727359 fbshipit-source-id: c4f60005b10546423dc093d31d46deb418352286
216 lines
8.2 KiB
Python
216 lines
8.2 KiB
Python
|
|
|
|
|
|
|
|
|
|
import copy
|
|
from caffe2.proto import caffe2_pb2
|
|
from caffe2.python import core
|
|
|
|
|
|
def rewrite_init_net_simple(net):
    """Pin every op of a param-init net to the IDEEP device.

    Mutates ``net`` in place by overwriting each op's
    ``device_option.device_type`` with ``caffe2_pb2.IDEEP``.
    """
    ideep_device = caffe2_pb2.IDEEP
    for operator in net.op:
        operator.device_option.device_type = ideep_device
|
|
|
|
def last_producer(ops, blob):
    """Return the index of the last op in ``ops`` that outputs ``blob``.

    Args:
        ops: sequence of operator protos (anything with an ``output`` list).
        blob: blob name to look for.

    Returns:
        Index (int) of the last op whose ``output`` contains ``blob``.

    Raises:
        ValueError: if no op in ``ops`` produces ``blob``.
    """
    # Scan from the end so the first hit is the last producer.
    for (i, op) in reversed(list(enumerate(ops))):
        if blob in op.output:
            return i
    # Original code passed (msg, blob) as two exception args, so the blob was
    # never interpolated into the message; format it explicitly instead.
    raise ValueError("Failed to find last producer of blob, %s" % blob)
|
|
|
|
|
|
def fix_BoxWithNMSLimit(net):
    """Downgrade IDEEP->CPU copies of BoxWithNMSLimit outputs to plain Copy.

    Collects the first three outputs of every ``BoxWithNMSLimit`` op, then
    rewrites any ``CopyIDEEPToCPU`` op consuming one of them into a plain
    ``Copy`` op pinned to the CPU device. Mutates ``net`` in place.
    """
    outputs = set()
    for op in net.op:
        if op.type == 'BoxWithNMSLimit':
            # Only the first three outputs are tracked; presumably these are
            # the score/box/class blobs -- TODO confirm against the op schema.
            outputs.add(op.output[0])
            outputs.add(op.output[1])
            outputs.add(op.output[2])
    for op in net.op:
        if op.type == 'CopyIDEEPToCPU':
            if op.input[0] in outputs:
                # Message typo fixed: "Chaning" -> "Changing".
                print("Changing CopyIDEEPToCPU to Copy for {}".format(op.input[0]))
                op.type = 'Copy'
                op.device_option.device_type = caffe2_pb2.CPU
|
|
|
|
|
|
def rewrite_run_net_simple(net):
    """Rewrite a predict net to run entirely on IDEEP/MKL.

    Inserts a CPU->IDEEP copy for the first external input and IDEEP->CPU
    copies for every external output, renames intermediate blobs so the MKL
    copies are the ones consumed, and stamps every op with the IDEEP device
    option. Mutates ``net`` in place.
    """
    # Simple rewrite for now - assume entire graph can be executed
    # with MKL, so just insert copy ops for external_input[0] and
    # external_output[0]
    def mkl_tmp(name):
        # Name of the IDEEP-side twin of a CPU blob.
        return "{}__MKL__".format(name)

    input_blob = net.external_input[0]
    # The rewrite only handles the simple case where the first op consumes
    # the first external input directly.
    if input_blob != net.op[0].input[0]:
        raise Exception(
            "Input blob: {} is not consumed by first op: {}".format(
                input_blob, net.op[0]))
    # Modify input/outputs to point to copied MKL blobs.
    from_cpu = "CopyCPUToIDEEP"
    to_cpu = "CopyIDEEPToCPU"
    copy_input_op = core.CreateOperator(
        from_cpu, input_blob, mkl_tmp(input_blob))
    net.op[0].input[0] = mkl_tmp(input_blob)

    # One IDEEP->CPU copy per external output, reading the MKL twin and
    # writing the original external-output name.
    copy_output_ops = [
        core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob)
        for output_blob in net.external_output]

    for output_blob in net.external_output:
        # Redirect the op that last writes this external output (and any
        # later readers of it) to the MKL twin; the copy op above restores
        # the original name on CPU.
        last_producer_idx = last_producer(net.op, output_blob)
        renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob)
                           for blob in net.op[last_producer_idx].output]
        net.op[last_producer_idx].output[:] = renamed_outputs
        # Rename any subsequent consumers of an output blob.
        for op in net.op[last_producer_idx + 1:]:
            renamed_input = [blob if blob != output_blob else mkl_tmp(blob)
                             for blob in op.input]
            op.input[:] = renamed_input

    # Rebuild the op list: input copy first, then the (rewritten) original
    # ops, then the output copies. Repeated proto fields cannot be assigned
    # directly, hence the del/extend dance.
    ops = [copy_input_op] + net.op[:] + copy_output_ops
    del net.op[:]
    net.op.extend(ops)
    device = caffe2_pb2.IDEEP
    for op in net.op:
        # MergeFrom keeps any other device_option fields already set on the op.
        op.device_option.MergeFrom(
            core.DeviceOption(device_type=device))
        op.engine = ""

    # Temporarily disable conv+relu fusion until we verify further
    # net.ParseFromString(
    #     C.transform_optimizeForMKLDNN(net.SerializeToString()))
    fix_BoxWithNMSLimit(net)
|
|
|
|
|
|
def rewrite_run_net_simple_xrayocr_lstm(net):
    """Rewrite the xrayocr LSTM predict net: MKL prefix, CPU LSTM suffix.

    Splits ``net`` at its first 'Shape' op; everything before runs on IDEEP
    (with CPU->IDEEP/IDEEP->CPU copy ops inserted), everything from the
    Shape op onward (the LSTM part) stays on CPU. Mutates ``net`` in place.
    """
    # For xrayocr model with lstm, only rewrite the non-lstm part of the net to
    # enable mkl, then copy the temporary output blob at the break point
    # and all external inputs for lstm part to cpu, and execuate rest of the net
    # (two lstm) on cpu
    # This only works for the xrayocr lstm model which uses the first 'Shape' op
    # to decide the break point, and after two lstm it's external_output
    # directly so there's no need to copy back to ideep/mkl

    def mkl_tmp(name):
        # Name of the IDEEP-side twin of a CPU blob.
        return "{}__MKL__".format(name)

    def cpu_tmp(name):
        # Name of the CPU-side twin of an IDEEP blob.
        return "{}__CPU__".format(name)

    input_blob = net.external_input[0]
    # Only the simple case where the first op consumes the first external
    # input directly is supported.
    if input_blob != net.op[0].input[0]:
        raise Exception(
            "Input blob: {} is not consumed by first op: {}".format(
                input_blob, net.op[0]))
    # Modify input/outputs to point to copied MKL blobs.
    from_cpu = "CopyCPUToIDEEP"
    to_cpu = "CopyIDEEPToCPU"
    copy_input_op = core.CreateOperator(
        from_cpu, input_blob, mkl_tmp(input_blob))
    net.op[0].input[0] = mkl_tmp(input_blob)

    # the net may contain some external_inputs falsely added during ONNX->Caffe2
    # This should be taken care of in early steps during pytorch_to_caffe2,
    # but if not it can cause issue in follow up steps, so check here to confirm
    for input_blob in net.external_input:
        for op in net.op:
            # look for if the external_input blob is output of any op in the net
            assert input_blob not in op.output

    external_output = None
    external_inputs_to_cpu = set()
    find_first_shape_op = False
    cpu_op_start_idx = -1
    for op_idx, op in enumerate(net.op):
        # the first Shape op mark the starting point of LSTM chunk of the net
        if not find_first_shape_op:
            if op.type == 'Shape':
                # The Shape op's inputs become the "outputs" of the MKL
                # prefix; they get IDEEP->CPU copies below.
                external_output = op.input
                find_first_shape_op = True
                cpu_op_start_idx = op_idx
        else:
            # any external input in the LSTM part need to be copied to CPU
            for in_blob in op.input:
                if in_blob in net.external_input:
                    external_inputs_to_cpu.add(in_blob)

    # make sure we found the expected break point of the net
    assert external_output is not None

    # create op to copy external input blobs used in LSTM part from IDEEP to CPU
    copy_extra_input_ops = []
    for in_blob in external_inputs_to_cpu:
        copy_extra_input_ops.append(core.CreateOperator(to_cpu, in_blob,
                                                        cpu_tmp(in_blob)))
        # rename input blobs in LSTM part to use the CPU copy
        for op in net.op[cpu_op_start_idx:]:
            renamed_input = [blob if blob != in_blob else cpu_tmp(in_blob)
                             for blob in op.input]
            op.input[:] = renamed_input

    # IDEEP->CPU copies for the break-point blobs feeding the LSTM part.
    copy_output_ops = [
        core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob)
        for output_blob in external_output]

    for output_blob in external_output:
        # Redirect the last producer of each break-point blob to its MKL
        # twin; the copy op above restores the original name on CPU.
        last_producer_idx = last_producer(net.op, output_blob)
        renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob)
                           for blob in net.op[last_producer_idx].output]
        net.op[last_producer_idx].output[:] = renamed_outputs

    # rearrange all ops in correct order
    ops = [copy_input_op] + net.op[:cpu_op_start_idx] \
        + copy_output_ops + copy_extra_input_ops + net.op[cpu_op_start_idx:]

    # Repeated proto fields cannot be assigned directly; clear then extend.
    del net.op[:]
    net.op.extend(ops)

    device = caffe2_pb2.IDEEP
    for op in net.op:
        # the first Shape op mark the starting point of LSTM chunk of the net
        if op.type == 'Shape':
            # all LSTM ops should run on CPU
            device = caffe2_pb2.CPU
        op.device_option.MergeFrom(
            core.DeviceOption(device_type=device))
        op.engine = ""

        # RecurrentNetwork has a nested step_net that needs special treatment
        if op.type == 'RecurrentNetwork':
            for arg in op.arg:
                if arg.name == 'step_net':
                    for nested_op in arg.n.op:
                        # set device to CPU
                        nested_op.device_option.MergeFrom(
                            core.DeviceOption(device_type=device))
                        nested_op.engine = ""

                        # rename inputs in op of nested net
                        renamed_input = []
                        for blob in nested_op.input:
                            renamed_input.append(blob
                                if blob not in external_inputs_to_cpu
                                else cpu_tmp(blob))
                        nested_op.input[:] = renamed_input

                    # rename external inputs of nested net
                    new_external_input = []
                    for blob in arg.n.external_input:
                        new_external_input.append(blob
                            if blob not in external_inputs_to_cpu
                            else cpu_tmp(blob))
                    arg.n.external_input[:] = new_external_input

    # Temporarily disable conv+relu fusion until we verify further
    # net.ParseFromString(
    #     C.transform_optimizeForMKLDNN(net.SerializeToString()))
    fix_BoxWithNMSLimit(net)
|
|
|
|
|
|
def rewrite_model_helper_simple(model):
    """Return a deep copy of ``model`` rewritten to run on IDEEP/MKL.

    The original model is left untouched; the copy has its param-init net
    pinned to IDEEP and its run net rewritten via rewrite_run_net_simple.
    """
    rewritten = copy.deepcopy(model)
    # All parameter initialization should run on MKL
    rewrite_init_net_simple(rewritten.param_init_net.Proto())
    rewrite_run_net_simple(rewritten.net.Proto())
    return rewritten
|