mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/74169 Alias DB was being way too conservative about the semantics of exported Caffe2 ops - it thought some pure functions were writing to their inputs, which caused `ReplaceWithMaybeCopy` to fail. This in turn led to a huge decrease in out variant coverage and regressions in many models. I've extended the export macro to let the user specify an `AliasAnalysisKind` and marked all of the quantization compression ops as pure functions. ghstack-source-id: 151394133 Reviewed By: hlu1 Differential Revision: D34733630 fbshipit-source-id: e968812e052f14261c10f9a280abe1d910de1f2f (cherry picked from commit 5e9de49b98caff57be13e8bd101144ae2475b6b5)
210 lines
6.6 KiB
C++
210 lines
6.6 KiB
C++
#include "caffe2/operators/copy_op.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
// CPU registrations for the copy operators. Each one is instantiated with
// CPUContext for all three template arguments.
// CopyFromCPUInput: from CPU, copy the input to whatever the current context
// is.
|
|
REGISTER_CPU_OPERATOR(
|
|
CopyFromCPUInput,
|
|
CopyOp<CPUContext, CPUContext, CPUContext>);
|
|
REGISTER_CPU_OPERATOR(
|
|
CopyOnDeviceLike,
|
|
CopyOnDeviceLikeOp<CPUContext, CPUContext, CPUContext>);
|
|
REGISTER_CPU_OPERATOR(Copy, CopyOp<CPUContext, CPUContext, CPUContext>);
|
|
|
|
// Schema for the Copy operator: exactly one input and one output, with the
// output declared identical in type and shape to the input. Inputs are allowed
// to cross devices, and the ONNX schema is inherited from "Identity". The
// raw-string doc below is user-facing text and includes a runnable example.
OPERATOR_SCHEMA(Copy)
|
|
.NumInputs(1)
|
|
.NumOutputs(1)
|
|
.IdenticalTypeAndShape()
|
|
.InputsCanCrossDevices()
|
|
.InheritOnnxSchema("Identity")
|
|
.SetDoc(R"DOC(
|
|
Copy input tensor into output, potentially across devices.
|
|
|
|
Github Links:
|
|
|
|
- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/copy_op.cc
|
|
- https://github.com/caffe2/caffe2/blob/master/caffe2/operators/copy_op.h
|
|
|
|
|
|
<details>
|
|
|
|
<summary> <b>Example</b> </summary>
|
|
|
|
**Code**
|
|
|
|
```
|
|
|
|
workspace.ResetWorkspace()
|
|
|
|
op = core.CreateOperator(
|
|
"Copy",
|
|
["input"],
|
|
["output"]
|
|
)
|
|
|
|
workspace.FeedBlob("input", np.random.rand(3,3))
|
|
print("input:", workspace.FetchBlob("input"))
|
|
workspace.RunOperatorOnce(op)
|
|
print("output:", workspace.FetchBlob("output"))
|
|
|
|
```
|
|
|
|
**Result**
|
|
|
|
```
|
|
|
|
input:
|
|
[[0.16826761 0.68168217 0.55196001]
|
|
[0.19735483 0.34837823 0.69015595]
|
|
[0.09448514 0.57390828 0.37097193]]
|
|
output:
|
|
[[0.16826761 0.68168217 0.55196001]
|
|
[0.19735483 0.34837823 0.69015595]
|
|
[0.09448514 0.57390828 0.37097193]]
|
|
|
|
```
|
|
|
|
</details>
|
|
|
|
)DOC")
|
|
.Input(0, "input", "(*Tensor*): input tensor to copy")
|
|
.Output(0, "output", "(*Tensor*): copy of input tensor");
|
|
|
|
// Schema for CopyGPUToCPU: one input, one output, output declared identical in
// type and shape to the input, inputs allowed to cross devices.
// NOTE(review): the doc text reads "Copy tensor for GPU to CPU context";
// "from GPU to CPU" is probably what was meant - confirm before changing the
// user-facing string.
OPERATOR_SCHEMA(CopyGPUToCPU)
|
|
.NumInputs(1)
|
|
.NumOutputs(1)
|
|
.IdenticalTypeAndShape()
|
|
.InputsCanCrossDevices()
|
|
// Device inference: all inputs are assigned the operator's own device option
// (which must be set), all outputs a default-constructed DeviceOption, which
// this code treats as the CPU option.
.DeviceInferenceFunction([](const OperatorDef& def) {
|
|
// Fails if the operator definition carries no device option at all.
CAFFE_ENFORCE(
|
|
def.has_device_option(),
|
|
"CopyGPUToCPU op should have cuda device option.");
|
|
auto& cuda_option = def.device_option();
|
|
auto cpu_option = DeviceOption();
|
|
// One entry per declared input / output of the operator definition.
vector<DeviceOption> in_dev(def.input_size(), cuda_option);
|
|
vector<DeviceOption> out_dev(def.output_size(), cpu_option);
|
|
return std::make_pair(in_dev, out_dev);
|
|
})
|
|
.SetDoc(R"DOC(
|
|
Copy tensor for GPU to CPU context. Must be run under GPU device option.
|
|
)DOC")
|
|
.Input(0, "input", "The input tensor.")
|
|
.Output(0, "output", "Tensor that will contain a copy of the input.");
|
|
|
|
// Schema for CopyCPUToGPU: one input, one output, output declared identical in
// type and shape to the input, inputs allowed to cross devices.
// NOTE(review): the doc text reads "Copy tensor for CPU to GPU context";
// "from CPU to GPU" is probably what was meant - confirm before changing the
// user-facing string.
OPERATOR_SCHEMA(CopyCPUToGPU)
|
|
.NumInputs(1)
|
|
.NumOutputs(1)
|
|
.IdenticalTypeAndShape()
|
|
.InputsCanCrossDevices()
|
|
// Device inference: all inputs are assigned a default-constructed
// DeviceOption (treated here as the CPU option), all outputs the operator's
// own device option, which must be set.
.DeviceInferenceFunction([](const OperatorDef& def) {
|
|
// Fails if the operator definition carries no device option at all.
CAFFE_ENFORCE(
|
|
def.has_device_option(),
|
|
"CopyCPUToGPU op should have cuda device option.");
|
|
auto& cuda_option = def.device_option();
|
|
auto cpu_option = DeviceOption();
|
|
// One entry per declared input / output of the operator definition.
vector<DeviceOption> in_dev(def.input_size(), cpu_option);
|
|
vector<DeviceOption> out_dev(def.output_size(), cuda_option);
|
|
return std::make_pair(in_dev, out_dev);
|
|
})
|
|
.SetDoc(R"DOC(
|
|
Copy tensor for CPU to GPU context. Must be run under GPU device option.
|
|
)DOC")
|
|
.Input(0, "input", "The input tensor.")
|
|
.Output(0, "output", "Tensor that will contain a copy of the input.");
|
|
|
|
// Schema for CopyFromCPUInput: one input, one output, output declared identical
// in type and shape to the input, inputs allowed to cross devices.
// NOTE(review): the doc text says "This may involves cross-device MemCpy";
// "may involve" is the grammatical form - confirm before changing the
// user-facing string.
OPERATOR_SCHEMA(CopyFromCPUInput)
|
|
.NumInputs(1)
|
|
.NumOutputs(1)
|
|
.IdenticalTypeAndShape()
|
|
.InputsCanCrossDevices()
|
|
// Device inference: inputs always get a default-constructed DeviceOption
// (treated here as the CPU option); outputs get the operator's device option
// when one is set and a default-constructed DeviceOption otherwise. Unlike
// the GPU<->CPU schemas, a missing device option is not an error here.
.DeviceInferenceFunction([](const OperatorDef& def) {
|
|
auto op_device =
|
|
def.has_device_option() ? def.device_option() : DeviceOption();
|
|
auto cpu_option = DeviceOption();
|
|
// One entry per declared input / output of the operator definition.
vector<DeviceOption> in_dev(def.input_size(), cpu_option);
|
|
vector<DeviceOption> out_dev(def.output_size(), op_device);
|
|
return std::make_pair(in_dev, out_dev);
|
|
})
|
|
.SetDoc(R"DOC(
|
|
Take a CPU input tensor and copy it to an output in the current
|
|
Context (GPU or CPU). This may involves cross-device MemCpy.
|
|
)DOC")
|
|
.Input(0, "input", "The input CPU tensor.")
|
|
.Output(0, "output", "either a TensorCUDA or a TensorCPU");
|
|
|
|
// Schema for CopyOnDeviceLike: two inputs (the tensor to copy and a "dst"
// tensor that identifies the device the copy is performed on) and one output.
// Unlike the other copy schemas in this file, it declares no type/shape or
// device inference.
OPERATOR_SCHEMA(CopyOnDeviceLike)
|
|
.NumInputs(2)
|
|
.NumOutputs(1)
|
|
.SetDoc("Copy input tensor into output to the specific device.")
|
|
.Input(0, "input", "The input tensor.")
|
|
.Input(1, "dst", "Tensor, on which device the copy will be performed.")
|
|
.Output(0, "output", "Tensor that will contain a copy of the input.");
|
|
|
|
struct GetCopyGradient : public GradientMakerBase {
|
|
using GradientMakerBase::GradientMakerBase;
|
|
vector<OperatorDef> GetGradientDefs() override {
|
|
return SingleGradientDef(
|
|
"CopyOnDeviceLike",
|
|
"",
|
|
vector<string>{GO(0), I(0)},
|
|
vector<string>{GI(0)});
|
|
}
|
|
};
|
|
REGISTER_GRADIENT(Copy, GetCopyGradient);
|
|
|
|
struct GetGPUToCPUGradient : public GradientMakerBase {
|
|
using GradientMakerBase::GradientMakerBase;
|
|
vector<OperatorDef> GetGradientDefs() override {
|
|
if (g_output_[0].IsDense()) {
|
|
return SingleGradientDef(
|
|
"CopyCPUToGPU", "", vector<string>{GO(0)}, vector<string>{GI(0)});
|
|
} else {
|
|
return vector<OperatorDef>{CreateOperatorDef(
|
|
"CopyCPUToGPU",
|
|
"",
|
|
std::vector<string>{GO_I(0)},
|
|
std::vector<string>{GI_I(0)}),
|
|
CreateOperatorDef(
|
|
"CopyCPUToGPU",
|
|
"",
|
|
std::vector<string>{GO_V(0)},
|
|
std::vector<string>{GI_V(0)})};
|
|
}
|
|
}
|
|
};
|
|
REGISTER_GRADIENT(CopyGPUToCPU, GetGPUToCPUGradient);
|
|
|
|
struct GetCPUToGPUGradient : public GradientMakerBase {
|
|
using GradientMakerBase::GradientMakerBase;
|
|
vector<OperatorDef> GetGradientDefs() override {
|
|
if (g_output_[0].IsDense()) {
|
|
return SingleGradientDef(
|
|
"CopyGPUToCPU", "", vector<string>{GO(0)}, vector<string>{GI(0)});
|
|
} else {
|
|
return vector<OperatorDef>{CreateOperatorDef(
|
|
"CopyGPUToCPU",
|
|
"",
|
|
std::vector<string>{GO_I(0)},
|
|
std::vector<string>{GI_I(0)}),
|
|
CreateOperatorDef(
|
|
"CopyGPUToCPU",
|
|
"",
|
|
std::vector<string>{GO_V(0)},
|
|
std::vector<string>{GI_V(0)})};
|
|
}
|
|
}
|
|
};
|
|
REGISTER_GRADIENT(CopyCPUToGPU, GetCPUToGPUGradient);
|
|
|
|
} // namespace caffe2
|
|
|
|
// Export the two cross-device copy operators to c10 under the "_caffe2::"
// namespace, each taking a single Tensor and returning a Tensor. As the macro
// name indicates, only the schema is exported from this file. No alias
// analysis kind is supplied (c10::nullopt), so whatever default the macro
// applies is used - NOTE(review): confirm that default is appropriate for
// these copy ops.
C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(
|
|
CopyGPUToCPU,
|
|
"_caffe2::CopyGPUToCPU(Tensor input) -> Tensor",
|
|
/*optional_alias_analysis_kind=*/c10::nullopt);
|
|
|
|
C10_EXPORT_CAFFE2_OP_TO_C10_SCHEMA_ONLY(
|
|
CopyCPUToGPU,
|
|
"_caffe2::CopyCPUToGPU(Tensor input) -> Tensor",
|
|
/*optional_alias_analysis_kind=*/c10::nullopt);
|