pytorch/caffe2/operators/operator_fallback_gpu.h

#ifndef CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_
#define CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_

#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"

namespace caffe2 {

/**
 * @brief A templated class that allows one to wrap a CPU operator as a CUDA
 * operator.
 *
 * This class can be used when one does not yet have a CUDA implementation
 * for an operator. Essentially, this op automatically handles the data
 * copies between host and device for you. This incurs a good deal of
 * overhead and is not optimal, so use this operator mostly for quick
 * prototyping purposes.
 *
 * All inputs and outputs of the original operator should be TensorCPU.
 *
 * Example usage: if you have a CPU-based class MyMagicOp and you use the
 * registration code
 *   REGISTER_CPU_OPERATOR(MyMagic, MyMagicOp);
 * to register the CPU side, you can create its corresponding GPU operator
 * (with performance hits, of course) via
 *   REGISTER_CUDA_OPERATOR(MyMagic,
 *                          GPUFallbackOp<MyMagicOp>);
 */
template <class CPUOp>
class GPUFallbackOp final : public Operator<CUDAContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CUDAContext);
  GPUFallbackOp(const OperatorDef& def, Workspace* ws)
      : Operator<CUDAContext>(def, ws) {
    CHECK_EQ(def.device_option().device_type(), CUDA);
    OperatorDef base_def_(def);
    // base_def_ runs on CPU, so we will set its device option to CPU.
    base_def_.clear_device_option();
    base_def_.mutable_device_option()->set_device_type(CPU);
    // Set up the symbols for the local workspace.
    for (const string& name : def.input()) {
      local_input_blobs_.push_back(local_ws_.CreateBlob(name));
      CHECK_NOTNULL(local_input_blobs_.back());
    }
    base_op_.reset(new CPUOp(base_def_, &local_ws_));
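    // The base CPU op's constructor is expected to have created its output
    // blobs in local_ws_, so we only look them up here.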
    for (const string& name : def.output()) {
      local_output_blobs_.push_back(local_ws_.GetBlob(name));
      CHECK_NOTNULL(local_output_blobs_.back());
    }
  }

  bool RunOnDevice() override {
    for (int i = 0; i < InputSize(); ++i) {
      if (OperatorBase::InputIsType<TensorCUDA>(i)) {
        local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
            Input(i), &context_);
      } else {
        VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
        // Note(jiayq): This removes a const but conceptually
        // local_input_blobs will only be used as const blob input for the
        // base op so we are still fine.
        local_input_blobs_[i]->ShareExternal(
            const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
            OperatorBase::Inputs()[i]->meta());
      }
    }
    // Sync to make sure copies are done.
    context_.FinishDeviceComputation();
    if (!base_op_->Run()) {
      LOG(ERROR) << "Base op run failed in GPUFallbackOp. Def: "
                 << ProtoDebugString(def());
      return false;
    }
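    // Copy the CPU outputs produced by the base op back into this op's GPU
    // output tensors.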
    for (int i = 0; i < OutputSize(); ++i) {
      CAFFE_ENFORCE(local_output_blobs_[i]->IsType<TensorCPU>(),
                    "GPU fallback op currently does not support non-TensorCPU "
                    "output type.");
      Output(i)->CopyFrom(
          local_output_blobs_[i]->template Get<TensorCPU>(), &context_);
    }
    return true;
  }

 protected:
  Workspace local_ws_;
  vector<Blob*> local_input_blobs_;
  vector<Blob*> local_output_blobs_;
  std::unique_ptr<CPUOp> base_op_;
};

}  // namespace caffe2

#endif  // CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_
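
A minimal usage sketch, assuming a hypothetical CPU-only operator MyMagicOp
that is already registered via REGISTER_CPU_OPERATOR(MyMagic, MyMagicOp); a
CUDA registration file could expose it through the fallback wrapper roughly
as follows (the header name my_magic_op.h is an assumption for illustration):

#include "caffe2/operators/operator_fallback_gpu.h"
#include "my_magic_op.h"  // hypothetical header declaring MyMagicOp

namespace caffe2 {

// GPUFallbackOp copies CUDA inputs to CPU, runs MyMagicOp in a local
// workspace, and copies the TensorCPU outputs back into TensorCUDA outputs.
REGISTER_CUDA_OPERATOR(MyMagic, GPUFallbackOp<MyMagicOp>);

}  // namespace caffe2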