#ifndef CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_
#define CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_

#include "caffe2/core/common.h"
#include "caffe2/core/context.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "caffe2/proto/caffe2.pb.h"

namespace caffe2 {

template <int... values>
class SkipIndices {
 private:
  // Base case: an empty value pack never contains i.
  template <int... V>
  static inline bool ContainsInternal(const int i) {
    return false;
  }
  // Recursive case: match against First, then recurse on Rest.
  template <int First, int... Rest>
  static inline bool ContainsInternal(const int i) {
    return (i == First) || ContainsInternal<Rest...>(i);
  }

 public:
  static inline bool Contains(const int i) {
    return ContainsInternal<values...>(i);
  }
};
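
// Usage sketch (illustrative, hypothetical values): SkipIndices<0, 2>
// answers Contains(0) == true, Contains(2) == true, and Contains(1) == false;
// the empty pack SkipIndices<> answers Contains(i) == false for every i.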

/**
 * @brief A templated class to allow one to wrap a CPU operator as a CUDA
 * operator.
 *
 * This class can be used when one does not have the CUDA implementation
 * ready yet for an operator. Essentially, what this op does is automatically
 * handle the data copies for you. Plausibly, this incurs a lot of overhead
 * and is not optimal, so you should use this operator mostly for quick
 * prototyping purposes.
 *
 * All the inputs and outputs of the original operator should be TensorCPU.
 *
 * Example usage: if you have a class MyMagicOp that is CPU based, and you
 * use the registration code
 *   REGISTER_CPU_OPERATOR(MyMagic, MyMagicOp);
 * to register the CPU side, you can create its corresponding GPU operator
 * (with performance hits of course) via
 *   REGISTER_CUDA_OPERATOR(MyMagic,
 *                          GPUFallbackOp<MyMagicOp>);
 *
 * Advanced usage: if you want some specific outputs to never be copied, you
 * can use the SkipOutputCopy template argument to do that. For example, if
 * MyMagic produces two outputs and the first output is always going to live
 * on the CPU, you can do
 *   REGISTER_CUDA_OPERATOR(MyMagic,
 *                          GPUFallbackOp<MyMagicOp, SkipIndices<0>>);
 */
template <class CPUOp, typename SkipOutputCopy = SkipIndices<>>
class GPUFallbackOp final : public Operator<CUDAContext> {
 public:
  USE_OPERATOR_FUNCTIONS(CUDAContext);
  GPUFallbackOp(const OperatorDef& def, Workspace* ws)
      : Operator<CUDAContext>(def, ws) {
    CHECK_EQ(def.device_option().device_type(), CUDA);
    OperatorDef base_def_(def);
    // base_def_ runs on CPU, so we will set its device option to CPU.
    base_def_.clear_device_option();
    base_def_.mutable_device_option()->set_device_type(CPU);
    // Set up the symbols for the local workspace.
    for (const string& name : def.input()) {
      local_input_blobs_.push_back(local_ws_.CreateBlob(name));
      CHECK_NOTNULL(local_input_blobs_.back());
    }
    // Constructing the base op creates its output blobs in local_ws_, so the
    // GetBlob calls below succeed.
    base_op_.reset(new CPUOp(base_def_, &local_ws_));
    for (const string& name : def.output()) {
      local_output_blobs_.push_back(local_ws_.GetBlob(name));
      CHECK_NOTNULL(local_output_blobs_.back());
    }
  }

  bool RunOnDevice() override {
    bool need_sync = false;
    for (int i = 0; i < InputSize(); ++i) {
      if (OperatorBase::InputIsType<TensorCUDA>(i)) {
        // GPU input: copy it into the local CPU workspace.
        local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
            Input(i), &context_);
        need_sync = true;
      } else {
        VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
        // Note(jiayq): This removes a const but conceptually
        // local_input_blobs will only be used as const blob input for the
        // base op so we are still fine.
        local_input_blobs_[i]->ShareExternal(
            const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
            OperatorBase::Inputs()[i]->meta());
      }
    }

    // Sync to make sure the copies are done.
    if (need_sync) {
      context_.FinishDeviceComputation();
    }

    if (!base_op_->Run()) {
      LOG(ERROR) << "Base op run failed in GPUFallbackOp. Def: "
                 << ProtoDebugString(def());
      return false;
    }
    for (int i = 0; i < OutputSize(); ++i) {
      if (SkipOutputCopy::Contains(i)) {
        VLOG(1) << "Copy output: index " << i << " skipped.";
        continue;
      }
      CAFFE_ENFORCE(
          local_output_blobs_[i]->template IsType<TensorCPU>(),
          "GPU fallback op currently does not support non-TensorCPU "
          "output types that need copying.");
      Output(i)->CopyFrom(
          local_output_blobs_[i]->template Get<TensorCPU>(), &context_);
    }
    return true;
  }

 protected:
  Workspace local_ws_;
  vector<Blob*> local_input_blobs_;
  vector<Blob*> local_output_blobs_;
  std::unique_ptr<CPUOp> base_op_;
};
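
// Data-flow sketch (hypothetical blob names; MyMagic as in the class comment
// above): for a net op
//   op { type: "MyMagic" input: "X" output: "Y"
//        device_option { device_type: CUDA } }
// RunOnDevice() copies the TensorCUDA "X" into the local CPU workspace, runs
// the wrapped MyMagicOp there, and copies the TensorCPU "Y" it produces back
// into the operator's TensorCUDA output.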

} // namespace caffe2

#endif // CAFFE2_OPERATORS_OPERATOR_FALLBACK_H_