Summary:
Add a calibration module called histogram binning:
Divide the prediction range (e.g., [0, 1]) into B bins. For each bin, keep two counters: the number of positive examples and the total number of examples that fall into that bucket. Together these counters form a histogram over the model's predictions.
As a result, each bin carries an empirical estimate of the real CTR (num_pos / num_example). We use this empirical value as the final calibrated prediction whenever the pre-calibration prediction falls into the corresponding bin.
With sufficient examples per bin, the predictions within each bin are therefore well-calibrated, so this module yields a fine-grained calibrated model.
In theory, this calibration layer can fix any uncalibrated model or prediction given enough bins and examples. It also makes it safe to apply any kind of weight allocation to the training data without worrying about calibration.
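To make the binning idea concrete, here is a minimal, standalone C++ sketch (assuming B equal-width bins over [0, 1]; the class name and interface are illustrative only and do not correspond to the actual dper3 calibration module):

#include <cstdint>
#include <vector>

// Illustrative histogram binning calibrator: one pair of counters per bin.
class HistogramBinningCalibrator {
 public:
  explicit HistogramBinningCalibrator(int num_bins)
      : num_examples_(num_bins, 0), num_positives_(num_bins, 0) {}

  // Accumulate one (prediction, label) pair into the bin the prediction falls into.
  void Observe(float prediction, bool label) {
    int bin = BinIndex(prediction);
    num_examples_[bin] += 1;
    if (label) {
      num_positives_[bin] += 1;
    }
  }

  // Calibrated prediction = empirical positive rate (num_pos / num_example) of
  // the bin; fall back to the raw prediction if the bin is still empty.
  float Calibrate(float prediction) const {
    int bin = BinIndex(prediction);
    if (num_examples_[bin] == 0) {
      return prediction;
    }
    return static_cast<float>(num_positives_[bin]) / num_examples_[bin];
  }

 private:
  int BinIndex(float prediction) const {
    int num_bins = static_cast<int>(num_examples_.size());
    int bin = static_cast<int>(prediction * num_bins);
    // Clamp so that prediction == 1.0 maps to the last bin.
    if (bin < 0) {
      bin = 0;
    }
    if (bin >= num_bins) {
      bin = num_bins - 1;
    }
    return bin;
  }

  std::vector<int64_t> num_examples_;
  std::vector<int64_t> num_positives_;
};

With enough examples per bin, Calibrate() returns the observed CTR of that bin, which is exactly the fine-grained per-bin calibration described above.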
Test Plan:
buck test dper3/dper3/modules/calibration/tests:calibration_test -- test_histogram_binning_calibration
buck test dper3/dper3_models/ads_ranking/tests:model_paradigm_e2e_tests -- test_sparse_nn_histogram_binning_calibration
All tests passed.
Example workflows:
f215431958
{F326445092}
f215445048
{F326445223}
Reviewed By: chenshouyuan
Differential Revision: D23356450
fbshipit-source-id: c691b66c51ef33908c17575ce12e5bee5fb325ff
#ifndef CAFFE2_OPERATORS_UTILITY_OPS_H_
#define CAFFE2_OPERATORS_UTILITY_OPS_H_

#include <cmath>
#include <map>
#include <utility>

#include "caffe2/core/common_omp.h"
#include "caffe2/core/context.h"
#include "caffe2/core/export_caffe2_op_to_c10.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"
#include "caffe2/operators/gather_op.h"
#include "caffe2/utils/conversions.h"
#include "caffe2/utils/math.h"

C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(GatherRangesOp);
C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(LengthsGatherOp);

namespace caffe2 {

template <class Context>
class NanCheckOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  template <class... Args>
  explicit NanCheckOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {}

  bool RunOnDevice() override;

 private:
  TensorPrinter tensorPrinter_;
  Tensor scratch_;
};

struct GetNanCheckGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  std::vector<OperatorDef> GetGradientDefs() override {
    return {CreateOperatorDef(
        "NanCheck",
        "",
        std::vector<string>{GO(0)},
        std::vector<string>{GI(0)})};
  }
};

template <class Context>
class IsNanOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  IsNanOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws) {}

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<float, double>>::call(this, Input(0));
  }

  template <typename T>
  bool DoRunWithType() {
    auto& X = Input(0);
    auto* Y = Output(0, X.sizes(), at::dtype<uint8_t>());
    const auto* X_data = X.template data<T>();
    uint8_t* Y_data = Y->template mutable_data<uint8_t>();
    for (size_t i = 0; i < X.numel(); i++) {
      Y_data[i] = (uint8_t)(std::isnan(X_data[i]));
    }
    return true;
  }
};

template <class Context>
class WallClockTimeOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit WallClockTimeOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {}

  bool RunOnDevice() override {
    int64_t nanoseconds = static_cast<long int>(
        std::chrono::duration_cast<std::chrono::nanoseconds>(
            std::chrono::high_resolution_clock::now().time_since_epoch())
            .count());

    TensorCPU* output = Output(0);
    output->Resize();
    *output->template mutable_data<int64_t>() = nanoseconds;

    return true;
  }
};

const char kPrintFileExtension[] = ".log";

template <class Context>
class PrintOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_DISPATCH_HELPER;
  explicit PrintOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<Context>(operator_def, ws),
        tensor_printer_(
            operator_def.input(0),
            this->template GetSingleArgument<int>("to_file", 0)
                ? ws->RootFolder() + "/" + operator_def.input(0) +
                    kPrintFileExtension
                : "",
            this->template GetSingleArgument<int>("limit", 0)),
        every_n_(this->template GetSingleArgument<int>("every_n", 1)) {
    CAFFE_ENFORCE_GE(every_n_, 1);
  }

  bool RunOnDevice() override {
    if (++occurrences_mod_n_ > every_n_) {
      occurrences_mod_n_ -= every_n_;
    }
    if (occurrences_mod_n_ != 1) {
      return true;
    }

    if (!this->InputIsTensorType(0, Context::GetDeviceType()) &&
        !this->InputIsTensorType(0, CPU)) {
      LOG(INFO) << "Blob of type: "
                << OperatorBase::Inputs().at(0)->meta().name();
      return true;
    }
    // special-case empty tensors since they may have no meta()
    if (Input(0).numel() == 0) {
      tensor_printer_.PrintMeta(Input(0));
      return true;
    }

    using Types = TensorTypes<
        float,
        double,
        int,
        long,
        bool,
        char,
        unsigned char,
        std::string>;

    if (this->InputIsTensorType(0, CPU)) {
      return DispatchHelper<Types>::call(
          this, this->template Input<Tensor>(0, CPU));
    } else {
      return DispatchHelper<Types>::call(this, Input(0));
    }
  }

 private:
  template <typename T>
  bool DoRunWithType() {
    // A simple strategy to copy tensor if needed, and have the tensor pointer
    // pointing to the right instantiation. Note that tensor_copy_if_needed
    // will handle memory deallocation itself so no smart pointer is needed.
    const TensorCPU* tensor;
    Tensor tensor_copy_if_needed(CPU);
    if (this->InputIsTensorType(0, CPU)) {
      tensor = &this->template Input<Tensor>(0, CPU);
    } else {
      // sync copy
      tensor_copy_if_needed.CopyFrom(Input(0));
      tensor = &tensor_copy_if_needed;
    }
    tensor_printer_.Print<T>(*tensor);
    return true;
  }

 private:
  TensorPrinter tensor_printer_;
  int every_n_;
  int occurrences_mod_n_{0};
};

/**
 * @brief Alias op makes the output and the input share the same underlying
 * storage.
 *
 * WARNING: in general, in caffe2's operator interface different tensors should
 * have different underlying storage, which is the assumption made by
 * components such as the dependency engine and memory optimization. Thus, in
 * normal situations you should not use the AliasOp, especially in a normal
 * forward-backward pass.
 *
 * The Alias op is provided so one can achieve true asynchrony, such as
 * Hogwild, in a graph. But make sure you understand all the implications
 * similar to multi-thread computation before you use it explicitly.
 */
template <class Context>
class AliasOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(AliasOp);

  bool RunOnDevice() override {
    auto& input = Input(0);
    CAFFE_ENFORCE_GE(input.numel(), 0, "Tensor is not initialized");
    OutputTensorAlias(0, input);
    return true;
  }
};

/**
 * @brief Pass inputs to outputs.
 * Input:
 *   DATA - dense tensor.
 * Output:
 *   DATA - same tensor as input.
 */
template <class Context>
class EnsureDenseOp final : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(EnsureDenseOp)

  bool RunOnDevice() override {
    const auto& input = Input(0);
    auto* output = Output(0);
    CAFFE_ENFORCE_GT(input.dim(), 0, "Input has to be at least a vector.");
    // it is allowed to have the output inplace overwrite the input but also
    // allow the output to be copied from the input
    if (&input != output) {
      output->ResizeLike(input);
      output->CopyFrom(input, true /*async*/);
    }
    return true;
  }
};

template <class Context>
class FlattenToVecOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(FlattenToVecOp);

  bool RunOnDevice() override {
    auto& input = Input(0);
    auto* output = Output(0);
    CAFFE_ENFORCE_GE(input.dim(), 1, "The rank of the tensor must be >= 1.");
    output->Resize(input.numel());

    context_.CopyItemsSameDevice(
        input.dtype(),
        input.numel(),
        input.raw_data(),
        output->raw_mutable_data(input.dtype()));
    return true;
  }
};

// Output gets the data of input(0), but reshapes it like input(1).
template <class Context>
class ResizeLikeOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(ResizeLikeOp);

  bool RunOnDevice() override {
    auto& input0 = Input(0);
    auto& input1 = Input(1);
    auto* output = Output(0);
    CAFFE_ENFORCE_EQ(input0.numel(), input1.numel());
    output->ResizeLike(Input(1));
    context_.CopyItemsSameDevice(
        input0.dtype(),
        input0.numel(),
        input0.raw_data(),
        output->raw_mutable_data(input0.dtype()));
    return true;
  }
};

template <class Context>
class SumOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(SumOp);

  template <typename T>
  bool DoRunWithType() {
    auto& input0 = Input(0);

    if (InputSize() == 1) {
      // TODO: better TensorOptions argument passing(e.g. default argument)
      OutputTensorCopyFrom(
          0,
          // I'll change the order of argument in another diff, so that we don't
          // need to write this
          at::dtype(input0.dtype()),
          input0,
          true /*async*/);
      return true;
    }
    auto* output = Output(0, input0.sizes(), at::dtype<T>());
    T* output_data = output->template mutable_data<T>();
    // Dimension checking
    for (int i = 1; i < InputSize(); ++i) {
      if (output->sizes() != Input(i).sizes()) {
        CAFFE_THROW(
            "Check failed: output->sizes() == Input(i).sizes().",
            "Description: Input #",
            i,
            ", input dimension:",
            Input(i).sizes(),
            " should match output dimension: ",
            output->sizes());
      }
    }

    // Add the first two - works if in-place or not.
    math::Add(
        output->numel(),
        input0.template data<T>(),
        Input(1).template data<T>(),
        output_data,
        &context_);
    // Add remaining.
    for (int i = 2; i < InputSize(); ++i) {
      math::Add(
          output->numel(),
          output_data,
          Input(i).template data<T>(),
          output_data,
          &context_);
    }
    return true;
  }

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<float, double, int32_t, int64_t>>::call(
        this, Input(0));
  }
};

inline OpSchema::Cost CostInferenceForSum(
    const OperatorDef& def,
    const std::vector<TensorShape>& in) {
  struct OpSchema::Cost cost = PointwiseCostInference<1>(def, in);
  cost.flops *= (in.size() - 1);
  cost.params_bytes = 0;
  return cost;
}

// WeightedSumOp computes the weighted sum of several tensors. The input should
// be in the form X_0, weight_0, X_1, weight_1, ... where X_i all have the same
// shape, and weight_i are size 1 tensors that specifies the weight of each
// vector. Note that if one wants to do in-place computation, it could only be
// done with X_0 also as the output, but not other X_i.
template <class Context>
class WeightedSumOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(WeightedSumOp);

  bool RunOnDevice() override;

  template <typename T>
  bool DoRunWithType() {
    // the code is written this way because of 10.1 + gcc 7.3.1 compiler bug
    // as discussed at
    // https://devtalk.nvidia.com/default/topic/1048037/linux/cuda-10-1-nvidia-you-re-now-quot-fixing-quot-gcc-bugs-that-gcc-doesn-t-even-have/
    const int input_size = (*this).InputSize();
    CAFFE_ENFORCE_EQ(input_size % 2, 0);
    const auto& X0 = Input(0);
    const auto& weight0 = Input(1);
    CAFFE_ENFORCE_GT(X0.numel(), 0);
    CAFFE_ENFORCE_EQ(weight0.numel(), 1);
    const int size = X0.numel();
    // Note: removed Aliasing check, since Output already has
    // caching capability
    auto* Y = Output(0, X0.sizes(), at::dtype<T>());
    T* Y_data = Y->template mutable_data<T>();
    if (input_size == 2) {
      math::Scale<float, T>(
          size,
          weight0.template data<float>(),
          X0.template data<T>(),
          Y_data,
          &context_);
      return true;
    }
    const auto& X1 = Input(2);
    CAFFE_ENFORCE(
        !IsInputOutputAlias(2, 0),
        "Input #2 is the same as output. If you want to do in-place updates, "
        "put the output as input #0.");
    const auto& weight1 = Input(3);
    CAFFE_ENFORCE_EQ(X1.numel(), size);
    CAFFE_ENFORCE_EQ(weight1.numel(), 1);
    if (!IsInputOutputAlias(0, 0)) {
      context_.template CopySameDevice<T>(size, X0.template data<T>(), Y_data);
    }
    math::Axpby<float, T, Context>(
        size,
        weight1.template data<float>(),
        X1.template data<T>(),
        weight0.template data<float>(),
        Y_data,
        &context_);
    for (int i = 4; i < input_size; i += 2) {
      const auto& Xi = Input(i);
      // Do a check: if the input is the same as output, we have a problem -
      // in-place update should always only happen with the zeroth input.
      const std::string err_msg = "Input #" + to_string(i) +
          " is the same as output. If you want to do in-place updates, "
          "put the output as input #0.";
      CAFFE_ENFORCE(!IsInputOutputAlias(i, 0), err_msg);
      const auto& weighti = Input(i + 1);
      CAFFE_ENFORCE_EQ(Xi.numel(), size);
      CAFFE_ENFORCE_EQ(weighti.numel(), 1);
      math::Axpy<float, T, Context>(
          size,
          weighti.template data<float>(),
          Xi.template data<T>(),
          Y_data,
          &context_);
    }
    return true;
  }
};

template <class Context>
class WeightedSumGradientOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit WeightedSumGradientOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...),
        grad_on_w_(this->template GetSingleArgument<bool>("grad_on_w", false)) {
  }

  template <typename DstType>
  bool DoRunWithType() {
    CAFFE_ENFORCE_EQ(InputSize() % 2, 1);
    auto output_size = grad_on_w_ ? InputSize() - 1 : InputSize() / 2;
    CAFFE_ENFORCE_EQ(OutputSize(), output_size);

    auto& dY = Input(0);
    const auto* dY_data = dY.template data<DstType>();
    int size = dY.numel();

    // The input size should be the input size of the forward op plus 1
    for (int i = 0; i < InputSize() / 2; i++) {
      auto& cur_w = Input(2 * i + 2);
      CAFFE_ENFORCE_EQ(cur_w.numel(), 1);

      auto* cur_dX = Output(i, dY.sizes(), at::dtype<DstType>());

      math::Scale<float, DstType, Context>(
          size,
          cur_w.template data<float>(),
          dY_data,
          cur_dX->template mutable_data<DstType>(),
          &context_);

      if (grad_on_w_) {
        auto& cur_X = Input(2 * i + 1);
        CAFFE_ENFORCE_EQ(cur_X.numel(), size);
        auto* cur_dw = Output(i + output_size / 2);
        cur_dw->Resize(1);
        math::Dot<DstType, Context>(
            size,
            dY_data,
            cur_X.template data<DstType>(),
            cur_dw->template mutable_data<float>(),
            &context_);
      }
    }

    return true;
  }

  bool RunOnDevice() override;

 private:
  bool grad_on_w_;
};

/**
 * @brief Update slices of the tensor in-place with weighted sum.
 *
 * ScatterWeightedSumOp is similar to WeightedSum and computes the weighted sum
 * of several tensors. The first tensor has to be in-place and only slices of it
 * on the first dimension as indexed by INDICES will be updated.
 *
 * Input:
 *   X_0 - tensor to be updated
 *   weight_0 - scalar weight for X_0, applied only to slices affected,
 *   INDICES - 1-D list of indices on the first dimension of X_0 that need to be
 * updated
 *   X_1 - update slices, has to have shape of len(INDICES) + shape(X_0)[1:]
 *   weight_1 - scalar weight for X_1 update
 *   X_2, weight_2, ...
 *
 * Output:
 *   X_0 - has to be exactly the same tensor as the input 0
 *
 * Note: The op pretty much ignores the exact shapes of the input arguments and
 * cares only about sizes. It's done for performance consideration to avoid
 * unnecessary reshapes. Only first dimension of X_0 is important, let's call it
 * N. If M is the total size of X_0 and K is the size of INDICES then X_i is
 * assumed to be of shape K x (M / N) regardless of the real shape.
 *
 * Note: Each update in INDICES is applied independently which means that if
 * duplicated elements are present in INDICES the corresponding slice of X_0
 * will be scaled multiple times. Manual collapsing of INDICES is required
 * beforehand if necessary.
 *
 * Note: Updates are applied sequentially by inputs which might have undesired
 * consequences if the input tensor is accessed concurrently by different op
 * (e.g. when doing Hogwild). Other threads might see intermediate results even
 * on individual slice level, e.g. X_0 scaled by weight_0 but without any
 * updates applied.
 *
 * For now really works only on CPU because of INDICES access
 */
template <typename T, class Context>
class ScatterWeightedSumOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(ScatterWeightedSumOp);
  USE_DISPATCH_HELPER;

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(2));
  }

 private:
  template <typename Index>
  bool DoRunWithType() {
    int64_t block_size = Input(0).size_from_dim(1);
    return DispatchHelper<FixedValues<1>, Index>::call(this, block_size);
  }

  template <typename Index, int FixedSize>
  bool DoRunWithValue() {
    CAFFE_ENFORCE_EQ(InputSize() % 2, 1);
    auto& X0 = Input(0);
    auto& weight0 = Input(1);
    auto& indices = Input(2);
    auto* output = Output(0);
    CAFFE_ENFORCE_EQ(&X0, output, "In place operation is required");

    CAFFE_ENFORCE_GT(X0.numel(), 0);
    CAFFE_ENFORCE_GT(X0.dim(), 0, "X0 has to be at least the vector");
    CAFFE_ENFORCE_EQ(weight0.numel(), 1);
    int64_t M = X0.numel();
    int64_t N = X0.size(0);
    int64_t K = indices.numel();
    int64_t block_size = M / N;
    T* data = output->template mutable_data<T>();
    const Index* idxs = indices.template data<Index>();
    T w0 = *weight0.template data<T>();
    // It's most likely a constant so exact comparison is fine
    if (w0 != 1.0) {
      for (int i = 0; i < K; ++i) {
        Index idx = idxs[i];
        CAFFE_ENFORCE(
            0 <= idx && idx < N,
            "Index out of bounds: ",
            idx,
            ", range 0 to ",
            N);
        math::ScaleFixedSize<T, Context, FixedSize>(
            block_size,
            w0,
            data + block_size * idx,
            data + block_size * idx,
            &context_);
      }
    }
    for (int inp = 3; inp < InputSize(); inp += 2) {
      auto& X = Input(inp);
      auto& weight = Input(inp + 1);
      CAFFE_ENFORCE_EQ(X.numel(), block_size * K);
      CAFFE_ENFORCE_EQ(weight.numel(), 1);
      const T* x_data = X.template data<T>();
      T w = *weight.template data<T>();
      for (int i = 0; i < K; ++i) {
        Index idx = idxs[i];
        // double-checking the indices, but it's fine as it's DCHECK only
        DCHECK(0 <= idx && idx < N)
            << "Index out of bounds: " << idx << ", range 0 to " << N;
        math::AxpyFixedSize<T, Context, FixedSize>(
            block_size,
            w,
            x_data + block_size * i,
            data + block_size * idx,
            &context_);
      }
    }
    return true;
  }
  Tensor x_data_host_;
  Tensor weights_host_;
  Tensor x_data_device_;
  Tensor weights_device_;
};

/**
 * @brief Update slices of the tensor in-place by overriding.
 *
 * Input:
 *   DATA - tensor to be updated
 *   INDICES - 1-D list of indices on the first dimension of X_0 that need to be
 * updated
 *   SLICES - update slices, has to have shape of len(INDICES) + shape(X_0)[1:]
 *
 * Output:
 *   DATA - has to be exactly the same tensor as the input 0
 *
 * Note: The op pretty much ignores the exact shapes of the input arguments and
 * cares only about sizes. It's done for performance consideration to avoid
 * unnecessary reshapes. Only first dimension of X_0 is important, let's call it
 * N. If M is the total size of X_0 and K is the size of INDICES then X_i is
 * assumed to be of shape K x (M / N) regardless of the real shape.
 *
 * Note: Each update in INDICES is applied independently which means that if
 * duplicated elements are present in INDICES arbitrary one will win.
 *
 * For now really works only on CPU because of INDICES access
 */
template <class Context>
class ScatterAssignOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  virtual ~ScatterAssignOp() {}

  template <class... Args>
  explicit ScatterAssignOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...),
        runners_({{{TensorProto_DataType_INT32, TensorProto_DataType_FLOAT},
                   &ScatterAssignOp::DoRun<int32_t, float>},
                  {{TensorProto_DataType_INT32, TensorProto_DataType_FLOAT16},
                   &ScatterAssignOp::DoRun<int32_t, at::Half>},
                  {{TensorProto_DataType_INT32, TensorProto_DataType_UINT8},
                   &ScatterAssignOp::DoRun<int32_t, uint8_t>},
                  {{TensorProto_DataType_INT32, TensorProto_DataType_INT32},
                   &ScatterAssignOp::DoRun<int32_t, int32_t>},
                  {{TensorProto_DataType_INT32, TensorProto_DataType_INT64},
                   &ScatterAssignOp::DoRun<int32_t, int64_t>},
                  {{TensorProto_DataType_INT32, TensorProto_DataType_DOUBLE},
                   &ScatterAssignOp::DoRun<int32_t, double>},
                  {{TensorProto_DataType_INT64, TensorProto_DataType_FLOAT},
                   &ScatterAssignOp::DoRun<int64_t, float>},
                  {{TensorProto_DataType_INT64, TensorProto_DataType_FLOAT16},
                   &ScatterAssignOp::DoRun<int64_t, at::Half>},
                  {{TensorProto_DataType_INT64, TensorProto_DataType_UINT8},
                   &ScatterAssignOp::DoRun<int64_t, uint8_t>},
                  {{TensorProto_DataType_INT64, TensorProto_DataType_INT32},
                   &ScatterAssignOp::DoRun<int64_t, int32_t>},
                  {{TensorProto_DataType_INT64, TensorProto_DataType_INT64},
                   &ScatterAssignOp::DoRun<int64_t, int64_t>},
                  {{TensorProto_DataType_INT64, TensorProto_DataType_DOUBLE},
                   &ScatterAssignOp::DoRun<int64_t, double>}}) {}

  bool RunOnDevice() override {
    const auto& data = Input(DATA);
    const auto& slices = Input(SLICES);
    auto& indices = Input(INDICES);

    const auto dataType = TypeMetaToDataType(data.dtype());
    const auto slicesType = TypeMetaToDataType(slices.dtype());
    const auto indicesType = TypeMetaToDataType(indices.dtype());
    auto* output = Output(0);

    auto runner = GetRunner(dataType, slicesType, indicesType);
    (this->*runner)();
    return true;
  }

 private:
  typedef void (ScatterAssignOp::*RunnerType)();
  typedef std::
      map<std::pair<TensorProto_DataType, TensorProto_DataType>, RunnerType>
          RunnerMap;

  RunnerMap runners_;

  RunnerType GetRunner(
      const TensorProto_DataType dataType,
      const TensorProto_DataType slicesType,
      const TensorProto_DataType indicesType) {
    CAFFE_ENFORCE_EQ(dataType, slicesType, "Data and slice types must match");
    auto it = runners_.find({indicesType, dataType});
    CAFFE_ENFORCE(
        it != runners_.end(),
        "Could not find the runner corresponding to indicesType, dataType = ",
        indicesType,
        " ",
        dataType);
    return it->second;
  }

  template <typename Index, typename T>
  void DoRun() {
    auto& input = Input(DATA);
    auto& indices = Input(INDICES);
    auto& slices = Input(SLICES);
    auto* output = Output(0);
    CAFFE_ENFORCE_EQ(&input, output, "In place operation is required");

    CAFFE_ENFORCE_GT(input.dim(), 0, "X0 has to be at least the vector");
    int64_t M = input.numel();
    int64_t N = input.size(0);
    int64_t K = indices.numel();
    int64_t block_size = M / N;
    CAFFE_ENFORCE_EQ(slices.numel(), block_size * K);
    // TODO(dzhulgakov): it can be made to work with arbitrary data type by
    // using raw_mutable_data
    T* data = output->template mutable_data<T>();
    const Index* idxs = indices.template data<Index>();
    const T* slicesData = slices.template data<T>();
    DoScatterAssign(data, idxs, slicesData, N, K, block_size);
  }

  template <typename Index, typename T>
  void DoScatterAssign(
      T* data,
      const Index* idxs,
      const T* slicesData,
      int64_t N,
      int64_t K,
      int64_t block_size) {
    for (int i = 0; i < K; ++i) {
      Index idx = idxs[i];
      // double-checking the indices, but it's fine as it's DCHECK only
      DCHECK(0 <= idx && idx < N)
          << "Index out of bounds: " << idx << ", range 0 to " << N;
      context_.template CopySameDevice<T>(
          block_size, slicesData + block_size * i, data + block_size * idx);
    }
  }

  INPUT_TAGS(DATA, INDICES, SLICES);
};

template <class Context>
class ScatterOp : public Operator<CPUContext> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit ScatterOp(Args&&... args)
      : Operator<CPUContext>(std::forward<Args>(args)...),
        OP_SINGLE_ARG(int, "axis", axis_, 1) {}

  virtual ~ScatterOp() noexcept override {}

  bool RunOnDevice() override {
    TORCH_CHECK(
        Context::GetDeviceType() == kCPU,
        "ScatterOp currently only supports CPU.")

    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
        this, this->template Input<Tensor>(INDICES, CPU));
  }

  template <typename IndexType>
  bool DoRunWithType() {
    const Tensor& data = Input(DATA);
    const Tensor& indices = Input(INDICES);
    const Tensor& updates = Input(UPDATES);
    const TypeMeta dataType = data.dtype();
    size_t item_bytesize = dataType.itemsize();

    // ONNX allows negative axis to index from the back, valid range: [-r, r].
    axis_ = data.canonical_axis_index(axis_);

    CAFFE_ENFORCE_GE(
        data.dim(), axis_ + 1, "DATA should be at least [axis+1]-D");
    CAFFE_ENFORCE_GE(axis_, 0, "Axis should be non-negative");
    CAFFE_ENFORCE_LT(axis_, data.dim(), "Axis out of range");

    Tensor* output = Output(0, data.sizes().vec(), at::dtype(dataType));
    output->CopyFrom(data);
    char* out = static_cast<char*>(output->raw_mutable_data(dataType));

    // Succeed if size of output is zero, which can happen for empty batch which
    // would have data dimension size of 0.
    // This *must* be done AFTER output->raw_mutable_data() above as that has
    // important allocation side effect that we must see.
    if (output->numel() == 0) {
      return true;
    }

    const IndexType* idxs = indices.template data<IndexType>();
    const char* src_base = static_cast<const char*>(updates.raw_data());

    const int64_t outer_dims_product = indices.size_to_dim(axis_);

    const int64_t dst_indexing_axis_dim = data.size(axis_);

    const int64_t idxs_block_size = indices.size_from_dim(axis_ + 1);
    const int64_t src_block_size = updates.size_from_dim(axis_ + 1);
    const int64_t dst_block_size = data.size_from_dim(axis_ + 1);

    const int64_t idxs_batch_size = indices.size_from_dim(axis_);
    const int64_t src_batch_size = updates.size_from_dim(axis_);
    const int64_t dst_batch_size = data.size_from_dim(axis_);

    const int64_t N = indices.size(axis_);

    check_indexarray_range<IndexType>(idxs, N, dst_indexing_axis_dim);

    // For a 3-D tensor, dst is updated as:
    // dst[i][idxs[i][j][k]][k] = src[i][j][k] # if dim == 1
    // where i, j, k are iterating over their corresponding axis I, J, K.
    // For a given i, j, k tuple.
    // idxs offset can be computed as i * J_src * K + j * K + k.
    // src offset can be computed as i * J_src * K + j * K + k.
    // dst offset can be computed as i * J_dst * K + idxs[idxs_offset] * K + K
    // Note that idxs and src should have the same rank and shape.
    // dst should have the same rank as idxs and src, but the dimension of dim
    // axis can be different. That is why in the above equation, there is the
    // difference of J_src and J_dst.
    for (int64_t outer_batch = 0; outer_batch < outer_dims_product;
         ++outer_batch) {
      for (int64_t i = 0; i < N; ++i) {
        for (int64_t inner_batch = 0; inner_batch < idxs_block_size;
             ++inner_batch) {
          auto idxs_elem_idx =
              outer_batch * idxs_batch_size + i * idxs_block_size + inner_batch;
          auto src_elem_idx =
              outer_batch * src_batch_size + i * src_block_size + inner_batch;
          auto dst_elem_idx = outer_batch * dst_batch_size +
              idxs[idxs_elem_idx] * dst_block_size + inner_batch;

          auto src = src_base + src_elem_idx * item_bytesize;
          auto dst = out + dst_elem_idx * item_bytesize;
          context_.CopyItemsSameDevice(dataType, 1, src, dst);
        }
      }
    }
    return true;
  }

  INPUT_TAGS(DATA, INDICES, UPDATES);

  // Check that indices fall within dimension array size with CAFFE_ENFORCE.
  template <typename IndexType>
  static void check_indexarray_range(
      const IndexType* indices,
      int64_t n,
      IndexType indexing_axis_dim) {
    for (auto i = 0; i < n; ++i) {
      auto idx = indices[i];
      CAFFE_ENFORCE(
          0 <= idx && idx < indexing_axis_dim,
          "INDICES element is out of DATA bounds, id=",
          idx,
          " axis_dim=",
          indexing_axis_dim);
    }
  }

 protected:
  int axis_;
};

template <class Context>
class LengthsToSegmentIdsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(LengthsToSegmentIdsOp);

  bool RunOnDevice() override {
    auto& input = Input(0);
    auto* output = Output(0);
    auto* input_data = input.template data<int32_t>();

    CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector.");
    auto total_length =
        std::accumulate(input_data, input_data + input.numel(), 0);

    output->Resize(total_length);
    auto* output_data = output->template mutable_data<int32_t>();

    for (int i = 0; i < input.numel(); ++i) {
      auto len = input_data[i];
      std::fill(output_data, output_data + len, i);
      output_data += len;
    }
    return true;
  }
};

template <class Context>
class LengthsToRangesOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(LengthsToRangesOp);

  bool RunOnDevice() override {
    auto& input = Input(0);
    auto* output = Output(0);
    auto* input_data = input.template data<int32_t>();

    CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector.");
    auto size = input.numel();

    output->Resize(size, 2);
    auto* output_data = output->template mutable_data<int32_t>();

    int32_t offset = 0;
    for (int i = 0; i < size; ++i) {
      auto len = input_data[i];
      output_data[i * 2] = offset;
      output_data[i * 2 + 1] = len;
      offset += len;
    }
    return true;
  }
};

template <class Context>
class SegmentIdsToLengthsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(SegmentIdsToLengthsOp);

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
  }

  template <typename Index>
  bool DoRunWithType() {
    auto& input = Input(0);
    if (input.dim() == 2) {
      CAFFE_ENFORCE(
          input.dim32(0) == 1 || input.dim32(1) == 1,
          "Input must be a vector.");
    } else {
      CAFFE_ENFORCE_EQ(input.dim(), 1, "Input must be a vector.");
    }
    auto* input_data = input.template data<Index>();
    auto input_size = input.numel();
    auto* output = Output(0);
    // segment id starts from 0
    auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0;
    if (InputSize() > 1) {
      CAFFE_ENFORCE_GE(Input(1).dim(), 1);
      CAFFE_ENFORCE_LE(
          num_segments,
          Input(1).size(0),
          "The number of segments inferred should *NOT* be larger "
          "than the size of Input(1)'s first dimension");
      num_segments = Input(1).size(0);
    }
    CAFFE_ENFORCE(0 <= num_segments, "Indices must be in 0..K-1 range");
    output->Resize(num_segments);
    auto* output_data = output->template mutable_data<int32_t>();
    if (num_segments == 0) {
      return true;
    }
    std::fill(output_data, output_data + num_segments, 0);
    Index prev = 0; // Assume that segment_id >= 0.
    for (int64_t i = 0; i < input_size; i++) {
      CAFFE_ENFORCE(
          prev <= input_data[i],
          "Segment ids must be sorted: ",
          prev,
          " vs ",
          input_data[i]);
      prev = input_data[i];
      output_data[input_data[i]] += 1;
    }

    return true;
  }
};

template <class Context>
class SegmentIdsToRangesOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(SegmentIdsToRangesOp);

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
  }

  template <typename Index>
  bool DoRunWithType() {
    auto& input = Input(0);
    CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector.");
    auto* input_data = input.template data<Index>();
    auto input_size = input.numel();
    auto* output = Output(0);
    // segment id starts from 0
    auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0;
    if (InputSize() > 1) {
      CAFFE_ENFORCE_GE(Input(1).dim(), 1);
      CAFFE_ENFORCE_LE(
          num_segments,
          Input(1).size(0),
          "The number of segments inferred should *NOT* be larger "
          "than the size of Input(1)'s first dimension");
      num_segments = Input(1).size(0);
    }
    CAFFE_ENFORCE(0 <= num_segments, "Indices must be in 0..K-1 range");
    output->Resize(num_segments, 2);
    auto* output_data = output->template mutable_data<int32_t>();
    if (num_segments == 0) {
      return true;
    }
    std::fill(output_data, output_data + num_segments * 2, 0);
    Index prev = input_data[0];
    for (int64_t i = 0; i < input_size; i++) {
      CAFFE_ENFORCE(
          prev <= input_data[i],
          "Segment ids must be sorted: ",
          prev,
          " vs ",
          input_data[i]);
      while (prev != input_data[i]) {
        ++prev;
        output_data[prev * 2] = i;
      }
      output_data[input_data[i] * 2 + 1] += 1;
    }

    return true;
  }
};

template <class Context>
class LengthsToWeightsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  template <class... Args>
  explicit LengthsToWeightsOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...),
        power_(this->template GetSingleArgument<float>("power", 0.5)) {}

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0));
  }

  template <typename Index>
  bool DoRunWithType() {
    auto& input = Input(0);
    CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector.");
    auto* input_data = input.template data<Index>();
    auto input_size = input.numel();
    auto* output = Output(0);

    int64_t output_size = 0;
    for (auto i = 0; i < input_size; i++) {
      CAFFE_ENFORCE_GE(input_data[i], 0, "unexpected negative length value");
      output_size += input_data[i];
    }

    std::function<float(const int64_t& length, const float& power)> getWeight;
    if (power_ == 0.5) {
      getWeight = [](const int64_t& length, const float& /*power*/) {
        return 1.0 / std::sqrt(length);
      };
    } else if (power_ == 1) {
      getWeight = [](const int64_t& length, const float& /*power*/) {
        return 1.0 / length;
      };
    } else {
      getWeight = [](const int64_t& length, const float& power) {
        return 1.0 / std::pow(length, power);
      };
    }

    output->Resize(output_size);
    auto* output_data = output->template mutable_data<float>();
    int64_t cnt = 0;
    for (auto i = 0; i < input_size; i++) {
      auto len = input_data[i];
      if (len == 0) {
        continue;
      }
      CAFFE_ENFORCE_LE(cnt + len, output_size, "unexpected lengths value");

      float weight_value = getWeight(len, power_);
      std::fill(output_data + cnt, output_data + cnt + len, weight_value);
      cnt += len;
    }

    return true;
  }

 private:
  float power_;
};

template <class Context>
class HasElementsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(HasElementsOp);

  bool RunOnDevice() override {
    bool res = false;
    for (auto i = 0; i < InputSize(); ++i) {
      const auto& input = Input(i);
      res = res || input.numel() > 0;
    }
    auto* output = Output(0);
    output->Resize(std::vector<int64_t>{});
    *output->template mutable_data<bool>() = res;
    return true;
  }
};

// Return the size of a tensor
template <class Context>
class SizeOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(SizeOp);

  bool RunOnDevice() override {
    auto& input = Input(0);

    auto* output = Output(0, vector<int64_t>(), at::dtype<int64_t>());
    auto* output_data = output->template mutable_data<int64_t>();

    auto size = input.numel();
    math::Set<int64_t, Context>(
        1, static_cast<int64_t>(size), output_data, &context_);

    return true;
  }
};

// returns a shape to be passed to Reshape
template <class Context>
class LengthsToShapeOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(LengthsToShapeOp);

  bool RunOnDevice() override {
    auto& input = Input(0);

    CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector.");
    auto* output = Output(0);
    auto* input_data = input.template data<int32_t>();

    auto size = input.numel();
    auto first = input_data[0];

    for (int i = 1; i < size; i++) {
      CAFFE_ENFORCE(
          input_data[i] == first, "All elements of input must be same ");
    }

    output->Resize(2);
    auto* output_data = output->template mutable_data<int32_t>();
    output_data[0] = size;
    output_data[1] = first;

    return true;
  }
};

template <class Context>
class GatherRangesOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(GatherRangesOp);

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
        this, this->template Input<Tensor>(RANGES, CPU));
  }

  template <typename Index>
  bool DoRunWithType() {
    auto& data = Input(DATA);
    auto& ranges = Input(RANGES);
    auto* outputData = Output(0);
    auto* outputLengths = Output(1);

    auto batchSize = ranges.size(0);
    CAFFE_ENFORCE(data.dim() == 1, "Data has to be 1-D");
    CAFFE_ENFORCE(ranges.dim() == 3, "Ranges must be 3-D");
    CAFFE_ENFORCE(ranges.size(1) > 0, "There has to be at least one range");
    CAFFE_ENFORCE_EQ(
        ranges.size(2), 2, "Ranges last dimension should be of size 2");

    auto* rawData = static_cast<const char*>(data.raw_data());
    auto* rangesData = ranges.template data<Index>();

    outputLengths->Resize(batchSize);
    auto* outputLengthsPtr = outputLengths->template mutable_data<int32_t>();
    size_t start = 0;
    size_t blockSize = ranges.size_from_dim(1);
    for (size_t i = 0; i < batchSize; ++i) {
      auto end = start + blockSize;
      outputLengthsPtr[i] = accumulate(rangesData, start, end);
      start = end;
    }

    size_t outputSize = accumulate(rangesData, 0, ranges.numel());
    outputData->Resize(outputSize);

    auto outputRawData =
        static_cast<char*>(outputData->raw_mutable_data(data.dtype()));
    VLOG(1) << "Copying data";
    size_t outputOffsetBytes = 0;
    auto itemsize = data.dtype().itemsize();
    for (int i = 0; i < ranges.numel(); i += 2) {
      auto rangeStart = rangesData[i];
      auto rangeLength = rangesData[i + 1];
      if (!rangeLength) {
        continue;
      }
      auto rangeSizeBytes = rangeLength * itemsize;
      CAFFE_ENFORCE(outputOffsetBytes < outputSize * itemsize);
      CAFFE_ENFORCE(rangeStart + rangeLength <= data.numel());
      context_.CopyItemsSameDevice(
          data.dtype(),
          rangeLength,
          rawData + rangeStart * itemsize,
          outputRawData + outputOffsetBytes);
      outputOffsetBytes += rangeSizeBytes;
    }
    CAFFE_ENFORCE(outputOffsetBytes == outputSize * itemsize);
    return true;
  }

  INPUT_TAGS(DATA, RANGES, LENGTHS);

 private:
  template <typename Index>
  size_t accumulate(Index* ranges, size_t start, size_t end) {
    size_t result = 0;
    for (size_t i = start + 1; i < end; i += 2) {
      result += ranges[i];
    }
    return result;
  }
};

template <class Context>
class LengthsGatherOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(LengthsGatherOp);

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
        this, this->template Input<Tensor>(INDICES, CPU));
  }

  template <typename Index>
  bool DoRunWithType() {
    auto& items = Input(ITEMS);
    auto& lengths = Input(LENGTHS);
    auto& indices = Input(INDICES);
    auto* output = Output(0);

    CAFFE_ENFORCE_GE(items.dim(), 1, "ITEMS should be at least 1-D");
    CAFFE_ENFORCE_EQ(lengths.dim(), 1, "LENGTHS should be 1-D");
    CAFFE_ENFORCE_EQ(indices.dim(), 1, "INDICES should be 1-D");

    const auto* lengths_data = lengths.template data<int32_t>();
    const auto* indices_data = indices.template data<Index>();

    int64_t total_length = 0;
    for (size_t i = 0; i < indices.numel(); ++i) {
      auto idx = indices_data[i];
      CAFFE_ENFORCE_LT(idx, lengths.numel());
      total_length += lengths_data[idx];
    }
    auto shape = items.sizes().vec();
    shape[0] = total_length;
    output->Resize(shape);

    offsets_.clear();
    int64_t running_offset = 0;
    offsets_.reserve(lengths.numel());
    for (size_t i = 0; i < lengths.numel(); ++i) {
      offsets_.push_back(running_offset);
      running_offset += lengths_data[i];
    }
    CAFFE_ENFORCE_EQ(
        items.size(0),
        running_offset,
        "LENGTHS must match the first dimension of ITEMS");

    auto src_base = static_cast<const char*>(items.raw_data());
    auto block_size = items.size_from_dim(1);
    auto block_bytesize = block_size * items.itemsize();
    auto out = static_cast<char*>(output->raw_mutable_data(items.dtype()));

    for (size_t i = 0; i < indices.numel(); ++i) {
      auto idx = indices_data[i];
      auto length = lengths_data[idx];
      context_.CopyItemsSameDevice(
          items.dtype(),
          length * block_size,
          src_base + offsets_[idx] * block_bytesize,
          out);
      out += length * block_bytesize;
    }
    return true;
  }

  std::vector<int64_t> offsets_;

  INPUT_TAGS(ITEMS, LENGTHS, INDICES);
};

template <typename T, class Context>
class AccumulateHistogramOp : public Operator<Context> {
 public:
  template <class... Args>
  explicit AccumulateHistogramOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...),
        lower_bound_(
            this->template GetSingleArgument<float>("lower_bound", 0.0)),
        upper_bound_(
            this->template GetSingleArgument<float>("upper_bound", 1.0)),
        num_buckets_(this->template GetSingleArgument<int>("num_buckets", 1)) {
    CAFFE_ENFORCE_GT(num_buckets_, 0);
    // 2 more for histograms < lower_bound, >= upper_bound respectively
    num_output_buckets_ = num_buckets_ + 2;
    accumulate_hist_ = std::vector<int64_t>(num_output_buckets_, 0);
  }

  USE_OPERATOR_CONTEXT_FUNCTIONS;

  bool RunOnDevice() override {
    auto& X = Input(X_IN);
    auto* X_data = X.template data<T>();
    int N = X.numel();
    auto* cur_hist = Output(CUR_HIST);
    auto* acc_hist = Output(ACC_HIST);
    cur_hist->Resize(num_output_buckets_);
    acc_hist->Resize(num_output_buckets_);
    auto* cur_hist_data = cur_hist->template mutable_data<int64_t>();
    auto* acc_hist_data = acc_hist->template mutable_data<int64_t>();
    auto segment = (upper_bound_ - lower_bound_) / num_buckets_;
    math::Set<int64_t, Context>(
        num_output_buckets_, 0, cur_hist_data, &context_);

    for (int i = 0; i < N; i++) {
      int bucket_index = -1;
      if (X_data[i] < lower_bound_) {
        bucket_index = 0;
      } else if (X_data[i] >= upper_bound_) {
        bucket_index = num_buckets_ + 1;
      } else {
        bucket_index = (int)((X_data[i] - lower_bound_) / segment) + 1;
      }
      cur_hist_data[bucket_index] += 1;
      accumulate_hist_[bucket_index] += 1;
    }

    for (int i = 0; i < num_output_buckets_; i++) {
      acc_hist_data[i] = accumulate_hist_[i];
    }

    return true;
  }

 private:
  float lower_bound_;
  float upper_bound_;
  int num_buckets_;
  int num_output_buckets_;
  std::vector<int64_t> accumulate_hist_;

  INPUT_TAGS(X_IN);
  OUTPUT_TAGS(CUR_HIST, ACC_HIST);
};

template <class Context>
class RangeOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(RangeOp)

  bool RunOnDevice() override {
    return DispatchHelper<TensorTypes<int32_t, int64_t, float, double>>::call(
        this, Input(0));
  }

  template <typename T>
  T readScalarInput(const int index) {
    if (std::is_same<Context, TensorCPU>::value) {
      return Input(index).template data<T>()[0];
    } else {
      local_.CopyFrom(Input(index));
      return local_.template data<T>()[0];
    }
  }

  template <typename T>
  bool DoRunWithType() {
    T stop = 0;
    T start = 0;
    T step = 1;

    for (int i = 0; i < InputSize(); ++i) {
      CAFFE_ENFORCE_EQ(
          Input(i).numel(), 1, "All inputs must be scalar/1D tensor.");
    }

    switch (InputSize()) {
      case 1:
        stop = readScalarInput<T>(0);
        break;
      case 2:
        start = readScalarInput<T>(0);
        stop = readScalarInput<T>(1);
        break;
      case 3:
        step = readScalarInput<T>(2);
        start = readScalarInput<T>(0);
        stop = readScalarInput<T>(1);
        break;
    }
    CAFFE_ENFORCE_NE(step, 0, "Step size cannot be 0.");
    int length;
    auto diff = stop - start;
    if (std::is_integral<T>::value) {
      // Avoid casting to and from floats in case it introduces rounding and
      // avoid mod because the compiler doesn't strip unused code until later.
      length = diff / step;
      if (length * step < diff) {
        length += 1;
      }
    } else {
      length = static_cast<int>(ceil(diff / step));
    }

    // Match numpy's behavior here.
    if (length <= 0) {
      Output(0, {0}, at::dtype<T>());
      return true;
    } else {
      auto* output = Output(0, {length}, at::dtype<T>());
      return DoRunOnDevice<T>(start, step, output);
    }
  }

  template <typename T>
  bool DoRunOnDevice(const T& start, const T& step, Tensor* output);

 private:
  // local CPU tensor for copying constants.
  Tensor local_{CPU};
};

class ThrowExceptionOp : public Operator<CPUContext> {
 public:
  template <class... Args>
  explicit ThrowExceptionOp(Args&&... args)
      : Operator<CPUContext>(std::forward<Args>(args)...),
        message_(GetSingleArgument<std::string>(
            "message",
            "Exception from ThrowExceptionOp")) {}

  bool RunOnDevice() override {
    CAFFE_THROW(message_);
  }

 private:
  const std::string message_;
};

class ThrowChildThreadExceptionOp : public Operator<CPUContext> {
 public:
  template <class... Args>
  explicit ThrowChildThreadExceptionOp(Args&&... args)
      : Operator<CPUContext>(std::forward<Args>(args)...),
        message_(GetSingleArgument<std::string>(
            "message",
            "Exception from ThrowChildThreadExceptionOp")) {}

  bool RunOnDevice() override {
    std::thread t([this]() { CAFFE_THROW(this->message_); });

    t.join();
    return true;
  }

 private:
  const std::string message_;
};

class LogFatalOp : public Operator<CPUContext> {
 public:
  template <class... Args>
  explicit LogFatalOp(Args&&... args)
      : Operator<CPUContext>(std::forward<Args>(args)...),
        message_(GetSingleArgument<std::string>(
            "message",
            "Logging from LogFatalOp")) {}

  bool RunOnDevice() override {
    LOG(FATAL) << message_;
    return true;
  }

 private:
  const std::string message_;
};

class FailOp : public Operator<CPUContext> {
 public:
  template <class... Args>
  explicit FailOp(Args&&... args)
      : Operator<CPUContext>(std::forward<Args>(args)...) {}

  bool RunOnDevice() override {
    return false;
  }
};

} // namespace caffe2

#endif // CAFFE2_OPERATORS_UTILITY_OPS_H_