#include "caffe2/operators/reduction_ops.h"
|
|
|
|
namespace caffe2 {
|
|
|
|
REGISTER_CPU_OPERATOR(SumElements, SumElementsOp<float, CPUContext>);
|
|
REGISTER_CPU_OPERATOR(SumElementsInt, SumElementsIntOp<int, CPUContext>);
|
|
REGISTER_CPU_OPERATOR(SumSqrElements, SumSqrElementsOp<CPUContext>);
|
|
|
|
REGISTER_CPU_OPERATOR(
|
|
SumElementsGradient,
|
|
SumElementsGradientOp<float, CPUContext>);
|
|
|
|
REGISTER_CPU_OPERATOR(RowwiseMax, MaxReductionOp<float, CPUContext, true>);
|
|
REGISTER_CPU_OPERATOR(
|
|
RowwiseMaxGradient,
|
|
MaxReductionGradientOp<float, CPUContext, true>);
|
|
REGISTER_CPU_OPERATOR(
|
|
ColwiseMaxGradient,
|
|
MaxReductionGradientOp<float, CPUContext, false>);
|
|
REGISTER_CPU_OPERATOR(ColwiseMax, MaxReductionOp<float, CPUContext, false>);
|
|
|
|
OPERATOR_SCHEMA(SumElements)
    .NumInputs(1)
    .NumOutputs(1)
    .ScalarType(TensorProto::FLOAT)
    .SetDoc(R"DOC(
Sums the elements of the input tensor. Tensor type must be float32.

Github Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/reduction_ops.cc

<details>

<summary> <b>Example</b> </summary>

**Code**

```

workspace.ResetWorkspace()

sum_op = core.CreateOperator(
    "SumElements",
    ["X"],
    ["Y"]
)

avg_op = core.CreateOperator(
    "SumElements",
    ["X"],
    ["Y"],
    average=True
)

workspace.FeedBlob("X", np.random.randint(10, size=(3,3)).astype(np.float32))
print("X:\n", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(sum_op)
print("Y (sum_op):", workspace.FetchBlob("Y"))
workspace.RunOperatorOnce(avg_op)
print("Y (avg_op):", workspace.FetchBlob("Y"))

```

**Result**

```

X:
 [[7. 2. 5.]
 [9. 4. 2.]
 [1. 2. 5.]]
Y (sum_op): 37.0
Y (avg_op): 4.111111

```

</details>

)DOC")
    .Arg(
        "average",
        "(*bool*): set to True to compute the average of the elements rather than the sum")
    .Input(
        0,
        "X",
        "(*Tensor`<float>`*): blob containing the input tensor to be summed")
    .Output(
        0,
        "sum",
        "(*Tensor`<float>`*): Scalar tensor containing the sum (or average)");

OPERATOR_SCHEMA(SumElementsInt)
    .NumInputs(1)
    .NumOutputs(1)
    .ScalarType(TensorProto::INT32)
    .SetDoc("Sums the integer elements of the input tensor.")
    .Input(0, "X", "Tensor to sum up")
    .Output(0, "sum", "Scalar sum");
SHOULD_NOT_DO_GRADIENT(SumElementsInt);
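
// Editorial usage sketch (hypothetical, not part of the registered docs),
// assuming the usual caffe2.python imports (core, workspace) and numpy as np:
//
//   op = core.CreateOperator("SumElementsInt", ["X"], ["Y"])
//   workspace.FeedBlob("X", np.arange(6, dtype=np.int32).reshape(2, 3))
//   workspace.RunOperatorOnce(op)
//   print(workspace.FetchBlob("Y"))  # 0 + 1 + ... + 5 = 15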

OPERATOR_SCHEMA(SumSqrElements)
    .NumInputs(1)
    .NumOutputs(1)
    .ScalarType(TensorProto::FLOAT)
    .SetDoc("Sums the squared elements of the input tensor.")
    .Arg("average", "whether to average the squared elements rather than sum them")
    .Input(0, "X", "Tensor to sum up")
    .Output(0, "sum", "Scalar sum of squares");

OPERATOR_SCHEMA(SumElementsGradient).NumInputs(2).NumOutputs(1);

class GetSumElementsGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    // Inputs to the gradient op: the forward input X and the upstream
    // gradient dY; output: the gradient with respect to X.
    return SingleGradientDef(
        "SumElementsGradient",
        "",
        vector<string>{I(0), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(SumElements, GetSumElementsGradient);
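
// Editorial note: for Y = sum_i X_i, dY/dX_i = 1, so dX_i = dY for every
// element; with average=True, Y = (1/n) * sum_i X_i with n = X.numel(), and
// dX_i = dY / n. SumElementsGradientOp::RunOnDevice, defined later in this
// file, broadcasts exactly that scalar.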

OPERATOR_SCHEMA(RowwiseMax)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Compute row-wise max reduction of the input tensor. This op takes one input, $X$, of shape $BxMxN$, where $B$ is the batch size, $M$ is the number of rows, and $N$ is the number of columns. The output of this op, $Y$, is a matrix of shape $BxM$: entry $(b, m)$ holds the maximum of row $m$ of batch element $b$.

Github Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/reduction_ops.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/reduction_ops.cc

<details>

<summary> <b>Example</b> </summary>

**Code**

```

workspace.ResetWorkspace()

op = core.CreateOperator(
    "RowwiseMax",
    ["X"],
    ["Y"]
)

# Create X, simulating a batch of two 4x4 matrices
X = np.random.randint(0, high=20, size=(2, 4, 4))
print("X:\n", X)

# Feed X into workspace
workspace.FeedBlob("X", X.astype(np.float32))

# Run op
workspace.RunOperatorOnce(op)

# Collect output
print("Y:\n", workspace.FetchBlob("Y"))

```

**Result**

```

X:
 [[[ 5 12 10  1]
  [ 4 16  2 15]
  [ 5 11 12 15]
  [15  4 17 19]]

 [[16  5  5 13]
  [17  2  1 17]
  [18  3 19  5]
  [14 16 10 16]]]
Y:
 [[12. 16. 15. 19.]
 [16. 17. 19. 16.]]

```

</details>

)DOC")
    .Input(
        0,
        "X",
        "A tensor of dimensions $B x M x N$ to compute rowwise-max. Here, $B$ is the batch size, and $M$ and $N$ are the number of rows and columns of each element of the batch, respectively.")
    .Output(
        0,
        "Y",
        "The output tensor of shape $B x M$, where each row represents the row-wise maximums for that element of the input batch.");

OPERATOR_SCHEMA(RowwiseMaxGradient).NumInputs(3).NumOutputs(1);
class GetRowwiseMaxGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    // Inputs: forward input X, forward output Y, and upstream gradient dY.
    return SingleGradientDef(
        "RowwiseMaxGradient",
        "",
        vector<string>{I(0), O(0), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(RowwiseMax, GetRowwiseMaxGradient);
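
// Editorial note on ties: MaxReductionGradientOp (defined later in this file)
// propagates dY to every position equal to the row maximum, not to a single
// argmax. E.g. for the row X = [3, 7, 7] with upstream gradient dY = 1, the
// resulting gradient is dX = [0, 1, 1].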

OPERATOR_SCHEMA(ColwiseMax)
    .NumInputs(1)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Compute column-wise max reduction of the input tensor. This op takes one input, $X$, of shape $BxMxN$, where $B$ is the batch size, $M$ is the number of rows, and $N$ is the number of columns. The output of this op, $Y$, is a matrix of shape $BxN$: entry $(b, n)$ holds the maximum of column $n$ of batch element $b$.

Github Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/reduction_ops.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/reduction_ops.cc

<details>

<summary> <b>Example</b> </summary>

**Code**

```

workspace.ResetWorkspace()

op = core.CreateOperator(
    "ColwiseMax",
    ["X"],
    ["Y"]
)

# Create X, simulating a batch of two 4x4 matrices
X = np.random.randint(0, high=20, size=(2, 4, 4))
print("X:\n", X)

# Feed X into workspace
workspace.FeedBlob("X", X.astype(np.float32))

# Run op
workspace.RunOperatorOnce(op)

# Collect output
print("Y:\n", workspace.FetchBlob("Y"))

```

**Result**

```

X:
 [[[17 15  2  6]
  [ 8 12  6  0]
  [ 6  9  7  3]
  [ 4 13 16 13]]

 [[ 0  3  4 12]
  [18  1 17 12]
  [ 7 17 13 14]
  [12 17  2  1]]]
Y:
 [[17. 15. 16. 13.]
 [18. 17. 17. 14.]]

```

</details>

)DOC")
    .Input(
        0,
        "X",
        "A tensor of dimensions $B x M x N$ to compute columnwise-max. Here, $B$ is the batch size, and $M$ and $N$ are the number of rows and columns of each element of the batch, respectively.")
    .Output(
        0,
        "Y",
        "The output tensor of shape $B x N$, where each row represents the column-wise maximums for that element of the input batch.");

OPERATOR_SCHEMA(ColwiseMaxGradient).NumInputs(3).NumOutputs(1);
class GetColwiseMaxGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    return SingleGradientDef(
        "ColwiseMaxGradient",
        "",
        vector<string>{I(0), O(0), GO(0)},
        vector<string>{GI(0)});
  }
};
REGISTER_GRADIENT(ColwiseMax, GetColwiseMaxGradient);
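
// Editorial end-to-end sketch (hypothetical, assuming the usual
// caffe2.python imports and that net.AddGradientOperators seeds the listed
// outputs with all-one gradients):
//
//   net = core.Net("colwise_max_demo")
//   net.ColwiseMax(["X"], ["Y"])
//   grad_map = net.AddGradientOperators(["Y"])
//   workspace.FeedBlob("X", np.array([[[3., 7.], [7., 5.]]], dtype=np.float32))
//   workspace.RunNetOnce(net)
//   print(workspace.FetchBlob(grad_map["X"]))  # [[[0., 1.], [1., 0.]]]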

template <typename T, class Context>
bool SumElementsGradientOp<T, Context>::RunOnDevice()
// TODO: T21635077 fix float-divide-by-zero undefined behavior
#if defined(__has_feature)
#if __has_feature(__address_sanitizer__)
    __attribute__((__no_sanitize__("float-divide-by-zero")))
#endif
#endif
{
  auto& X = Input(0);
  // Copy the scalar upstream gradient to the CPU so it can be read below.
  Tensor sum_grad(Input(1), CPU);

  auto* dX = Output(0, X.sizes(), at::dtype<T>());
  DCHECK_EQ(sum_grad.numel(), 1);
  // Broadcast dY (scaled by 1/numel when averaging) to every element of dX.
  math::Set<T, Context>(
      dX->numel(),
      static_cast<T>(
          sum_grad.template data<T>()[0] * (average_ ? 1.0 / X.numel() : 1)),
      dX->template mutable_data<T>(),
      &context_);
  return true;
}
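
// Editorial check (hypothetical direct invocation of the gradient op, with
// the same assumed imports): with average=True on a 3x3 input and dY = 9,
// every entry of dX is 9 / 9 = 1.
//
//   op = core.CreateOperator(
//       "SumElementsGradient", ["X", "dY"], ["dX"], average=True)
//   workspace.FeedBlob("X", np.ones((3, 3), dtype=np.float32))
//   workspace.FeedBlob("dY", np.array([9.0], dtype=np.float32))
//   workspace.RunOperatorOnce(op)
//   print(workspace.FetchBlob("dX"))  # 3x3 array of ones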

template <typename T, class Context, bool ROWWISE>
bool MaxReductionGradientOp<T, Context, ROWWISE>::RunOnDevice() {
  auto& X = Input(0);
  auto& Y = Input(1);
  auto& dY = Input(2);

  auto* dX = Output(0, X.sizes(), at::dtype<T>());

  CAFFE_ENFORCE_EQ(X.dim(), 3);

  const int batch_size = X.dim32(0);
  const int M = X.dim32(1);
  const int N = X.dim32(2);

  const T* Xdata = X.template data<T>();
  const T* Ydata = Y.template data<T>();
  const T* dYdata = dY.template data<T>();
  T* dXdata = dX->template mutable_data<T>();

  const int input_size = M * N;
  for (int i = 0; i < batch_size; ++i) {
    const T* Xdata_i = Xdata + i * input_size;
    T* dXdata_i = dXdata + i * input_size;
    if (ROWWISE) {
      // Y and dY hold one value per row; pass dY through wherever X equals
      // the row maximum (every tied maximum receives the gradient).
      const T* Ydata_i = Ydata + i * M;
      const T* dYdata_i = dYdata + i * M;
      for (int m = 0; m < M; ++m) {
        const T* Xdata_m = Xdata_i + m * N;
        T* dXdata_m = dXdata_i + m * N;
        for (int n = 0; n < N; ++n) {
          if (Xdata_m[n] == Ydata_i[m]) {
            dXdata_m[n] = dYdata_i[m];
          } else {
            dXdata_m[n] = static_cast<T>(0);
          }
        }
      }
    } else {
      // Colwise case: Y and dY hold one value per column.
      const T* Ydata_i = Ydata + i * N;
      const T* dYdata_i = dYdata + i * N;
      for (int n = 0; n < N; ++n) {
        for (int m = 0; m < M; ++m) {
          const T* Xdata_m = Xdata_i + m * N;
          T* dXdata_m = dXdata_i + m * N;
          if (Xdata_m[n] == Ydata_i[n]) {
            dXdata_m[n] = dYdata_i[n];
          } else {
            dXdata_m[n] = static_cast<T>(0);
          }
        }
      }
    }
  }

  return true;
}

} // namespace caffe2