Separate reduce functions from math (#16929)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16929

Separate the CPU reduce functions from the monolithic math library: ReduceMin, ReduceMax, ReduceSum, ReduceMean, ReduceL1, and ReduceL2 move into caffe2/utils/math/reduce.h and its implementation, and their signatures now take the full output shape (Y_dims, with reduced dimensions set to 1) instead of a dims array plus a list of reduce axes. Set and Scale move to the new elementwise math header, and call sites (elementwise gradients, ExpandGradientOp, pooling, ReduceOp, the CUDA reductions) are updated to pass the output dims directly.

i-am-not-moving-c2-to-c10

Reviewed By: houseroad

Differential Revision: D13999469

fbshipit-source-id: bd628b15a6e3c1f04cc62aefffb0110690e1c0d1
Xiaomeng Yang 2019-02-13 17:47:49 -08:00 committed by Facebook Github Bot
parent 9b7f3da74b
commit 3a34f443c5
18 changed files with 1002 additions and 1460 deletions
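At the API level, the change is a signature swap across all six reduce entry points: the (num_axes, axes) pair is replaced by the output shape Y_dims, whose entries of 1 mark the reduced dimensions. A condensed before/after for ReduceSum (CAFFE2_API and argument line breaks trimmed for brevity):

// Removed from the monolithic math header:
template <typename T, class Context>
void ReduceSum(
    const int num_dims, const int* dims,
    const int num_axes, const int* axes,
    const T alpha, const T* X, T* Y, Context* context);

// Added to caffe2/utils/math/reduce.h (per its include guard):
template <typename T, class Context>
void ReduceSum(
    const int ndim, const int* X_dims, const int* Y_dims,
    const T alpha, const T* X, T* Y, Context* context);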

View File

@ -14,7 +14,7 @@ OPERATOR_SCHEMA(ATen);
namespace math {
template <>
void Set<at::Half, CPUContext>(
const size_t /*N*/,
const int /*N*/,
const at::Half h,
at::Half* v,
CPUContext* c) {

View File

@ -47,15 +47,14 @@ struct AddFunctor {
const std::vector<int> C_dims =
elementwise_ops_utils::ComputeBinaryBroadcastForwardDims(
A_dims, B_dims);
std::vector<int> A_axes;
std::vector<int> B_axes;
elementwise_ops_utils::ComputeBinaryBroadcastBackwardAxes(
A_dims, B_dims, &A_axes, &B_axes);
std::vector<int> A_back_dims;
std::vector<int> B_back_dims;
elementwise_ops_utils::ComputeBinaryBroadcastBackwardDims(
A_dims, B_dims, &A_back_dims, &B_back_dims);
math::ReduceSum(
C_dims.size(),
C_dims.data(),
A_axes.size(),
A_axes.data(),
A_back_dims.data(),
TGrad(1),
dC,
dA,
@ -63,8 +62,7 @@ struct AddFunctor {
math::ReduceSum(
C_dims.size(),
C_dims.data(),
B_axes.size(),
B_axes.data(),
B_back_dims.data(),
TGrad(1),
dC,
dB,

View File

@ -108,5 +108,17 @@ void ComputeBinaryBroadcastBackwardAxes(
std::reverse(B_axes->begin(), B_axes->end());
}
void ComputeBinaryBroadcastBackwardDims(
const std::vector<int>& A_dims,
const std::vector<int>& B_dims,
std::vector<int>* A_back_dims,
std::vector<int>* B_back_dims) {
const int ndim = std::max(A_dims.size(), B_dims.size());
A_back_dims->assign(ndim, 1);
B_back_dims->assign(ndim, 1);
std::copy(A_dims.crbegin(), A_dims.crend(), A_back_dims->rbegin());
std::copy(B_dims.crbegin(), B_dims.crend(), B_back_dims->rbegin());
}
} // namespace elementwise_ops_utils
} // namespace caffe2
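For reference, a quick hypothetical use of the new helper (the include path is assumed): it right-aligns each operand's shape against the broadcast rank and pads with 1s, producing exactly the Y_dims form that the new reduce functions consume.

#include <vector>
#include "caffe2/operators/elementwise_ops_utils.h"  // path assumed

void Example() {
  const std::vector<int> A_dims = {2, 3, 4};
  const std::vector<int> B_dims = {3, 1};
  std::vector<int> A_back_dims;
  std::vector<int> B_back_dims;
  caffe2::elementwise_ops_utils::ComputeBinaryBroadcastBackwardDims(
      A_dims, B_dims, &A_back_dims, &B_back_dims);
  // A_back_dims == {2, 3, 4}; B_back_dims == {1, 3, 1}.
  // Reducing dC (shape {2, 3, 4}) with Y_dims == B_back_dims yields dB.
}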

View File

@ -23,6 +23,12 @@ CAFFE2_API void ComputeBinaryBroadcastBackwardAxes(
std::vector<int>* A_axes,
std::vector<int>* B_axes);
CAFFE2_API void ComputeBinaryBroadcastBackwardDims(
const std::vector<int>& A_dims,
const std::vector<int>& B_dims,
std::vector<int>* A_back_dims,
std::vector<int>* B_back_dims);
} // namespace elementwise_ops_utils
} // namespace caffe2

View File

@ -47,15 +47,14 @@ struct SubFunctor {
const std::vector<int> C_dims =
elementwise_ops_utils::ComputeBinaryBroadcastForwardDims(
A_dims, B_dims);
std::vector<int> A_axes;
std::vector<int> B_axes;
elementwise_ops_utils::ComputeBinaryBroadcastBackwardAxes(
A_dims, B_dims, &A_axes, &B_axes);
std::vector<int> A_back_dims;
std::vector<int> B_back_dims;
elementwise_ops_utils::ComputeBinaryBroadcastBackwardDims(
A_dims, B_dims, &A_back_dims, &B_back_dims);
math::ReduceSum(
C_dims.size(),
C_dims.data(),
A_axes.size(),
A_axes.data(),
A_back_dims.data(),
TGrad(1),
dC,
dA,
@ -63,8 +62,7 @@ struct SubFunctor {
math::ReduceSum(
C_dims.size(),
C_dims.data(),
B_axes.size(),
B_axes.data(),
B_back_dims.data(),
TGrad(-1),
dC,
dB,

View File

@ -94,11 +94,14 @@ class ExpandGradientOp final : public Operator<Context> {
axes.push_back(i);
}
}
std::vector<int> X_dims = dY_dims;
for (const int axis : axes) {
X_dims[axis] = 1;
}
math::ReduceSum<T, Context>(
dY_dims.size(),
dY_dims.data(),
axes.size(),
axes.data(),
X_dims.data(),
T(1),
dY.template data<T>(),
dX->template mutable_data<T>(),

View File

@ -572,10 +572,10 @@ bool AveragePoolFunctor<CPUContext>::
const float* X,
float* Y,
CPUContext* context) const {
const std::array<int, 2> dims = {N * C, HxW};
const int axis = 1;
const std::array<int, 2> X_dims = {N * C, HxW};
const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMean<float, CPUContext>(
2, dims.data(), 1, &axis, 1.0f, X, Y, context);
2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
@ -720,10 +720,10 @@ bool MaxPoolFunctor<CPUContext>::
const float* X,
float* Y,
CPUContext* context) const {
const std::array<int, 2> dims = {N * C, HxW};
const int axis = 1;
const std::array<int, 2> X_dims = {N * C, HxW};
const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMax<float, CPUContext>(
2, dims.data(), 1, &axis, 1.0f, X, Y, context);
2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}

View File

@ -698,10 +698,10 @@ bool AveragePoolFunctor<CUDAContext>::
const float* X,
float* Y,
CUDAContext* context) const {
const std::array<int, 2> dims = {N * C, HxW};
const int axis = 1;
const std::array<int, 2> X_dims = {N * C, HxW};
const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMean<float, CUDAContext>(
2, dims.data(), 1, &axis, 1.0f, X, Y, context);
2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
@ -1756,10 +1756,10 @@ bool MaxPoolFunctor<CUDAContext>::
const float* X,
float* Y,
CUDAContext* context) const {
const std::array<int, 2> dims = {N * C, HxW};
const int axis = 1;
const std::array<int, 2> X_dims = {N * C, HxW};
const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMax<float, CUDAContext>(
2, dims.data(), 1, &axis, 1.0f, X, Y, context);
2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
@ -1773,10 +1773,10 @@ bool MaxPoolFunctor<CUDAContext>::
const float* X,
float* Y,
CUDAContext* context) const {
const std::array<int, 3> dims = {N, HxW, C};
const int axis = 1;
const std::array<int, 3> X_dims = {N, HxW, C};
const std::array<int, 3> Y_dims = {N, 1, C};
math::ReduceMax<float, CUDAContext>(
3, dims.data(), 1, &axis, 1.0f, X, Y, context);
3, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}

View File

@ -29,13 +29,13 @@ class ReduceOp final : public Operator<Context> {
template <typename T>
bool DoRunWithType() {
const auto& X = Input(0);
const int ndim = X.dim();
const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
if (axes_.empty()) {
axes_.resize(ndim);
std::iota(axes_.begin(), axes_.end(), 0);
} else {
for (auto& axis: axes_) {
for (auto& axis : axes_) {
axis = X.canonical_axis_index(axis);
}
std::sort(axes_.begin(), axes_.end());
@ -45,24 +45,29 @@ class ReduceOp final : public Operator<Context> {
ndim,
"Axes ids must be smaller than the dimensions of input.");
}
const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
std::vector<int64_t> Y_dims;
Y_dims.reserve(ndim);
std::vector<int64_t> output_dims;
output_dims.reserve(ndim);
std::size_t cur_axis = 0;
for (int i = 0; i < ndim; ++i) {
if (cur_axis < axes_.size() && i == axes_[cur_axis]) {
if (keep_dims_) {
Y_dims.push_back(1);
output_dims.push_back(1);
}
++cur_axis;
} else {
Y_dims.push_back(X_dims[i]);
output_dims.push_back(X_dims[i]);
}
}
auto* Y = Output(0, Y_dims, at::dtype<T>());
auto* Y = Output(0, output_dims, at::dtype<T>());
std::vector<int> Y_dims = X_dims;
for (const int axis : axes_) {
Y_dims[axis] = 1;
}
return reducer_.template Forward<T>(
X_dims,
axes_,
Y_dims,
X.template data<T>(),
Y->template mutable_data<T>(),
&context_);
@ -71,7 +76,7 @@ class ReduceOp final : public Operator<Context> {
private:
std::vector<int> axes_;
const int keep_dims_;
Reducer reducer_{};
const Reducer reducer_{};
};
template <typename InputTypes, class Context, class Reducer>
@ -98,7 +103,7 @@ class ReduceGradientOp final : public Operator<Context> {
axes_.resize(ndim);
std::iota(axes_.begin(), axes_.end(), 0);
} else {
for (auto& axis: axes_) {
for (auto& axis : axes_) {
axis = X.canonical_axis_index(axis);
}
std::sort(axes_.begin(), axes_.end());
@ -126,23 +131,22 @@ class ReduceGradientOp final : public Operator<Context> {
private:
std::vector<int> axes_;
Reducer reducer_{};
const Reducer reducer_{};
};
template <class Context>
struct MinReducer {
template <typename T>
bool Forward(
const std::vector<int>& dims,
const std::vector<int>& axes,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceMin<T, Context>(
dims.size(),
dims.data(),
axes.size(),
axes.data(),
X_dims.size(),
X_dims.data(),
Y_dims.data(),
T(1),
X_data,
Y_data,
@ -165,16 +169,15 @@ template <class Context>
struct MaxReducer {
template <typename T>
bool Forward(
const std::vector<int>& dims,
const std::vector<int>& axes,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceMax<T, Context>(
dims.size(),
dims.data(),
axes.size(),
axes.data(),
X_dims.size(),
X_dims.data(),
Y_dims.data(),
T(1),
X_data,
Y_data,
@ -197,16 +200,15 @@ template <class Context>
struct SumReducer {
template <typename T>
bool Forward(
const std::vector<int>& dims,
const std::vector<int>& axes,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceSum<T, Context>(
dims.size(),
dims.data(),
axes.size(),
axes.data(),
X_dims.size(),
X_dims.data(),
Y_dims.data(),
T(1),
X_data,
Y_data,
@ -240,16 +242,15 @@ template <class Context>
struct MeanReducer {
template <typename T>
bool Forward(
const std::vector<int>& dims,
const std::vector<int>& axes,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceMean<T, Context>(
dims.size(),
dims.data(),
axes.size(),
axes.data(),
X_dims.size(),
X_dims.data(),
Y_dims.data(),
T(1),
X_data,
Y_data,
@ -287,16 +288,15 @@ template <class Context>
struct L1Reducer {
template <typename T>
bool Forward(
const std::vector<int>& dims,
const std::vector<int>& axes,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceL1<T, Context>(
dims.size(),
dims.data(),
axes.size(),
axes.data(),
X_dims.size(),
X_dims.data(),
Y_dims.data(),
T(1),
X_data,
Y_data,
@ -319,16 +319,15 @@ template <class Context>
struct L2Reducer {
template <typename T>
bool Forward(
const std::vector<int>& dims,
const std::vector<int>& axes,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceL2<T, Context>(
dims.size(),
dims.data(),
axes.size(),
axes.data(),
X_dims.size(),
X_dims.data(),
Y_dims.data(),
T(1),
X_data,
Y_data,
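To make the two shape vectors in DoRunWithType concrete, a small standalone sketch of the bookkeeping (hypothetical dims, not from this diff): the operator output is allocated with output_dims, while the math reducer always sees a Y_dims of the same rank as X_dims.

#include <cstddef>
#include <cstdint>
#include <vector>

void Example() {
  const std::vector<int> X_dims = {2, 3, 4};
  const std::vector<int> axes = {1};       // sorted, canonicalized
  const bool keep_dims = false;

  std::vector<std::int64_t> output_dims;   // shape used to allocate the op output
  std::vector<int> Y_dims = X_dims;        // shape handed to the math reducer
  std::size_t cur_axis = 0;
  for (int i = 0; i < static_cast<int>(X_dims.size()); ++i) {
    if (cur_axis < axes.size() && i == axes[cur_axis]) {
      if (keep_dims) {
        output_dims.push_back(1);
      }
      Y_dims[i] = 1;
      ++cur_axis;
    } else {
      output_dims.push_back(X_dims[i]);
    }
  }
  // output_dims == {2, 4} here ({2, 1, 4} with keep_dims == true);
  // Y_dims == {2, 1, 4} either way, same rank as X_dims.
}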

View File

@ -31,34 +31,34 @@ class CAFFE2_API DefaultEngine {};
namespace math {
#define C10_DECLARE_COMPARE_OP(Comp) \
template <typename T, class Context, bool kBroadcast1st = false> \
void Rowwise##Comp( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
bool* C, \
Context* context); \
\
template <typename T, class Context, bool kBroadcast1st = false> \
void Colwise##Comp( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
bool* C, \
Context* context); \
\
template <typename T, class Context> \
void Comp( \
const int A_ndim, \
const int* A_dims, \
const int B_ndim, \
const int* B_dims, \
const T* A, \
const T* B, \
bool* C, \
#define C10_DECLARE_COMPARE_OP(Comp) \
template <typename T, class Context, bool kBroadcast1st = false> \
void Rowwise##Comp( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
bool* C, \
Context* context); \
\
template <typename T, class Context, bool kBroadcast1st = false> \
void Colwise##Comp( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
bool* C, \
Context* context); \
\
template <typename T, class Context> \
void Comp( \
const int A_ndim, \
const int* A_dims, \
const int B_ndim, \
const int* B_dims, \
const T* A, \
const T* B, \
bool* C, \
Context* context);
C10_DECLARE_COMPARE_OP(EQ)
@ -115,80 +115,6 @@ C10_DECLARE_BINARY_OP(BitwiseXor)
#undef C10_DECLARE_BINARY_OP
template <typename T, class Context>
CAFFE2_API void
ReduceMin(const int N, const T* x, T* y, Tensor* scratch_ptr, Context* context);
template <typename T, class Context>
CAFFE2_API void
ReduceMax(const int N, const T* x, T* y, Tensor* scratch_ptr, Context* context);
template <typename T, class Context>
CAFFE2_API void ReduceMin(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const T alpha,
const T* X,
T* Y,
Context* context);
template <typename T, class Context>
CAFFE2_API void ReduceMax(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const T alpha,
const T* X,
T* Y,
Context* context);
template <typename T, class Context>
CAFFE2_API void ReduceSum(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const T alpha,
const T* X,
T* Y,
Context* context);
template <typename T, class Context>
CAFFE2_API void ReduceMean(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const T alpha,
const T* X,
T* Y,
Context* context);
template <typename T, class Context>
CAFFE2_API void ReduceL1(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const T alpha,
const T* X,
T* Y,
Context* context);
template <typename T, class Context>
CAFFE2_API void ReduceL2(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const T alpha,
const T* X,
T* Y,
Context* context);
// Broadcasts X with X_dims to Y with Y_dims.
template <typename T, class Context>
CAFFE2_API void Broadcast(
@ -337,9 +263,6 @@ CAFFE2_API void Gemv(
Context* context,
TensorProto::DataType math_type = TensorProto_DataType_FLOAT);
template <typename T, class Context>
CAFFE2_API void Set(const size_t N, const T alpha, T* X, Context* context);
template <typename T, class Context>
CAFFE2_API void
RandUniform(const size_t n, const T a, const T b, T* r, Context* context);
@ -409,25 +332,6 @@ CAFFE2_API void Select(
T* y,
Context* context);
template <typename TAlpha, typename TData, class Context>
CAFFE2_API void Scale(
const int N,
const TAlpha alpha,
const TData* x,
TData* y,
Context* context);
// Different from the Scale function above, if alpha is passed in
// as a pointer, we will assume that it lives on the Context device,
// for example on GPU.
template <typename TAlpha, typename TData, class Context>
CAFFE2_API void Scale(
const int N,
const TAlpha* alpha,
const TData* x,
TData* y,
Context* context);
template <typename T, class Context>
CAFFE2_API void
Axpy(const int N, const float alpha, const T* x, T* y, Context* context);

View File

@ -3,6 +3,10 @@
#include <algorithm>
#include <functional>
#ifdef CAFFE2_USE_ACCELERATE
#include <Accelerate/Accelerate.h>
#endif // CAFFE2_USE_ACCELERATE
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL
@ -73,25 +77,25 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(float, Erf, vsErf)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Erf, vdErf)
#undef DELEGATE_SIMPLE_UNARY_FUNCTION
#define DELEGATE_SINCOS_FUNCTION(T, MKLFunc) \
#define DELEGATE_SINCOS(T, MKLFunc) \
template <> \
C10_EXPORT void SinCos<T, CPUContext>( \
const int N, const T* X, T* S, T* C, CPUContext* /* context */) { \
MKLFunc(N, X, S, C); \
}
DELEGATE_SINCOS_FUNCTION(float, vsSinCos)
DELEGATE_SINCOS_FUNCTION(double, vdSinCos)
#undef DELEGATE_SINCOS_FUNCTION
DELEGATE_SINCOS(float, vsSinCos)
DELEGATE_SINCOS(double, vdSinCos)
#undef DELEGATE_SINCOS
#define DELEGATE_POWX_FUNCTION(T, MKLFunc) \
#define DELEGATE_POWX(T, MKLFunc) \
template <> \
C10_EXPORT void Powx<T, CPUContext>( \
const int N, const T* A, const T b, T* Y, CPUContext* /* context */) { \
MKLFunc(N, A, b, Y); \
}
DELEGATE_POWX_FUNCTION(float, vsPowx)
DELEGATE_POWX_FUNCTION(double, vdPowx)
#undef DELEGATE_POWX_FUNCTION
DELEGATE_POWX(float, vsPowx)
DELEGATE_POWX(double, vdPowx)
#undef DELEGATE_POWX
#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Func, MKLFunc) \
template <> \
@ -228,6 +232,155 @@ DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(double, Div, /)
#endif // CAFFE2_USE_MKL
////////////////////////////////////////////////////////////////////////////////
// BLAS alternatives.
// Depending on whether we have specified an external BLAS library or not, we
// will delegate the Caffe math functions that are BLAS-related to either the
// CBLAS call or the Eigen implementation.
////////////////////////////////////////////////////////////////////////////////
#ifdef CAFFE2_USE_EIGEN_FOR_BLAS
#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (X == Y) { \
EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(alpha); \
} else { \
EigenVectorArrayMap<TData>(Y, N) = \
ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \
} \
} \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha* alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (X == Y) { \
EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(*alpha); \
} else { \
EigenVectorArrayMap<TData>(Y, N) = \
ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \
} \
}
CAFFE2_SPECIALIZED_SCALE(float, float)
CAFFE2_SPECIALIZED_SCALE(double, double)
CAFFE2_SPECIALIZED_SCALE(float, double)
#undef CAFFE2_SPECIALIZED_SCALE
#else // CAFFE2_USE_EIGEN_FOR_BLAS
#ifdef CAFFE2_USE_MKL
#define DELEGATE_SCALE(TAlpha, TData, MKLFunc1, MKLFunc2) \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (Y == X) { \
MKLFunc1(N, static_cast<TData>(alpha), Y, 1); \
} else { \
MKLFunc2(N, static_cast<TData>(alpha), X, 1, TData(0), Y, 1); \
} \
} \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha* alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (Y == X) { \
MKLFunc1(N, static_cast<TData>(*alpha), Y, 1); \
} else { \
MKLFunc2(N, static_cast<TData>(*alpha), X, 1, TData(0), Y, 1); \
} \
}
DELEGATE_SCALE(float, float, cblas_sscal, cblas_saxpby)
DELEGATE_SCALE(double, double, cblas_dscal, cblas_daxpby)
DELEGATE_SCALE(float, double, cblas_dscal, cblas_daxpby)
#undef DELEGATE_SCALE
#else // CAFFE2_USE_MKL
#define DELEGATE_SCALE(TAlpha, TData, BLASFunc) \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (Y == X) { \
BLASFunc(N, static_cast<TData>(alpha), Y, 1); \
} else { \
EigenVectorArrayMap<TData>(Y, N) = \
ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \
} \
} \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha* alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (Y == X) { \
BLASFunc(N, static_cast<TData>(*alpha), Y, 1); \
} else { \
EigenVectorArrayMap<TData>(Y, N) = \
ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \
} \
}
DELEGATE_SCALE(float, float, cblas_sscal)
DELEGATE_SCALE(double, double, cblas_dscal)
DELEGATE_SCALE(float, double, cblas_dscal)
#undef DELEGATE_SCALE
#endif // CAFFE2_USE_MKL
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
////////////////////////////////////////////////////////////////////////////////
// Common math functions being used in Caffe that do not have a BLAS or MKL
// equivalent. For all these functions, we will simply implement them either via
// Eigen or via custom code.
////////////////////////////////////////////////////////////////////////////////
#define CAFFE2_SPECIALIZED_SET(T) \
template <> \
C10_EXPORT void Set<T, CPUContext>( \
const int N, const T alpha, T* Y, CPUContext* /* context */) { \
if (N == 0) { \
return; \
} \
if (alpha == T(0)) { \
std::memset(Y, 0, N * sizeof(T)); \
} else { \
EigenVectorArrayMap<T>(Y, N).setConstant(alpha); \
} \
}
CAFFE2_SPECIALIZED_SET(float)
CAFFE2_SPECIALIZED_SET(double)
CAFFE2_SPECIALIZED_SET(int)
CAFFE2_SPECIALIZED_SET(std::int8_t)
CAFFE2_SPECIALIZED_SET(std::int16_t)
CAFFE2_SPECIALIZED_SET(std::int64_t)
CAFFE2_SPECIALIZED_SET(bool)
CAFFE2_SPECIALIZED_SET(char)
CAFFE2_SPECIALIZED_SET(std::uint8_t)
CAFFE2_SPECIALIZED_SET(std::uint16_t)
#undef CAFFE2_SPECIALIZED_SET
#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Func, EigenFunc) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
@ -262,6 +415,39 @@ CAFFE2_SPECIALIZED_NEG(float)
CAFFE2_SPECIALIZED_NEG(double)
#undef CAFFE2_SPECIALIZED_NEG
#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (X == Y) { \
EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(alpha); \
} else { \
EigenVectorArrayMap<TData>(Y, N) = \
ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \
} \
} \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha* alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (X == Y) { \
EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(*alpha); \
} else { \
EigenVectorArrayMap<TData>(Y, N) = \
ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \
} \
}
CAFFE2_SPECIALIZED_SCALE(std::int32_t, std::int32_t)
CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t)
#undef CAFFE2_SPECIALIZED_SCALE
#define DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(T, Func, EigenOp) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
@ -286,8 +472,12 @@ DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, Div, /)
EigenVectorMap<T>(C, N) = ConstEigenVectorArrayMap<T>(A, N).EigenFunc( \
ConstEigenVectorArrayMap<T>(B, N)); \
}
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int32_t, Min, min)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int64_t, Min, min)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(float, Min, min)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(double, Min, min)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int32_t, Max, max)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int64_t, Max, max)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(float, Max, max)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(double, Max, max)
#undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION

View File

@ -56,6 +56,19 @@ CAFFE2_API void Inv(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
CAFFE2_API void Erf(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
CAFFE2_API void Set(int N, T alpha, T* X, Context* context);
template <typename TAlpha, typename TData, class Context>
CAFFE2_API void
Scale(int N, TAlpha alpha, const TData* X, TData* Y, Context* context);
// Different from the Scale function above, if alpha is passed in as a pointer,
// we will assume that it lives on the Context device, for example on GPU.
template <typename TAlpha, typename TData, class Context>
CAFFE2_API void
Scale(int N, const TAlpha* alpha, const TData* X, TData* Y, Context* context);
template <typename T, class Context>
CAFFE2_API void Add(int N, const T* A, const T* B, T* C, Context* context);
template <typename T, class Context>
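The two Scale overloads declared above differ only in where alpha lives; a minimal CPU sketch (include paths assumed; on CUDA the pointer form expects alpha in device memory, per the comment above):

#include <array>
#include "caffe2/core/context.h"
#include "caffe2/utils/math.h"

void Example(caffe2::CPUContext* context) {
  std::array<float, 4> x = {1.0f, 2.0f, 3.0f, 4.0f};
  std::array<float, 4> y;

  // Host scalar alpha.
  caffe2::math::Scale<float, float, caffe2::CPUContext>(
      4, 2.0f, x.data(), y.data(), context);  // y == {2, 4, 6, 8}

  // Pointer alpha: assumed to live on the Context's device (host for CPU).
  const float alpha = 0.5f;
  caffe2::math::Scale<float, float, caffe2::CPUContext>(
      4, &alpha, x.data(), y.data(), context);  // y == {0.5, 1, 1.5, 2}
}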

View File

@ -6,8 +6,17 @@
#include <numeric>
#include <vector>
#ifdef CAFFE2_USE_ACCELERATE
#include <Accelerate/Accelerate.h>
#endif // CAFFE2_USE_ACCELERATE
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL
#include "caffe2/core/context.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math/elementwise.h"
#include "caffe2/utils/math/utils.h"
namespace caffe2 {
@ -15,9 +24,385 @@ namespace math {
namespace {
#define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenFunc) \
template <typename T> \
void Rowwise##Func( \
const int rows, \
const int cols, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* /* context */) { \
EigenVectorMap<T>(Y, rows) = \
ConstEigenMatrixMap<T>(X, cols, rows).colwise().EigenFunc() * alpha; \
}
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceSum, sum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMean, mean)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm)
#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
#ifndef CAFFE2_USE_EIGEN_FOR_BLAS
#define DELEGATE_ROWWISE_REDUCE_FUNCTION(T, Func, BLASFunc) \
template <> \
void Rowwise##Func( \
const int rows, \
const int cols, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* /* context */) { \
for (int i = 0; i < rows; ++i) { \
Y[i] = BLASFunc(cols, X + i * cols, 1) * alpha; \
} \
}
DELEGATE_ROWWISE_REDUCE_FUNCTION(float, ReduceL1, cblas_sasum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(double, ReduceL1, cblas_dasum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(float, ReduceL2, cblas_snrm2)
DELEGATE_ROWWISE_REDUCE_FUNCTION(double, ReduceL2, cblas_dnrm2)
#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
#define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, MathFunc) \
template <typename T> \
void Colwise##Func( \
const int rows, \
const int cols, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
std::memcpy(Y, X, sizeof(T) * cols); \
for (int i = 1; i < rows; ++i) { \
MathFunc<T, CPUContext>(cols, Y, X + i * cols, Y, context); \
} \
Scale<T, T, CPUContext>(cols, alpha, Y, Y, context); \
}
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMin, Min)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMax, Max)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceSum, Add)
#undef DELEGATE_COLWISE_REDUCE_FUNCTION
template <typename T>
C10_EXPORT void
RowwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
void ColwiseReduceMean(
const int rows,
const int cols,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ColwiseReduceSum<T>(rows, cols, alpha / static_cast<T>(rows), X, Y, context);
}
template <typename T>
void ColwiseReduceL1(
const int rows,
const int cols,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ConstEigenArrayMap<T> X_arr(X, cols, rows);
EigenVectorArrayMap<T> Y_arr(Y, cols);
Y_arr = X_arr.col(0).abs();
for (int i = 1; i < rows; ++i) {
Y_arr += X_arr.col(i).abs();
}
Scale<T, T, CPUContext>(cols, alpha, Y, Y, context);
}
template <typename T>
void ColwiseReduceL2(
const int rows,
const int cols,
const T alpha,
const T* X,
T* Y,
CPUContext* /* context */) {
ConstEigenArrayMap<T> X_arr(X, cols, rows);
EigenVectorArrayMap<T> Y_arr(Y, cols);
Y_arr = X_arr.col(0).square();
for (int i = 1; i < rows; ++i) {
Y_arr += X_arr.col(i).square();
}
Y_arr = Y_arr.sqrt() * alpha;
}
template <typename T>
void BothEndsReduceMin(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorArrayMap<T> Y_arr(Y, N);
Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().minCoeff();
for (int i = 1; i < M; ++i) {
ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
for (int j = 0; j < N; ++j) {
Y[j] = std::min(Y[j], X_arr.col(j).minCoeff());
}
}
Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceMax(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorArrayMap<T> Y_arr(Y, N);
Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().maxCoeff();
for (int i = 1; i < M; ++i) {
ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
for (int j = 0; j < N; ++j) {
Y[j] = std::max(Y[j], X_arr.col(j).maxCoeff());
}
}
Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceSum(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorArrayMap<T> Y_arr(Y, N);
Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().sum();
for (int i = 1; i < M; ++i) {
Y_arr += ConstEigenArrayMap<T>(X + i * N * K, K, N).colwise().sum();
}
Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceMean(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorArrayMap<T> Y_arr(Y, N);
Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().mean();
for (int i = 1; i < M; ++i) {
Y_arr += ConstEigenArrayMap<T>(X + i * N * K, K, N).colwise().mean();
}
Scale<T, T, CPUContext>(N, alpha / static_cast<T>(M), Y, Y, context);
}
template <typename T>
void BothEndsReduceL1(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorMap<T> Y_vec(Y, N);
Y_vec = ConstEigenMatrixMap<T>(X, K, N).colwise().template lpNorm<1>();
for (int i = 1; i < M; ++i) {
Y_vec += ConstEigenMatrixMap<T>(X + i * N * K, K, N)
.colwise()
.template lpNorm<1>();
}
Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceL2(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* /* context */) {
EigenVectorMap<T> Y_vec(Y, N);
Y_vec = ConstEigenMatrixMap<T>(X, K, N).colwise().squaredNorm();
for (int i = 1; i < M; ++i) {
Y_vec +=
ConstEigenMatrixMap<T>(X + i * N * K, K, N).colwise().squaredNorm();
}
Y_vec = Y_vec.cwiseSqrt() * alpha;
}
template <typename T, class Reducer>
void ReduceTensorImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const Reducer& reducer,
const T init,
const T* X,
T* Y,
CPUContext* context) {
const int X_size =
std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Set<T, CPUContext>(Y_size, init, Y, context);
std::vector<int> index(ndim, 0);
for (int X_index = 0; X_index < X_size; ++X_index) {
const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data());
Y[Y_index] = reducer(Y[Y_index], X[X_index]);
utils::IncreaseIndexInDims(ndim, X_dims, index.data());
}
}
template <typename T>
void ReduceMinImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ReduceTensorImpl(
ndim,
X_dims,
Y_dims,
[](const T a, const T b) { return std::min(a, b); },
std::numeric_limits<T>::max(),
X,
Y,
context);
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceMaxImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ReduceTensorImpl(
ndim,
X_dims,
Y_dims,
[](const T a, const T b) { return std::max(a, b); },
std::numeric_limits<T>::lowest(),
X,
Y,
context);
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceSumImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ReduceTensorImpl(ndim, X_dims, Y_dims, std::plus<T>(), T(0), X, Y, context);
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceMeanImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ReduceTensorImpl(ndim, X_dims, Y_dims, std::plus<T>(), T(0), X, Y, context);
const int X_size =
std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Scale<T, T, CPUContext>(
Y_size,
alpha * static_cast<T>(Y_size) / static_cast<T>(X_size),
Y,
Y,
context);
}
template <typename T>
void ReduceL1Impl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ReduceTensorImpl(
ndim,
X_dims,
Y_dims,
[](const T a, const T b) { return a + std::abs(b); },
T(0),
X,
Y,
context);
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceL2Impl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ReduceTensorImpl(
ndim,
X_dims,
Y_dims,
[](const T a, const T b) { return a + b * b; },
T(0),
X,
Y,
context);
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
EigenVectorArrayMap<T> Y_arr(Y, Y_size);
Y_arr = Y_arr.sqrt() * alpha;
}
template <typename T>
void RowwiseMoments(
const int rows,
const int cols,
const T* X,
T* mean,
T* var) {
ConstEigenArrayMap<T> X_arr(X, cols, rows);
EigenVectorArrayMap<T> mean_arr(mean, rows);
EigenVectorArrayMap<T> var_arr(var, rows);
@ -26,15 +411,19 @@ RowwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
}
template <typename T>
C10_EXPORT void
ColwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
std::memset(mean, 0, sizeof(T) * cols);
std::memset(var, 0, sizeof(T) * cols);
void ColwiseMoments(
const int rows,
const int cols,
const T* X,
T* mean,
T* var) {
ConstEigenArrayMap<T> X_arr(X, cols, rows);
EigenVectorArrayMap<T> mean_arr(mean, cols);
EigenVectorArrayMap<T> var_arr(var, cols);
// Eigen rowwise reduction is about 10 times slower than this for-loop.
for (int i = 0; i < rows; ++i) {
mean_arr = X_arr.col(0);
var_arr = X_arr.col(0).square();
for (int i = 1; i < rows; ++i) {
mean_arr += X_arr.col(i);
var_arr += X_arr.col(i).square();
}
@ -44,32 +433,30 @@ ColwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
}
template <typename T>
C10_EXPORT void BothEndsMoments(
const int pre,
const int mid,
const int nxt,
void BothEndsMoments(
const int M,
const int N,
const int K,
const T* X,
T* mean,
T* var) {
std::memset(mean, 0, sizeof(T) * mid);
std::memset(var, 0, sizeof(T) * mid);
EigenVectorArrayMap<T> mean_arr(mean, mid);
EigenVectorArrayMap<T> var_arr(var, mid);
ConstEigenArrayMap<T> X_arr(X, nxt, pre * mid);
for (int i = 0; i < pre; ++i) {
for (int j = 0; j < mid; ++j) {
const int c = i * mid + j;
mean_arr(j) += X_arr.col(c).sum();
var_arr(j) += X_arr.col(c).square().sum();
}
EigenVectorArrayMap<T> mean_arr(mean, N);
EigenVectorArrayMap<T> var_arr(var, N);
ConstEigenArrayMap<T> X0_arr(X, K, N);
mean_arr = X0_arr.colwise().sum();
var_arr = X0_arr.square().colwise().sum();
for (int i = 1; i < M; ++i) {
ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
mean_arr += X_arr.colwise().sum();
var_arr += X_arr.square().colwise().sum();
}
const T scale = T(1) / static_cast<T>(pre * nxt);
const T scale = T(1) / static_cast<T>(M * K);
mean_arr *= scale;
var_arr = var_arr * scale - mean_arr.square();
}
template <typename T>
C10_EXPORT void MomentsImpl(
void MomentsImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
@ -126,6 +513,128 @@ C10_EXPORT void MomentsImpl(
} // namespace
#define DELEGATE_GLOBAL_REDUCE_FUNCTION(T, Func, EigenFunc) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
const int N, \
const T* X, \
T* Y, \
Tensor* /* scratch_ptr */, \
CPUContext* /* context */) { \
*Y = ConstEigenVectorArrayMap<T>(X, N).EigenFunc(); \
}
DELEGATE_GLOBAL_REDUCE_FUNCTION(float, ReduceMin, minCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int32_t, ReduceMin, minCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int64_t, ReduceMin, minCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(float, ReduceMax, maxCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int32_t, ReduceMax, maxCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int64_t, ReduceMax, maxCoeff)
#undef DELEGATE_GLOBAL_REDUCE_FUNCTION
#define DELEGATE_REDUCE_FUNCTION(T, Func, kInit, kIsNorm) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
const int ndim, \
const int* X_dims, \
const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
const int X_size = \
std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>()); \
const int Y_size = \
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>()); \
if (X_size == 0) { \
Set<T, CPUContext>(Y_size, alpha * kInit, Y, context); \
return; \
} \
if (alpha == T(0)) { \
std::memset(Y, 0, sizeof(T) * Y_size); \
return; \
} \
if (std::equal(X_dims, X_dims + ndim, Y_dims)) { \
if (kIsNorm) { \
EigenVectorArrayMap<T>(Y, Y_size) = \
ConstEigenVectorArrayMap<T>(X, X_size).abs() * alpha; \
} else { \
Scale<T, T, CPUContext>(Y_size, alpha, X, Y, context); \
} \
return; \
} \
int rows; \
int cols; \
if (utils::IsRowwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) { \
Rowwise##Func<T>(rows, cols, alpha, X, Y, context); \
return; \
} \
if (utils::IsColwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) { \
Colwise##Func<T>(rows, cols, alpha, X, Y, context); \
return; \
} \
int M; \
int N; \
int K; \
if (utils::IsBothEndsReduce(ndim, X_dims, Y_dims, &M, &N, &K)) { \
BothEnds##Func<T>(M, N, K, alpha, X, Y, context); \
return; \
} \
Func##Impl<T>(ndim, X_dims, Y_dims, alpha, X, Y, context); \
}
DELEGATE_REDUCE_FUNCTION(
float,
ReduceMin,
std::numeric_limits<float>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
double,
ReduceMin,
std::numeric_limits<double>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceMin,
std::numeric_limits<std::int32_t>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceMin,
std::numeric_limits<std::int64_t>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
float,
ReduceMax,
std::numeric_limits<float>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
double,
ReduceMax,
std::numeric_limits<double>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceMax,
std::numeric_limits<std::int32_t>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceMax,
std::numeric_limits<std::int64_t>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(float, ReduceSum, 0.0f, false)
DELEGATE_REDUCE_FUNCTION(double, ReduceSum, 0.0, false)
DELEGATE_REDUCE_FUNCTION(std::int32_t, ReduceSum, 0, false)
DELEGATE_REDUCE_FUNCTION(std::int64_t, ReduceSum, 0LL, false)
DELEGATE_REDUCE_FUNCTION(float, ReduceMean, 0.0f, false)
DELEGATE_REDUCE_FUNCTION(double, ReduceMean, 0.0, false)
DELEGATE_REDUCE_FUNCTION(float, ReduceL1, 0.0f, true)
DELEGATE_REDUCE_FUNCTION(double, ReduceL1, 0.0, true)
DELEGATE_REDUCE_FUNCTION(std::int32_t, ReduceL1, 0, true)
DELEGATE_REDUCE_FUNCTION(std::int64_t, ReduceL1, 0LL, true)
DELEGATE_REDUCE_FUNCTION(float, ReduceL2, 0.0f, true)
DELEGATE_REDUCE_FUNCTION(double, ReduceL2, 0.0, true)
#undef DELEGATE_REDUCE_FUNCTION
#define CAFFE2_SPECIALIZED_MOMENTS(T) \
template <> \
C10_EXPORT void Moments<T, CPUContext>( \
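The generic fallback at the bottom of the dispatch (ReduceTensorImpl) walks X linearly and accumulates into Y by re-deriving the output index from a running multi-index; dims of 1 in Y_dims are what collapse the reduced axes. A standalone sketch of that idea, assuming row-major layout; the helpers below are simplified stand-ins for utils::GetIndexFromDims / IncreaseIndexInDims, not the library's code.

#include <algorithm>
#include <functional>
#include <numeric>
#include <vector>

// Row-major linear index of a multi-index under dims; dims of 1 contribute 0.
static int IndexFromDims(const int ndim, const int* dims, const int* index) {
  int linear = 0;
  for (int i = 0; i < ndim; ++i) {
    linear = linear * dims[i] + (dims[i] == 1 ? 0 : index[i]);
  }
  return linear;
}

// Advance a multi-index to the next position in row-major order over dims.
static void NextIndex(const int ndim, const int* dims, int* index) {
  for (int i = ndim - 1; i >= 0; --i) {
    if (++index[i] < dims[i]) {
      return;
    }
    index[i] = 0;
  }
}

// Generic sum-reduction of X (shape X_dims) into Y (shape Y_dims).
void ReduceSumSketch(
    const int ndim, const int* X_dims, const int* Y_dims,
    const float* X, float* Y) {
  const int X_size =
      std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
  const int Y_size =
      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
  std::fill(Y, Y + Y_size, 0.0f);
  std::vector<int> index(ndim, 0);
  for (int X_index = 0; X_index < X_size; ++X_index) {
    Y[IndexFromDims(ndim, Y_dims, index.data())] += X[X_index];
    NextIndex(ndim, X_dims, index.data());
  }
}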

View File

@ -5,8 +5,90 @@
#include "caffe2/core/types.h"
namespace caffe2 {
class Tensor;
namespace math {
template <typename T, class Context>
CAFFE2_API void
ReduceMin(const int N, const T* X, T* y, Tensor* scratch_ptr, Context* context);
template <typename T, class Context>
CAFFE2_API void
ReduceMax(const int N, const T* X, T* y, Tensor* scratch_ptr, Context* context);
// In all of the reduce functions, X_dims and Y_dims should have ndim elements.
// Each dimension of Y_dims must match the corresponding dimension of X_dims or
// must be equal to 1. The dimensions equal to 1 indicate the dimensions of X to
// be reduced.
// Y = alpha * ReduceMin(X)
template <typename T, class Context>
CAFFE2_API void ReduceMin(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
Context* context);
// Y = alpha * ReduceMax(X)
template <typename T, class Context>
CAFFE2_API void ReduceMax(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
Context* context);
// Y = alpha * ReduceSum(X)
template <typename T, class Context>
CAFFE2_API void ReduceSum(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
Context* context);
// Y = alpha * ReduceMean(X)
template <typename T, class Context>
CAFFE2_API void ReduceMean(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
Context* context);
// Y = alpha * ReduceL1(X)
template <typename T, class Context>
CAFFE2_API void ReduceL1(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
Context* context);
// Y = alpha * ReduceL2(X)
template <typename T, class Context>
CAFFE2_API void ReduceL2(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
Context* context);
// Computes mean and variance over axes.
template <typename T, class Context>
CAFFE2_API void Moments(
@ -19,6 +101,7 @@ CAFFE2_API void Moments(
Context* context);
} // namespace math
} // namespace caffe2
#endif // CAFFE2_UTILS_MATH_REDUCE_H_
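A usage sketch of the documented contract (hypothetical values; include paths assumed): reducing a 2x3x4 tensor over its middle axis means passing Y_dims = {2, 1, 4}, and Y must hold 2 * 1 * 4 elements.

#include <array>
#include "caffe2/core/context.h"
#include "caffe2/utils/math/reduce.h"

void Example(caffe2::CPUContext* context) {
  const std::array<int, 3> X_dims = {2, 3, 4};
  const std::array<int, 3> Y_dims = {2, 1, 4};  // axis 1 is reduced
  std::array<float, 24> X;
  X.fill(1.0f);
  std::array<float, 8> Y;  // 2 * 1 * 4 outputs
  caffe2::math::ReduceMean<float, caffe2::CPUContext>(
      3, X_dims.data(), Y_dims.data(), 1.0f, X.data(), Y.data(), context);
  // Every element of Y is 1.0f: the mean over the reduced axis, scaled by alpha.
}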

View File

@ -375,40 +375,6 @@ C10_EXPORT void Gemv<float, CPUContext>(
cblas_sgemv(CblasRowMajor, trans_A, M, N, alpha, A, N, x, 1, beta, y, 1);
}
#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData, prefix) \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int n, \
const TAlpha alpha, \
const TData* x, \
TData* y, \
CPUContext*) { \
if (y != x) { \
cblas_##prefix##copy(n, x, 1, y, 1); \
} \
if (alpha != TAlpha(1)) { \
cblas_##prefix##scal(n, static_cast<TData>(alpha), y, 1); \
} \
} \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int n, \
const TAlpha* alpha, \
const TData* x, \
TData* y, \
CPUContext*) { \
if (y != x) { \
cblas_##prefix##copy(n, x, 1, y, 1); \
} \
if (*alpha != TAlpha(1)) { \
cblas_##prefix##scal(n, static_cast<TData>(*alpha), y, 1); \
} \
}
CAFFE2_SPECIALIZED_SCALE(float, float, s)
CAFFE2_SPECIALIZED_SCALE(double, double, d)
CAFFE2_SPECIALIZED_SCALE(float, double, d)
#undef CAFFE2_SPECIALIZED_SCALE
#define CAFFE2_SPECIALIZED_DOT(T, prefix) \
template <> \
C10_EXPORT void Dot<T, CPUContext>( \
@ -486,36 +452,6 @@ CAFFE2_SPECIALIZED_AXPBY(float, s)
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int n, \
const TAlpha alpha, \
const TData* x, \
TData* y, \
CPUContext* /* context */) { \
EigenVectorMap<TData>(y, n) = \
ConstEigenVectorMap<TData>(x, n) * static_cast<TData>(alpha); \
} \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int n, \
const TAlpha* alpha, \
const TData* x, \
TData* y, \
CPUContext* /* context */) { \
EigenVectorMap<TData>(y, n) = \
ConstEigenVectorMap<TData>(x, n) * static_cast<TData>(*alpha); \
}
#ifdef CAFFE2_USE_EIGEN_FOR_BLAS
CAFFE2_SPECIALIZED_SCALE(float, float)
CAFFE2_SPECIALIZED_SCALE(double, double)
CAFFE2_SPECIALIZED_SCALE(float, double)
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
CAFFE2_SPECIALIZED_SCALE(std::int32_t, std::int32_t)
CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t)
#undef CAFFE2_SPECIALIZED_SCALE
template <>
C10_EXPORT void GemmBatched<float, CPUContext>(
const CBLAS_TRANSPOSE trans_A,
@ -628,563 +564,6 @@ C10_EXPORT void GemmStridedBatched<float, CPUContext>(
// Eigen or via custom code.
////////////////////////////////////////////////////////////////////////////////
#define CAFFE2_SPECIALIZED_SET(T) \
template <> \
C10_EXPORT void Set<T, CPUContext>( \
const size_t N, const T alpha, T* Y, CPUContext*) { \
if (N == 0) { \
return; \
} \
if (alpha == (T)0) { \
if (Y != nullptr) { \
std::memset(Y, 0, N * sizeof(T)); \
} \
} else { \
EigenVectorMap<T>(Y, N).setConstant(alpha); \
} \
}
CAFFE2_SPECIALIZED_SET(float);
CAFFE2_SPECIALIZED_SET(double);
CAFFE2_SPECIALIZED_SET(int8_t);
CAFFE2_SPECIALIZED_SET(int16_t);
CAFFE2_SPECIALIZED_SET(int);
CAFFE2_SPECIALIZED_SET(int64_t);
CAFFE2_SPECIALIZED_SET(bool);
CAFFE2_SPECIALIZED_SET(char);
CAFFE2_SPECIALIZED_SET(uint8_t);
CAFFE2_SPECIALIZED_SET(uint16_t);
#undef CAFFE2_SPECIALIZED_SET
#define CAFFE2_SPECIALIZED_REDUCEMIN(T) \
template <> \
C10_EXPORT void ReduceMin<T, CPUContext>( \
const int N, \
const T* x, \
T* y, \
Tensor* /*scratch_ptr*/, \
CPUContext* /*context*/) { \
*y = ConstEigenVectorArrayMap<T>(x, N).minCoeff(); \
}
CAFFE2_SPECIALIZED_REDUCEMIN(float)
#undef CAFFE2_SPECIALIZED_REDUCEMIN
#define CAFFE2_SPECIALIZED_REDUCEMAX(T) \
template <> \
C10_EXPORT void ReduceMax<T, CPUContext>( \
const int N, \
const T* x, \
T* y, \
Tensor* /*scratch_ptr*/, \
CPUContext* /*context*/) { \
*y = ConstEigenVectorArrayMap<T>(x, N).maxCoeff(); \
}
CAFFE2_SPECIALIZED_REDUCEMAX(float)
CAFFE2_SPECIALIZED_REDUCEMAX(int32_t)
CAFFE2_SPECIALIZED_REDUCEMAX(int64_t)
#undef CAFFE2_SPECIALIZED_REDUCEMAX
namespace {
template <typename T>
struct MinFunctor {
inline T operator()(const T a, const T b) const {
return std::min(a, b);
}
};
template <typename T>
struct MaxFunctor {
inline T operator()(const T a, const T b) const {
return std::max(a, b);
}
};
template <typename T>
struct L1NormFunctor {
inline T operator()(const T a, const T b) const {
return a + std::abs(b);
}
};
template <typename T>
struct SquaredL2NormFunctor {
inline T operator()(const T a, const T b) const {
return a + b * b;
}
};
#define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenOp) \
template <typename T> \
C10_EXPORT void Rowwise##Func( \
const int rows, const int cols, const T alpha, const T* X, T* Y) { \
EigenVectorMap<T>(Y, rows) = \
ConstEigenMatrixMap<T>(X, cols, rows).colwise().EigenOp() * alpha; \
}
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceSum, sum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMean, mean)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>);
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm)
#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
#define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, EigenOp) \
template <typename T> \
C10_EXPORT void Colwise##Func( \
const int rows, const int cols, const T alpha, const T* X, T* Y) { \
EigenVectorMap<T>(Y, cols) = \
ConstEigenMatrixMap<T>(X, cols, rows).rowwise().EigenOp() * alpha; \
}
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceSum, sum)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMean, mean)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>);
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL2, norm)
#undef DELEGATE_COLWISE_REDUCE_FUNCTION
template <typename T>
C10_EXPORT void BothEndsReduceMin(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().minCoeff();
const T* X_ptr = X + mid * nxt;
// There seems to be a bug in Eigen's Array::min(), so this cannot be
// implemented the same way as BothEndsReduceSum below.
for (int i = 1; i < pre; ++i) {
for (int j = 0; j < mid; ++j) {
Y[j] = std::min(Y[j], ConstEigenVectorArrayMap<T>(X_ptr, nxt).minCoeff());
X_ptr += nxt;
}
}
if (alpha != T(1)) {
Y_arr *= alpha;
}
}
template <typename T>
C10_EXPORT void BothEndsReduceMax(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().maxCoeff();
const T* X_ptr = X + mid * nxt;
for (int i = 1; i < pre; ++i) {
for (int j = 0; j < mid; ++j) {
Y[j] = std::max(Y[j], ConstEigenVectorArrayMap<T>(X_ptr, nxt).maxCoeff());
X_ptr += nxt;
}
}
if (alpha != T(1)) {
Y_arr *= alpha;
}
}
template <typename T>
C10_EXPORT void BothEndsReduceSum(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().sum();
const int stride = mid * nxt;
const T* X_ptr = X + stride;
for (int i = 1; i < pre; ++i) {
Y_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().sum();
X_ptr += stride;
}
if (alpha != T(1)) {
Y_arr *= alpha;
}
}
template <typename T>
C10_EXPORT void BothEndsReduceMean(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().mean();
const int stride = mid * nxt;
const T* X_ptr = X + stride;
for (int i = 1; i < pre; ++i) {
Y_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().mean();
X_ptr += stride;
}
if (alpha / static_cast<T>(pre) != 1) {
Y_arr *= alpha / static_cast<T>(pre);
}
}
template <typename T>
C10_EXPORT void BothEndsReduceL1(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenMatrixMap<T>(X, nxt, mid)
.colwise()
.template lpNorm<1>()
.array();
const int stride = mid * nxt;
const T* X_ptr = X + stride;
for (int i = 1; i < pre; ++i) {
Y_arr += ConstEigenMatrixMap<T>(X_ptr, nxt, mid)
.colwise()
.template lpNorm<1>()
.array();
X_ptr += stride;
}
if (alpha != T(1)) {
Y_arr *= alpha;
}
}
template <typename T>
C10_EXPORT void BothEndsReduceL2(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenMatrixMap<T>(X, nxt, mid).colwise().squaredNorm().array();
const int stride = mid * nxt;
const T* X_ptr = X + stride;
for (int i = 1; i < pre; ++i) {
Y_arr +=
ConstEigenMatrixMap<T>(X_ptr, nxt, mid).colwise().squaredNorm().array();
X_ptr += stride;
}
Y_arr = Y_arr.sqrt() * alpha;
}
template <typename T, class Reducer>
C10_EXPORT void ReduceTensor(
const int ndim,
const int* X_dims,
const int* Y_dims,
const Reducer& reducer,
const T init,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
const int X_size =
std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Set<T, CPUContext>(Y_size, init, Y, context);
std::vector<int> index(ndim, 0);
for (int X_index = 0; X_index < X_size; ++X_index) {
const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data());
Y[Y_index] = reducer(Y[Y_index], X[X_index]);
utils::IncreaseIndexInDims(ndim, X_dims, index.data());
}
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
} // namespace
#define DELEGATE_REDUCE_FUNCTION(T, Func, reducer, init, is_norm) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
CAFFE_ENFORCE_LE(num_axes, num_dims); \
std::vector<int> Y_dims_vector(dims, dims + num_dims); \
for (int i = 0; i < num_axes; ++i) { \
Y_dims_vector[axes[i]] = 1; \
} \
const int* X_dims = dims; \
const int* Y_dims = Y_dims_vector.data(); \
const int X_size = \
std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
const int Y_size = \
std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
if (X_size == 0) { \
Set<T, CPUContext>(Y_size, alpha * init, Y, context); \
return; \
} \
if (alpha == T(0)) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
if (is_norm) { \
Abs<T, CPUContext>(X_size, X, Y, context); \
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
} else { \
Scale<T, T, CPUContext>(Y_size, alpha, X, Y, context); \
} \
return; \
} \
int rows; \
int cols; \
if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
Rowwise##Func<T>(rows, cols, alpha, X, Y); \
return; \
} \
if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
Colwise##Func<T>(rows, cols, alpha, X, Y); \
return; \
} \
int pre; \
int mid; \
int nxt; \
if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
BothEnds##Func<T>(pre, mid, nxt, alpha, X, Y); \
return; \
} \
ReduceTensor( \
num_dims, X_dims, Y_dims, reducer, init, alpha, X, Y, context); \
}
DELEGATE_REDUCE_FUNCTION(
float,
ReduceMin,
MinFunctor<float>(),
std::numeric_limits<float>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
double,
ReduceMin,
MinFunctor<double>(),
std::numeric_limits<double>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceMin,
MinFunctor<std::int32_t>(),
std::numeric_limits<std::int32_t>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceMin,
MinFunctor<std::int64_t>(),
std::numeric_limits<std::int64_t>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
float,
ReduceMax,
MaxFunctor<float>(),
std::numeric_limits<float>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
double,
ReduceMax,
MaxFunctor<double>(),
std::numeric_limits<double>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceMax,
MaxFunctor<std::int32_t>(),
std::numeric_limits<std::int32_t>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceMax,
MaxFunctor<std::int64_t>(),
std::numeric_limits<std::int64_t>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(float, ReduceSum, std::plus<float>(), 0.0f, false)
DELEGATE_REDUCE_FUNCTION(double, ReduceSum, std::plus<double>(), 0.0, false)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceSum,
std::plus<std::int32_t>(),
0,
false)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceSum,
std::plus<std::int64_t>(),
std::int64_t(0),
false)
DELEGATE_REDUCE_FUNCTION(float, ReduceL1, L1NormFunctor<float>(), 0.0f, true)
DELEGATE_REDUCE_FUNCTION(double, ReduceL1, L1NormFunctor<double>(), 0.0, true)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceL1,
L1NormFunctor<std::int32_t>(),
0,
true)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceL1,
L1NormFunctor<std::int64_t>(),
std::int64_t(0),
true)
#undef DELEGATE_REDUCE_FUNCTION
#define CAFFE2_SPECIALIZED_REDUCE_MEAN(T) \
template <> \
C10_EXPORT void ReduceMean<T, CPUContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
CAFFE_ENFORCE_LE(num_axes, num_dims); \
std::vector<int> Y_dims_vector(dims, dims + num_dims); \
for (int i = 0; i < num_axes; ++i) { \
Y_dims_vector[axes[i]] = 1; \
} \
const int* X_dims = dims; \
const int* Y_dims = Y_dims_vector.data(); \
const int X_size = \
std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
const int Y_size = \
std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
if (X_size == 0) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (alpha == T(0)) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
Scale<T, T, CPUContext>(X_size, alpha, X, Y, context); \
return; \
} \
int rows; \
int cols; \
if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
RowwiseReduceMean<T>(rows, cols, alpha, X, Y); \
return; \
} \
if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
ColwiseReduceMean<T>(rows, cols, alpha, X, Y); \
return; \
} \
int pre; \
int mid; \
int nxt; \
if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
BothEndsReduceMean<T>(pre, mid, nxt, alpha, X, Y); \
return; \
} \
const int scale = X_size / Y_size; \
ReduceTensor( \
num_dims, \
X_dims, \
Y_dims, \
std::plus<T>(), \
T(0), \
alpha / static_cast<T>(scale), \
X, \
Y, \
context); \
}
CAFFE2_SPECIALIZED_REDUCE_MEAN(float)
CAFFE2_SPECIALIZED_REDUCE_MEAN(double)
#undef CAFFE2_SPECIALIZED_REDUCE_MEAN
#define CAFFE2_SPECIALIZED_REDUCE_L2(T) \
template <> \
C10_EXPORT void ReduceL2<T, CPUContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
CAFFE_ENFORCE_LE(num_axes, num_dims); \
std::vector<int> Y_dims_vector(dims, dims + num_dims); \
for (int i = 0; i < num_axes; ++i) { \
Y_dims_vector[axes[i]] = 1; \
} \
const int* X_dims = dims; \
const int* Y_dims = Y_dims_vector.data(); \
const int X_size = \
std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
const int Y_size = \
std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
if (X_size == 0) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (alpha == T(0)) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
Abs<T, CPUContext>(X_size, X, Y, context); \
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
return; \
} \
int rows; \
int cols; \
if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
RowwiseReduceL2<T>(rows, cols, alpha, X, Y); \
return; \
} \
if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
ColwiseReduceL2<T>(rows, cols, alpha, X, Y); \
return; \
} \
int pre; \
int mid; \
int nxt; \
if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
BothEndsReduceL2<T>(pre, mid, nxt, alpha, X, Y); \
return; \
} \
ReduceTensor( \
num_dims, \
X_dims, \
Y_dims, \
SquaredL2NormFunctor<T>(), \
T(0), \
T(1), \
X, \
Y, \
context); \
Sqrt<T, CPUContext>(Y_size, Y, Y, context); \
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
}
CAFFE2_SPECIALIZED_REDUCE_L2(float)
CAFFE2_SPECIALIZED_REDUCE_L2(double)
#undef CAFFE2_SPECIALIZED_REDUCE_L2
namespace {
template <typename T>

View File

@ -1482,7 +1482,7 @@ __global__ void SetKernel(const int N, const T alpha, T* Y) {
#define CAFFE2_SPECIALIZED_CUDA_SET(T) \
template <> \
CAFFE2_CUDA_API void Set<T, CUDAContext>( \
const size_t N, const T alpha, T* Y, CUDAContext* context) { \
const int N, const T alpha, T* Y, CUDAContext* context) { \
if (N == 0) { \
return; \
} \
@ -1510,7 +1510,7 @@ CAFFE2_SPECIALIZED_CUDA_SET(uint16_t);
template <>
CAFFE2_CUDA_EXPORT void Set<at::Half, CUDAContext>(
const size_t N,
const int N,
const at::Half alpha,
at::Half* Y,
CUDAContext* context) {
@ -3356,27 +3356,19 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDAImpl(
template <typename T, class Reducer>
CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const int ndim,
const int* X_dims,
const int* Y_dims,
const Reducer& reducer,
const T init,
const T alpha,
const T* X,
T* Y,
CUDAContext* context) {
CAFFE_ENFORCE_LE(num_axes, num_dims);
std::vector<int> Y_dims_vector(dims, dims + num_dims);
for (int i = 0; i < num_axes; ++i) {
Y_dims_vector[axes[i]] = 1;
}
const int* X_dims = dims;
const int* Y_dims = Y_dims_vector.data();
const int X_size =
std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>());
std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
const int Y_size =
std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>());
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
if (X_size == 0) {
Set<T, CUDAContext>(Y_size, alpha * init, Y, context);
return;
@ -3385,13 +3377,13 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
Set<T, CUDAContext>(Y_size, T(0), Y, context);
return;
}
if (std::equal(X_dims, X_dims + num_dims, Y_dims)) {
if (std::equal(X_dims, X_dims + ndim, Y_dims)) {
Scale<T, T, CUDAContext>(X_size, alpha, X, Y, context);
return;
}
int rows;
int cols;
if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {
if (utils::IsRowwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {
RowwiseReduceKernel<T>
<<<std::min(rows, CAFFE_MAXIMUM_NUM_BLOCKS),
CAFFE_CUDA_NUM_THREADS,
@ -3399,7 +3391,7 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
context->cuda_stream()>>>(rows, cols, reducer, init, alpha, X, Y);
return;
}
if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {
if (utils::IsColwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {
ColwiseReduceKernel<T>
<<<std::min(cols, CAFFE_MAXIMUM_NUM_BLOCKS),
CAFFE_CUDA_NUM_THREADS,
@ -3407,20 +3399,19 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
context->cuda_stream()>>>(rows, cols, reducer, init, alpha, X, Y);
return;
}
std::vector<int> transpose_axes(num_dims);
utils::ComputeTransposeAxesForReduceOp(
num_dims, num_axes, axes, transpose_axes.data());
std::vector<int> axes(ndim);
utils::ComputeTransposeAxesForReduceOp(ndim, Y_dims, axes.data());
const int outer_size = Y_size;
const int inner_size = X_size / Y_size;
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(
num_dims,
ndim,
ReduceTensorCUDAImpl,
T,
Reducer,
outer_size,
inner_size,
dims,
transpose_axes.data(),
X_dims,
axes.data(),
reducer,
init,
alpha,
@ -3434,19 +3425,17 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(T) \
template <> \
CAFFE2_CUDA_EXPORT void ReduceMin<T, CUDAContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const int ndim, \
const int* X_dims, \
const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CUDAContext* context) { \
ReduceTensorCUDA( \
num_dims, \
dims, \
num_axes, \
axes, \
ndim, \
X_dims, \
Y_dims, \
cub::Min(), \
std::numeric_limits<T>::max(), \
alpha, \
@ -3463,19 +3452,17 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(double)
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(T) \
template <> \
CAFFE2_CUDA_EXPORT void ReduceMax<T, CUDAContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const int ndim, \
const int* X_dims, \
const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CUDAContext* context) { \
ReduceTensorCUDA( \
num_dims, \
dims, \
num_axes, \
axes, \
ndim, \
X_dims, \
Y_dims, \
cub::Max(), \
std::numeric_limits<T>::lowest(), \
alpha, \
@ -3489,28 +3476,18 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(float)
CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(double)
#undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \
template <> \
CAFFE2_CUDA_EXPORT void ReduceSum<T, CUDAContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const T alpha, \
const T* X, \
T* Y, \
CUDAContext* context) { \
ReduceTensorCUDA( \
num_dims, \
dims, \
num_axes, \
axes, \
cub::Sum(), \
T(0), \
alpha, \
X, \
Y, \
context); \
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \
template <> \
CAFFE2_CUDA_EXPORT void ReduceSum<T, CUDAContext>( \
const int ndim, \
const int* X_dims, \
const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CUDAContext* context) { \
ReduceTensorCUDA( \
ndim, X_dims, Y_dims, cub::Sum(), T(0), alpha, X, Y, context); \
}
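// Illustrative only: a minimal caller-side sketch of the dims-based ReduceSum
// form defined above, reducing the inner axis of a 2 x 3 tensor. The device
// buffers (d_X, d_Y) and the CUDAContext instance are assumed to already
// exist.
//
//   const int X_dims[] = {2, 3};
//   const int Y_dims[] = {2, 1};  // the reduced axis has extent 1
//   ReduceSum<float, CUDAContext>(
//       2, X_dims, Y_dims, 1.0f, d_X, d_Y, &cuda_context);
//   // For X = {1, 2, 3, 4, 5, 6} this yields Y = {6, 15}.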
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int32_t)
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int64_t)
@ -3521,23 +3498,23 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(double)
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(T) \
template <> \
CAFFE2_CUDA_EXPORT void ReduceMean<T, CUDAContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const int ndim, \
const int* X_dims, \
const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CUDAContext* context) { \
int scale = 1; \
for (int i = 0; i < num_axes; ++i) { \
scale *= dims[axes[i]]; \
for (int i = 0; i < ndim; ++i) { \
if (Y_dims[i] == 1) { \
scale *= X_dims[i]; \
} \
} \
ReduceTensorCUDA( \
num_dims, \
dims, \
num_axes, \
axes, \
ndim, \
X_dims, \
Y_dims, \
cub::Sum(), \
T(0), \
alpha / static_cast<T>(scale), \

View File

@ -351,288 +351,6 @@ INSTANTIATE_TEST_CASE_P(
GemmBatchedGPUTest,
testing::Combine(testing::Bool(), testing::Bool()));
class ReduceTensorGPUTest : public testing::Test {
protected:
void SetUp() override {
if (!HasCudaGPU()) {
return;
}
option_.set_device_type(PROTO_CUDA);
cuda_context_ = make_unique<CUDAContext>(option_);
Blob* blob_x = ws_.CreateBlob("X");
Blob* blob_y = ws_.CreateBlob("Y");
X_ = BlobGetMutableTensor(blob_x, CUDA);
Y_ = BlobGetMutableTensor(blob_y, CUDA);
}
void SetUpData(
const std::vector<int>& X_dims,
const std::vector<int>& axes,
const std::vector<float>& X_data) {
std::vector<int> Y_dims = X_dims;
for (const int axis : axes) {
Y_dims[axis] = 1;
}
X_->Resize(X_dims);
Y_->Resize(Y_dims);
ASSERT_EQ(X_data.size(), X_->numel());
cuda_context_->CopyFromCPU<float>(
X_data.size(), X_data.data(), X_->mutable_data<float>());
}
void VerifyResult(const std::vector<float>& expected_output) {
Blob* blob_y_host = ws_.CreateBlob("Y_host");
auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
Y_host->CopyFrom(*Y_);
ASSERT_EQ(expected_output.size(), Y_host->numel());
for (std::size_t i = 0; i < expected_output.size(); ++i) {
EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
}
}
template <class ReduceFunc>
void RunRedcueTensorTest(
const ReduceFunc& reduce_func,
const std::vector<int>& X_dims,
const std::vector<int>& axes,
const std::vector<float>& X_data,
const std::vector<float>& Y_data) {
SetUpData(X_dims, axes, X_data);
reduce_func(
X_dims.size(),
X_dims.data(),
axes.size(),
axes.data(),
1.0f,
X_->data<float>(),
Y_->mutable_data<float>(),
cuda_context_.get());
VerifyResult(Y_data);
}
Workspace ws_;
DeviceOption option_;
std::unique_ptr<CUDAContext> cuda_context_;
Tensor* X_ = nullptr;
Tensor* Y_ = nullptr;
};
TEST_F(ReduceTensorGPUTest, ReduceMinGPUTest) {
if (!HasCudaGPU()) {
return;
}
const auto& reduce_min = [](const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const float alpha,
const float* X,
float* Y,
CUDAContext* context) {
return math::ReduceMin<float, CUDAContext>(
num_dims, dims, num_axes, axes, alpha, X, Y, context);
};
// Test for 1D tensor.
RunRedcueTensorTest(reduce_min, {3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
reduce_min,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{1.0f, 4.0f});
RunRedcueTensorTest(
reduce_min,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{1.0f, 2.0f, 3.0f});
RunRedcueTensorTest(
reduce_min, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f});
// Test for 3D tensor.
RunRedcueTensorTest(
reduce_min,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{1.0f, 5.0f});
RunRedcueTensorTest(
reduce_min,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{1.0f, 2.0f});
RunRedcueTensorTest(
reduce_min,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{1.0f, 3.0f});
}
TEST_F(ReduceTensorGPUTest, ReduceMaxGPUTest) {
if (!HasCudaGPU()) {
return;
}
const auto& reduce_max = [](const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const float alpha,
const float* X,
float* Y,
CUDAContext* context) {
return math::ReduceMax<float, CUDAContext>(
num_dims, dims, num_axes, axes, alpha, X, Y, context);
};
// Test for 1D tensor.
RunRedcueTensorTest(reduce_max, {3}, {0}, {1.0f, 2.0f, 3.0f}, {3.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
reduce_max,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{3.0f, 6.0f});
RunRedcueTensorTest(
reduce_max,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{4.0f, 5.0f, 6.0f});
RunRedcueTensorTest(
reduce_max, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {6.0f});
// Test for 3D tensor.
RunRedcueTensorTest(
reduce_max,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{4.0f, 8.0f});
RunRedcueTensorTest(
reduce_max,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{7.0f, 8.0f});
RunRedcueTensorTest(
reduce_max,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{6.0f, 8.0f});
}
TEST_F(ReduceTensorGPUTest, ReduceSumGPUTest) {
if (!HasCudaGPU()) {
return;
}
// Test for 1D tensor.
RunRedcueTensorTest(
math::ReduceSum<float, CUDAContext>,
{3},
{0},
{1.0f, 2.0f, 3.0f},
{6.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
math::ReduceSum<float, CUDAContext>,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{6.0f, 15.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CUDAContext>,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{5.0f, 7.0f, 9.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CUDAContext>,
{2, 3},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{21.0f});
// Test for 3D tensor.
RunRedcueTensorTest(
math::ReduceSum<float, CUDAContext>,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{10.0f, 26.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CUDAContext>,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{16.0f, 20.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CUDAContext>,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{14.0f, 22.0f});
}
TEST_F(ReduceTensorGPUTest, ReduceMeanGPUTest) {
if (!HasCudaGPU()) {
return;
}
// Test for 1D tensor.
RunRedcueTensorTest(
math::ReduceMean<float, CUDAContext>,
{3},
{0},
{1.0f, 2.0f, 3.0f},
{2.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
math::ReduceMean<float, CUDAContext>,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{2.0f, 5.0f});
RunRedcueTensorTest(
math::ReduceMean<float, CUDAContext>,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{2.5f, 3.5f, 4.5f});
RunRedcueTensorTest(
math::ReduceMean<float, CUDAContext>,
{2, 3},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{3.5f});
// Test for 3D tensor.
RunRedcueTensorTest(
math::ReduceMean<float, CUDAContext>,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{2.5f, 6.5f});
RunRedcueTensorTest(
math::ReduceMean<float, CUDAContext>,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{4.0f, 5.0f});
RunRedcueTensorTest(
math::ReduceMean<float, CUDAContext>,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{3.5f, 5.5f});
}
class BroadcastGPUTest : public testing::Test {
protected:
void SetUp() override {

View File

@ -426,253 +426,6 @@ TEST(MathTest, FloatToHalfConversion) {
namespace {
class ReduceTensorTest : public testing::Test {
protected:
void SetUp() override {
cpu_context_ = make_unique<CPUContext>(option_);
}
template <class ReduceFunc>
void RunRedcueTensorTest(
const ReduceFunc& reduce_func,
const std::vector<int>& X_dims,
const std::vector<int>& axes,
const std::vector<float>& X_data,
const std::vector<float>& Y_data) {
std::vector<int> Y_dims = X_dims;
for (const int axis : axes) {
Y_dims[axis] = 1;
}
std::vector<int64_t> X_dims_64;
std::vector<int64_t> Y_dims_64;
std::copy(X_dims.cbegin(), X_dims.cend(), std::back_inserter(X_dims_64));
std::copy(Y_dims.cbegin(), Y_dims.cend(), std::back_inserter(Y_dims_64));
ReinitializeTensor(&X_, X_dims_64, at::dtype<float>().device(CPU));
ReinitializeTensor(&Y_, Y_dims_64, at::dtype<float>().device(CPU));
ASSERT_EQ(X_data.size(), X_.numel());
cpu_context_->CopyFromCPU<float>(
X_data.size(), X_data.data(), X_.mutable_data<float>());
reduce_func(
X_dims.size(),
X_dims.data(),
axes.size(),
axes.data(),
1.0f,
X_.data<float>(),
Y_.mutable_data<float>(),
cpu_context_.get());
ASSERT_EQ(Y_data.size(), Y_.numel());
for (int i = 0; i < Y_.numel(); ++i) {
EXPECT_FLOAT_EQ(Y_data[i], Y_.data<float>()[i]);
}
}
DeviceOption option_;
std::unique_ptr<CPUContext> cpu_context_;
Tensor X_;
Tensor Y_;
};
TEST_F(ReduceTensorTest, ReduceMinTest) {
const auto& reduce_min = [](const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const float alpha,
const float* X,
float* Y,
CPUContext* context) {
return math::ReduceMin<float, CPUContext>(
num_dims, dims, num_axes, axes, alpha, X, Y, context);
};
// Test for 1D tensor.
RunRedcueTensorTest(reduce_min, {3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
reduce_min,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{1.0f, 4.0f});
RunRedcueTensorTest(
reduce_min,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{1.0f, 2.0f, 3.0f});
RunRedcueTensorTest(
reduce_min, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f});
// Test for 3D tensor.
RunRedcueTensorTest(
reduce_min,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{1.0f, 5.0f});
RunRedcueTensorTest(
reduce_min,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{1.0f, 2.0f});
RunRedcueTensorTest(
reduce_min,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{1.0f, 3.0f});
}
TEST_F(ReduceTensorTest, ReduceMaxTest) {
const auto& reduce_max = [](const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const float alpha,
const float* X,
float* Y,
CPUContext* context) {
return math::ReduceMax<float, CPUContext>(
num_dims, dims, num_axes, axes, alpha, X, Y, context);
};
// Test for 1D tensor.
RunRedcueTensorTest(reduce_max, {3}, {0}, {1.0f, 2.0f, 3.0f}, {3.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
reduce_max,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{3.0f, 6.0f});
RunRedcueTensorTest(
reduce_max,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{4.0f, 5.0f, 6.0f});
RunRedcueTensorTest(
reduce_max, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {6.0f});
// Test for 3D tensor.
RunRedcueTensorTest(
reduce_max,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{4.0f, 8.0f});
RunRedcueTensorTest(
reduce_max,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{7.0f, 8.0f});
RunRedcueTensorTest(
reduce_max,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{6.0f, 8.0f});
}
TEST_F(ReduceTensorTest, ReduceSumTest) {
// Test for 1D tensor.
RunRedcueTensorTest(
math::ReduceSum<float, CPUContext>, {3}, {0}, {1.0f, 2.0f, 3.0f}, {6.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
math::ReduceSum<float, CPUContext>,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{6.0f, 15.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CPUContext>,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{5.0f, 7.0f, 9.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CPUContext>,
{2, 3},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{21.0f});
// Test for 3D tensor.
RunRedcueTensorTest(
math::ReduceSum<float, CPUContext>,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{10.0f, 26.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CPUContext>,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{16.0f, 20.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CPUContext>,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{14.0f, 22.0f});
}
TEST_F(ReduceTensorTest, ReduceMeanTest) {
// Test for 1D tensor.
RunRedcueTensorTest(
math::ReduceMean<float, CPUContext>,
{3},
{0},
{1.0f, 2.0f, 3.0f},
{2.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
math::ReduceMean<float, CPUContext>,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{2.0f, 5.0f});
RunRedcueTensorTest(
math::ReduceMean<float, CPUContext>,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{2.5f, 3.5f, 4.5f});
RunRedcueTensorTest(
math::ReduceMean<float, CPUContext>,
{2, 3},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{3.5f});
// Test for 3D tensor.
RunRedcueTensorTest(
math::ReduceMean<float, CPUContext>,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{2.5f, 6.5f});
RunRedcueTensorTest(
math::ReduceMean<float, CPUContext>,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{4.0f, 5.0f});
RunRedcueTensorTest(
math::ReduceMean<float, CPUContext>,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{3.5f, 5.5f});
}
class BroadcastTest : public testing::Test {
protected:
void SetUp() override {