Separate reduce functions from math (#16929)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/16929

Separate the CPU reduce functions from the monolithic math library: ReduceMin, ReduceMax, ReduceSum, ReduceMean, ReduceL1, and ReduceL2 move into caffe2/utils/math/reduce.h and its implementation, and their signatures now take the full output shape (Y_dims, with reduced dimensions set to 1) instead of a dims array plus a list of reduce axes. Set and Scale move to the new elementwise math header, and call sites (elementwise gradients, ExpandGradientOp, pooling, ReduceOp, the CUDA reductions) are updated to pass the output dims directly.

i-am-not-moving-c2-to-c10

Reviewed By: houseroad

Differential Revision: D13999469

fbshipit-source-id: bd628b15a6e3c1f04cc62aefffb0110690e1c0d1
Xiaomeng Yang 2019-02-13 17:47:49 -08:00 committed by Facebook Github Bot
parent 9b7f3da74b
commit 3a34f443c5
18 changed files with 1002 additions and 1460 deletions
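At the API level, the change is a signature swap across all six reduce entry points: the (num_axes, axes) pair is replaced by the output shape Y_dims, whose entries of 1 mark the reduced dimensions. A condensed before/after for ReduceSum (CAFFE2_API and argument line breaks trimmed for brevity):

// Removed from the monolithic math header:
template <typename T, class Context>
void ReduceSum(
    const int num_dims, const int* dims,
    const int num_axes, const int* axes,
    const T alpha, const T* X, T* Y, Context* context);

// Added to caffe2/utils/math/reduce.h (per its include guard):
template <typename T, class Context>
void ReduceSum(
    const int ndim, const int* X_dims, const int* Y_dims,
    const T alpha, const T* X, T* Y, Context* context);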

View File

@ -14,7 +14,7 @@ OPERATOR_SCHEMA(ATen);
namespace math {
template <>
void Set<at::Half, CPUContext>(
const size_t /*N*/,
const int /*N*/,
const at::Half h,
at::Half* v,
CPUContext* c) {

View File

@ -47,15 +47,14 @@ struct AddFunctor {
const std::vector<int> C_dims =
elementwise_ops_utils::ComputeBinaryBroadcastForwardDims(
A_dims, B_dims);
std::vector<int> A_axes;
std::vector<int> B_axes;
elementwise_ops_utils::ComputeBinaryBroadcastBackwardAxes(
A_dims, B_dims, &A_axes, &B_axes);
std::vector<int> A_back_dims;
std::vector<int> B_back_dims;
elementwise_ops_utils::ComputeBinaryBroadcastBackwardDims(
A_dims, B_dims, &A_back_dims, &B_back_dims);
math::ReduceSum(
C_dims.size(),
C_dims.data(),
A_axes.size(),
A_axes.data(),
A_back_dims.data(),
TGrad(1),
dC,
dA,
@ -63,8 +62,7 @@ struct AddFunctor {
math::ReduceSum(
C_dims.size(),
C_dims.data(),
B_axes.size(),
B_axes.data(),
B_back_dims.data(),
TGrad(1),
dC,
dB,

View File

@ -108,5 +108,17 @@ void ComputeBinaryBroadcastBackwardAxes(
std::reverse(B_axes->begin(), B_axes->end());
}
void ComputeBinaryBroadcastBackwardDims(
const std::vector<int>& A_dims,
const std::vector<int>& B_dims,
std::vector<int>* A_back_dims,
std::vector<int>* B_back_dims) {
const int ndim = std::max(A_dims.size(), B_dims.size());
A_back_dims->assign(ndim, 1);
B_back_dims->assign(ndim, 1);
std::copy(A_dims.crbegin(), A_dims.crend(), A_back_dims->rbegin());
std::copy(B_dims.crbegin(), B_dims.crend(), B_back_dims->rbegin());
}
} // namespace elementwise_ops_utils
} // namespace caffe2
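For reference, a quick hypothetical use of the new helper (the include path is assumed): it right-aligns each operand's shape against the broadcast rank and pads with 1s, producing exactly the Y_dims form that the new reduce functions consume.

#include <vector>
#include "caffe2/operators/elementwise_ops_utils.h"  // path assumed

void Example() {
  const std::vector<int> A_dims = {2, 3, 4};
  const std::vector<int> B_dims = {3, 1};
  std::vector<int> A_back_dims;
  std::vector<int> B_back_dims;
  caffe2::elementwise_ops_utils::ComputeBinaryBroadcastBackwardDims(
      A_dims, B_dims, &A_back_dims, &B_back_dims);
  // A_back_dims == {2, 3, 4}; B_back_dims == {1, 3, 1}.
  // Reducing dC (shape {2, 3, 4}) with Y_dims == B_back_dims yields dB.
}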

View File

@ -23,6 +23,12 @@ CAFFE2_API void ComputeBinaryBroadcastBackwardAxes(
std::vector<int>* A_axes,
std::vector<int>* B_axes);
CAFFE2_API void ComputeBinaryBroadcastBackwardDims(
const std::vector<int>& A_dims,
const std::vector<int>& B_dims,
std::vector<int>* A_back_dims,
std::vector<int>* B_back_dims);
} // namespace elementwise_ops_utils
} // namespace caffe2

View File

@ -47,15 +47,14 @@ struct SubFunctor {
const std::vector<int> C_dims =
elementwise_ops_utils::ComputeBinaryBroadcastForwardDims(
A_dims, B_dims);
std::vector<int> A_axes;
std::vector<int> B_axes;
elementwise_ops_utils::ComputeBinaryBroadcastBackwardAxes(
A_dims, B_dims, &A_axes, &B_axes);
std::vector<int> A_back_dims;
std::vector<int> B_back_dims;
elementwise_ops_utils::ComputeBinaryBroadcastBackwardDims(
A_dims, B_dims, &A_back_dims, &B_back_dims);
math::ReduceSum(
C_dims.size(),
C_dims.data(),
A_axes.size(),
A_axes.data(),
A_back_dims.data(),
TGrad(1),
dC,
dA,
@ -63,8 +62,7 @@ struct SubFunctor {
math::ReduceSum(
C_dims.size(),
C_dims.data(),
B_axes.size(),
B_axes.data(),
B_back_dims.data(),
TGrad(-1),
dC,
dB,

View File

@ -94,11 +94,14 @@ class ExpandGradientOp final : public Operator<Context> {
axes.push_back(i);
}
}
std::vector<int> X_dims = dY_dims;
for (const int axis : axes) {
X_dims[axis] = 1;
}
math::ReduceSum<T, Context>(
dY_dims.size(),
dY_dims.data(),
axes.size(),
axes.data(),
X_dims.data(),
T(1),
dY.template data<T>(),
dX->template mutable_data<T>(),

View File

@ -572,10 +572,10 @@ bool AveragePoolFunctor<CPUContext>::
const float* X,
float* Y,
CPUContext* context) const {
const std::array<int, 2> dims = {N * C, HxW};
const int axis = 1;
const std::array<int, 2> X_dims = {N * C, HxW};
const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMean<float, CPUContext>(
2, dims.data(), 1, &axis, 1.0f, X, Y, context);
2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
@ -720,10 +720,10 @@ bool MaxPoolFunctor<CPUContext>::
const float* X,
float* Y,
CPUContext* context) const {
const std::array<int, 2> dims = {N * C, HxW};
const int axis = 1;
const std::array<int, 2> X_dims = {N * C, HxW};
const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMax<float, CPUContext>(
2, dims.data(), 1, &axis, 1.0f, X, Y, context);
2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}

View File

@ -698,10 +698,10 @@ bool AveragePoolFunctor<CUDAContext>::
const float* X,
float* Y,
CUDAContext* context) const {
const std::array<int, 2> dims = {N * C, HxW};
const int axis = 1;
const std::array<int, 2> X_dims = {N * C, HxW};
const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMean<float, CUDAContext>(
2, dims.data(), 1, &axis, 1.0f, X, Y, context);
2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
@ -1756,10 +1756,10 @@ bool MaxPoolFunctor<CUDAContext>::
const float* X,
float* Y,
CUDAContext* context) const {
const std::array<int, 2> dims = {N * C, HxW};
const int axis = 1;
const std::array<int, 2> X_dims = {N * C, HxW};
const std::array<int, 2> Y_dims = {N * C, 1};
math::ReduceMax<float, CUDAContext>(
2, dims.data(), 1, &axis, 1.0f, X, Y, context);
2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}
@ -1773,10 +1773,10 @@ bool MaxPoolFunctor<CUDAContext>::
const float* X,
float* Y,
CUDAContext* context) const {
const std::array<int, 3> dims = {N, HxW, C};
const int axis = 1;
const std::array<int, 3> X_dims = {N, HxW, C};
const std::array<int, 3> Y_dims = {N, 1, C};
math::ReduceMax<float, CUDAContext>(
3, dims.data(), 1, &axis, 1.0f, X, Y, context);
3, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
return true;
}

View File

@ -29,13 +29,13 @@ class ReduceOp final : public Operator<Context> {
template <typename T>
bool DoRunWithType() {
const auto& X = Input(0);
const int ndim = X.dim();
const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
if (axes_.empty()) {
axes_.resize(ndim);
std::iota(axes_.begin(), axes_.end(), 0);
} else {
for (auto& axis: axes_) {
for (auto& axis : axes_) {
axis = X.canonical_axis_index(axis);
}
std::sort(axes_.begin(), axes_.end());
@ -45,24 +45,29 @@ class ReduceOp final : public Operator<Context> {
ndim,
"Axes ids must be smaller than the dimensions of input.");
}
const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
std::vector<int64_t> Y_dims;
Y_dims.reserve(ndim);
std::vector<int64_t> output_dims;
output_dims.reserve(ndim);
std::size_t cur_axis = 0;
for (int i = 0; i < ndim; ++i) {
if (cur_axis < axes_.size() && i == axes_[cur_axis]) {
if (keep_dims_) {
Y_dims.push_back(1);
output_dims.push_back(1);
}
++cur_axis;
} else {
Y_dims.push_back(X_dims[i]);
output_dims.push_back(X_dims[i]);
}
}
auto* Y = Output(0, Y_dims, at::dtype<T>());
auto* Y = Output(0, output_dims, at::dtype<T>());
std::vector<int> Y_dims = X_dims;
for (const int axis : axes_) {
Y_dims[axis] = 1;
}
return reducer_.template Forward<T>(
X_dims,
axes_,
Y_dims,
X.template data<T>(),
Y->template mutable_data<T>(),
&context_);
@ -71,7 +76,7 @@ class ReduceOp final : public Operator<Context> {
private:
std::vector<int> axes_;
const int keep_dims_;
Reducer reducer_{};
const Reducer reducer_{};
};
template <typename InputTypes, class Context, class Reducer>
@ -98,7 +103,7 @@ class ReduceGradientOp final : public Operator<Context> {
axes_.resize(ndim);
std::iota(axes_.begin(), axes_.end(), 0);
} else {
for (auto& axis: axes_) {
for (auto& axis : axes_) {
axis = X.canonical_axis_index(axis);
}
std::sort(axes_.begin(), axes_.end());
@ -126,23 +131,22 @@ class ReduceGradientOp final : public Operator<Context> {
private:
std::vector<int> axes_;
Reducer reducer_{};
const Reducer reducer_{};
};
template <class Context>
struct MinReducer {
template <typename T>
bool Forward(
const std::vector<int>& dims,
const std::vector<int>& axes,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceMin<T, Context>(
dims.size(),
dims.data(),
axes.size(),
axes.data(),
X_dims.size(),
X_dims.data(),
Y_dims.data(),
T(1),
X_data,
Y_data,
@ -165,16 +169,15 @@ template <class Context>
struct MaxReducer {
template <typename T>
bool Forward(
const std::vector<int>& dims,
const std::vector<int>& axes,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceMax<T, Context>(
dims.size(),
dims.data(),
axes.size(),
axes.data(),
X_dims.size(),
X_dims.data(),
Y_dims.data(),
T(1),
X_data,
Y_data,
@ -197,16 +200,15 @@ template <class Context>
struct SumReducer {
template <typename T>
bool Forward(
const std::vector<int>& dims,
const std::vector<int>& axes,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceSum<T, Context>(
dims.size(),
dims.data(),
axes.size(),
axes.data(),
X_dims.size(),
X_dims.data(),
Y_dims.data(),
T(1),
X_data,
Y_data,
@ -240,16 +242,15 @@ template <class Context>
struct MeanReducer {
template <typename T>
bool Forward(
const std::vector<int>& dims,
const std::vector<int>& axes,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceMean<T, Context>(
dims.size(),
dims.data(),
axes.size(),
axes.data(),
X_dims.size(),
X_dims.data(),
Y_dims.data(),
T(1),
X_data,
Y_data,
@ -287,16 +288,15 @@ template <class Context>
struct L1Reducer {
template <typename T>
bool Forward(
const std::vector<int>& dims,
const std::vector<int>& axes,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceL1<T, Context>(
dims.size(),
dims.data(),
axes.size(),
axes.data(),
X_dims.size(),
X_dims.data(),
Y_dims.data(),
T(1),
X_data,
Y_data,
@ -319,16 +319,15 @@ template <class Context>
struct L2Reducer {
template <typename T>
bool Forward(
const std::vector<int>& dims,
const std::vector<int>& axes,
const std::vector<int>& X_dims,
const std::vector<int>& Y_dims,
const T* X_data,
T* Y_data,
Context* context) const {
math::ReduceL2<T, Context>(
dims.size(),
dims.data(),
axes.size(),
axes.data(),
X_dims.size(),
X_dims.data(),
Y_dims.data(),
T(1),
X_data,
Y_data,
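To make the two shape vectors in DoRunWithType concrete, a small standalone sketch of the bookkeeping (hypothetical dims, not from this diff): the operator output is allocated with output_dims, while the math reducer always sees a Y_dims of the same rank as X_dims.

#include <cstddef>
#include <cstdint>
#include <vector>

void Example() {
  const std::vector<int> X_dims = {2, 3, 4};
  const std::vector<int> axes = {1};       // sorted, canonicalized
  const bool keep_dims = false;

  std::vector<std::int64_t> output_dims;   // shape used to allocate the op output
  std::vector<int> Y_dims = X_dims;        // shape handed to the math reducer
  std::size_t cur_axis = 0;
  for (int i = 0; i < static_cast<int>(X_dims.size()); ++i) {
    if (cur_axis < axes.size() && i == axes[cur_axis]) {
      if (keep_dims) {
        output_dims.push_back(1);
      }
      Y_dims[i] = 1;
      ++cur_axis;
    } else {
      output_dims.push_back(X_dims[i]);
    }
  }
  // output_dims == {2, 4} here ({2, 1, 4} with keep_dims == true);
  // Y_dims == {2, 1, 4} either way, same rank as X_dims.
}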

View File

@ -31,34 +31,34 @@ class CAFFE2_API DefaultEngine {};
namespace math {
#define C10_DECLARE_COMPARE_OP(Comp) \
template <typename T, class Context, bool kBroadcast1st = false> \
void Rowwise##Comp( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
bool* C, \
Context* context); \
\
template <typename T, class Context, bool kBroadcast1st = false> \
void Colwise##Comp( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
bool* C, \
Context* context); \
\
template <typename T, class Context> \
void Comp( \
const int A_ndim, \
const int* A_dims, \
const int B_ndim, \
const int* B_dims, \
const T* A, \
const T* B, \
bool* C, \
#define C10_DECLARE_COMPARE_OP(Comp) \
template <typename T, class Context, bool kBroadcast1st = false> \
void Rowwise##Comp( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
bool* C, \
Context* context); \
\
template <typename T, class Context, bool kBroadcast1st = false> \
void Colwise##Comp( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
bool* C, \
Context* context); \
\
template <typename T, class Context> \
void Comp( \
const int A_ndim, \
const int* A_dims, \
const int B_ndim, \
const int* B_dims, \
const T* A, \
const T* B, \
bool* C, \
Context* context);
C10_DECLARE_COMPARE_OP(EQ)
@ -115,80 +115,6 @@ C10_DECLARE_BINARY_OP(BitwiseXor)
#undef C10_DECLARE_BINARY_OP
template <typename T, class Context>
CAFFE2_API void
ReduceMin(const int N, const T* x, T* y, Tensor* scratch_ptr, Context* context);
template <typename T, class Context>
CAFFE2_API void
ReduceMax(const int N, const T* x, T* y, Tensor* scratch_ptr, Context* context);
template <typename T, class Context>
CAFFE2_API void ReduceMin(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const T alpha,
const T* X,
T* Y,
Context* context);
template <typename T, class Context>
CAFFE2_API void ReduceMax(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const T alpha,
const T* X,
T* Y,
Context* context);
template <typename T, class Context>
CAFFE2_API void ReduceSum(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const T alpha,
const T* X,
T* Y,
Context* context);
template <typename T, class Context>
CAFFE2_API void ReduceMean(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const T alpha,
const T* X,
T* Y,
Context* context);
template <typename T, class Context>
CAFFE2_API void ReduceL1(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const T alpha,
const T* X,
T* Y,
Context* context);
template <typename T, class Context>
CAFFE2_API void ReduceL2(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const T alpha,
const T* X,
T* Y,
Context* context);
// Broadcasts X with X_dims to Y with Y_dims.
template <typename T, class Context>
CAFFE2_API void Broadcast(
@ -337,9 +263,6 @@ CAFFE2_API void Gemv(
Context* context,
TensorProto::DataType math_type = TensorProto_DataType_FLOAT);
template <typename T, class Context>
CAFFE2_API void Set(const size_t N, const T alpha, T* X, Context* context);
template <typename T, class Context>
CAFFE2_API void
RandUniform(const size_t n, const T a, const T b, T* r, Context* context);
@ -409,25 +332,6 @@ CAFFE2_API void Select(
T* y,
Context* context);
template <typename TAlpha, typename TData, class Context>
CAFFE2_API void Scale(
const int N,
const TAlpha alpha,
const TData* x,
TData* y,
Context* context);
// Different from the Scale function above, if alpha is passed in
// as a pointer, we will assume that it lives on the Context device,
// for example on GPU.
template <typename TAlpha, typename TData, class Context>
CAFFE2_API void Scale(
const int N,
const TAlpha* alpha,
const TData* x,
TData* y,
Context* context);
template <typename T, class Context>
CAFFE2_API void
Axpy(const int N, const float alpha, const T* x, T* y, Context* context);

View File

@ -3,6 +3,10 @@
#include <algorithm>
#include <functional>
#ifdef CAFFE2_USE_ACCELERATE
#include <Accelerate/Accelerate.h>
#endif // CAFFE2_USE_ACCELERATE
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL
@ -73,25 +77,25 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(float, Erf, vsErf)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Erf, vdErf)
#undef DELEGATE_SIMPLE_UNARY_FUNCTION
#define DELEGATE_SINCOS_FUNCTION(T, MKLFunc) \
#define DELEGATE_SINCOS(T, MKLFunc) \
template <> \
C10_EXPORT void SinCos<T, CPUContext>( \
const int N, const T* X, T* S, T* C, CPUContext* /* context */) { \
MKLFunc(N, X, S, C); \
}
DELEGATE_SINCOS_FUNCTION(float, vsSinCos)
DELEGATE_SINCOS_FUNCTION(double, vdSinCos)
#undef DELEGATE_SINCOS_FUNCTION
DELEGATE_SINCOS(float, vsSinCos)
DELEGATE_SINCOS(double, vdSinCos)
#undef DELEGATE_SINCOS
#define DELEGATE_POWX_FUNCTION(T, MKLFunc) \
#define DELEGATE_POWX(T, MKLFunc) \
template <> \
C10_EXPORT void Powx<T, CPUContext>( \
const int N, const T* A, const T b, T* Y, CPUContext* /* context */) { \
MKLFunc(N, A, b, Y); \
}
DELEGATE_POWX_FUNCTION(float, vsPowx)
DELEGATE_POWX_FUNCTION(double, vdPowx)
#undef DELEGATE_POWX_FUNCTION
DELEGATE_POWX(float, vsPowx)
DELEGATE_POWX(double, vdPowx)
#undef DELEGATE_POWX
#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Func, MKLFunc) \
template <> \
@ -228,6 +232,155 @@ DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(double, Div, /)
#endif // CAFFE2_USE_MKL
////////////////////////////////////////////////////////////////////////////////
// BLAS alternatives.
// Depending on whether we have specified an external BLAS library or not, we
// will delegate the Caffe math functions that are BLAS-related to either the
// CBLAS call or the Eigen implementation.
////////////////////////////////////////////////////////////////////////////////
#ifdef CAFFE2_USE_EIGEN_FOR_BLAS
#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (X == Y) { \
EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(alpha); \
} else { \
EigenVectorArrayMap<TData>(Y, N) = \
ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \
} \
} \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha* alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (X == Y) { \
EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(*alpha); \
} else { \
EigenVectorArrayMap<TData>(Y, N) = \
ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \
} \
}
CAFFE2_SPECIALIZED_SCALE(float, float)
CAFFE2_SPECIALIZED_SCALE(double, double)
CAFFE2_SPECIALIZED_SCALE(float, double)
#undef CAFFE2_SPECIALIZED_SCALE
#else // CAFFE2_USE_EIGEN_FOR_BLAS
#ifdef CAFFE2_USE_MKL
#define DELEGATE_SCALE(TAlpha, TData, MKLFunc1, MKLFunc2) \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (Y == X) { \
MKLFunc1(N, static_cast<TData>(alpha), Y, 1); \
} else { \
MKLFunc2(N, static_cast<TData>(alpha), X, 1, TData(0), Y, 1); \
} \
} \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha* alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (Y == X) { \
MKLFunc1(N, static_cast<TData>(*alpha), Y, 1); \
} else { \
MKLFunc2(N, static_cast<TData>(*alpha), X, 1, TData(0), Y, 1); \
} \
}
DELEGATE_SCALE(float, float, cblas_sscal, cblas_saxpby)
DELEGATE_SCALE(double, double, cblas_dscal, cblas_daxpby)
DELEGATE_SCALE(float, double, cblas_dscal, cblas_daxpby)
#undef DELEGATE_SCALE
#else // CAFFE2_USE_MKL
#define DELEGATE_SCALE(TAlpha, TData, BLASFunc) \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (Y == X) { \
BLASFunc(N, static_cast<TData>(alpha), Y, 1); \
} else { \
EigenVectorArrayMap<TData>(Y, N) = \
ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \
} \
} \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha* alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (Y == X) { \
BLASFunc(N, static_cast<TData>(*alpha), Y, 1); \
} else { \
EigenVectorArrayMap<TData>(Y, N) = \
ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \
} \
}
DELEGATE_SCALE(float, float, cblas_sscal)
DELEGATE_SCALE(double, double, cblas_dscal)
DELEGATE_SCALE(float, double, cblas_dscal)
#undef DELEGATE_SCALE
#endif // CAFFE2_USE_MKL
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
////////////////////////////////////////////////////////////////////////////////
// Common math functions being used in Caffe that do not have a BLAS or MKL
// equivalent. For all these functions, we will simply implement them either via
// Eigen or via custom code.
////////////////////////////////////////////////////////////////////////////////
#define CAFFE2_SPECIALIZED_SET(T) \
template <> \
C10_EXPORT void Set<T, CPUContext>( \
const int N, const T alpha, T* Y, CPUContext* /* context */) { \
if (N == 0) { \
return; \
} \
if (alpha == T(0)) { \
std::memset(Y, 0, N * sizeof(T)); \
} else { \
EigenVectorArrayMap<T>(Y, N).setConstant(alpha); \
} \
}
CAFFE2_SPECIALIZED_SET(float)
CAFFE2_SPECIALIZED_SET(double)
CAFFE2_SPECIALIZED_SET(int)
CAFFE2_SPECIALIZED_SET(std::int8_t)
CAFFE2_SPECIALIZED_SET(std::int16_t)
CAFFE2_SPECIALIZED_SET(std::int64_t)
CAFFE2_SPECIALIZED_SET(bool)
CAFFE2_SPECIALIZED_SET(char)
CAFFE2_SPECIALIZED_SET(std::uint8_t)
CAFFE2_SPECIALIZED_SET(std::uint16_t)
#undef CAFFE2_SPECIALIZED_SET
#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Func, EigenFunc) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
@ -262,6 +415,39 @@ CAFFE2_SPECIALIZED_NEG(float)
CAFFE2_SPECIALIZED_NEG(double)
#undef CAFFE2_SPECIALIZED_NEG
#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (X == Y) { \
EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(alpha); \
} else { \
EigenVectorArrayMap<TData>(Y, N) = \
ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \
} \
} \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int N, \
const TAlpha* alpha, \
const TData* X, \
TData* Y, \
CPUContext* /* context */) { \
if (X == Y) { \
EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(*alpha); \
} else { \
EigenVectorArrayMap<TData>(Y, N) = \
ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \
} \
}
CAFFE2_SPECIALIZED_SCALE(std::int32_t, std::int32_t)
CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t)
#undef CAFFE2_SPECIALIZED_SCALE
#define DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(T, Func, EigenOp) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
@ -286,8 +472,12 @@ DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, Div, /)
EigenVectorMap<T>(C, N) = ConstEigenVectorArrayMap<T>(A, N).EigenFunc( \
ConstEigenVectorArrayMap<T>(B, N)); \
}
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int32_t, Min, min)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int64_t, Min, min)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(float, Min, min)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(double, Min, min)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int32_t, Max, max)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int64_t, Max, max)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(float, Max, max)
DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(double, Max, max)
#undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION

View File

@ -56,6 +56,19 @@ CAFFE2_API void Inv(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
CAFFE2_API void Erf(int N, const T* X, T* Y, Context* context);
template <typename T, class Context>
CAFFE2_API void Set(int N, T alpha, T* X, Context* context);
template <typename TAlpha, typename TData, class Context>
CAFFE2_API void
Scale(int N, TAlpha alpha, const TData* X, TData* Y, Context* context);
// Different from the Scale function above, if alpha is passed in as a pointer,
// we will assume that it lives on the Context device, for example on GPU.
template <typename TAlpha, typename TData, class Context>
CAFFE2_API void
Scale(int N, const TAlpha* alpha, const TData* X, TData* Y, Context* context);
template <typename T, class Context>
CAFFE2_API void Add(int N, const T* A, const T* B, T* C, Context* context);
template <typename T, class Context>
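The two Scale overloads declared above differ only in where alpha lives; a minimal CPU sketch (include paths assumed; on CUDA the pointer form expects alpha in device memory, per the comment above):

#include <array>
#include "caffe2/core/context.h"
#include "caffe2/utils/math.h"

void Example(caffe2::CPUContext* context) {
  std::array<float, 4> x = {1.0f, 2.0f, 3.0f, 4.0f};
  std::array<float, 4> y;

  // Host scalar alpha.
  caffe2::math::Scale<float, float, caffe2::CPUContext>(
      4, 2.0f, x.data(), y.data(), context);  // y == {2, 4, 6, 8}

  // Pointer alpha: assumed to live on the Context's device (host for CPU).
  const float alpha = 0.5f;
  caffe2::math::Scale<float, float, caffe2::CPUContext>(
      4, &alpha, x.data(), y.data(), context);  // y == {0.5, 1, 1.5, 2}
}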

View File

@ -6,8 +6,17 @@
#include <numeric>
#include <vector>
#ifdef CAFFE2_USE_ACCELERATE
#include <Accelerate/Accelerate.h>
#endif // CAFFE2_USE_ACCELERATE
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL
#include "caffe2/core/context.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math/elementwise.h"
#include "caffe2/utils/math/utils.h"
namespace caffe2 {
@ -15,9 +24,385 @@ namespace math {
namespace {
#define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenFunc) \
template <typename T> \
void Rowwise##Func( \
const int rows, \
const int cols, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* /* context */) { \
EigenVectorMap<T>(Y, rows) = \
ConstEigenMatrixMap<T>(X, cols, rows).colwise().EigenFunc() * alpha; \
}
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceSum, sum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMean, mean)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm)
#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
#ifndef CAFFE2_USE_EIGEN_FOR_BLAS
#define DELEGATE_ROWWISE_REDUCE_FUNCTION(T, Func, BLASFunc) \
template <> \
void Rowwise##Func( \
const int rows, \
const int cols, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* /* context */) { \
for (int i = 0; i < rows; ++i) { \
Y[i] = BLASFunc(cols, X + i * cols, 1) * alpha; \
} \
}
DELEGATE_ROWWISE_REDUCE_FUNCTION(float, ReduceL1, cblas_sasum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(double, ReduceL1, cblas_dasum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(float, ReduceL2, cblas_snrm2)
DELEGATE_ROWWISE_REDUCE_FUNCTION(double, ReduceL2, cblas_dnrm2)
#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
#define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, MathFunc) \
template <typename T> \
void Colwise##Func( \
const int rows, \
const int cols, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
std::memcpy(Y, X, sizeof(T) * cols); \
for (int i = 1; i < rows; ++i) { \
MathFunc<T, CPUContext>(cols, Y, X + i * cols, Y, context); \
} \
Scale<T, T, CPUContext>(cols, alpha, Y, Y, context); \
}
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMin, Min)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMax, Max)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceSum, Add)
#undef DELEGATE_COLWISE_REDUCE_FUNCTION
template <typename T>
C10_EXPORT void
RowwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
void ColwiseReduceMean(
const int rows,
const int cols,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ColwiseReduceSum<T>(rows, cols, alpha / static_cast<T>(rows), X, Y, context);
}
template <typename T>
void ColwiseReduceL1(
const int rows,
const int cols,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ConstEigenArrayMap<T> X_arr(X, cols, rows);
EigenVectorArrayMap<T> Y_arr(Y, cols);
Y_arr = X_arr.col(0).abs();
for (int i = 1; i < rows; ++i) {
Y_arr += X_arr.col(i).abs();
}
Scale<T, T, CPUContext>(cols, alpha, Y, Y, context);
}
template <typename T>
void ColwiseReduceL2(
const int rows,
const int cols,
const T alpha,
const T* X,
T* Y,
CPUContext* /* context */) {
ConstEigenArrayMap<T> X_arr(X, cols, rows);
EigenVectorArrayMap<T> Y_arr(Y, cols);
Y_arr = X_arr.col(0).square();
for (int i = 1; i < rows; ++i) {
Y_arr += X_arr.col(i).square();
}
Y_arr = Y_arr.sqrt() * alpha;
}
template <typename T>
void BothEndsReduceMin(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorArrayMap<T> Y_arr(Y, N);
Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().minCoeff();
for (int i = 1; i < M; ++i) {
ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
for (int j = 0; j < N; ++j) {
Y[j] = std::min(Y[j], X_arr.col(j).minCoeff());
}
}
Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceMax(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorArrayMap<T> Y_arr(Y, N);
Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().maxCoeff();
for (int i = 1; i < M; ++i) {
ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
for (int j = 0; j < N; ++j) {
Y[j] = std::max(Y[j], X_arr.col(j).maxCoeff());
}
}
Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceSum(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorArrayMap<T> Y_arr(Y, N);
Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().sum();
for (int i = 1; i < M; ++i) {
Y_arr += ConstEigenArrayMap<T>(X + i * N * K, K, N).colwise().sum();
}
Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceMean(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorArrayMap<T> Y_arr(Y, N);
Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().mean();
for (int i = 1; i < M; ++i) {
Y_arr += ConstEigenArrayMap<T>(X + i * N * K, K, N).colwise().mean();
}
Scale<T, T, CPUContext>(N, alpha / static_cast<T>(M), Y, Y, context);
}
template <typename T>
void BothEndsReduceL1(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorMap<T> Y_vec(Y, N);
Y_vec = ConstEigenMatrixMap<T>(X, K, N).colwise().template lpNorm<1>();
for (int i = 1; i < M; ++i) {
Y_vec += ConstEigenMatrixMap<T>(X + i * N * K, K, N)
.colwise()
.template lpNorm<1>();
}
Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceL2(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* /* context */) {
EigenVectorMap<T> Y_vec(Y, N);
Y_vec = ConstEigenMatrixMap<T>(X, K, N).colwise().squaredNorm();
for (int i = 1; i < M; ++i) {
Y_vec +=
ConstEigenMatrixMap<T>(X + i * N * K, K, N).colwise().squaredNorm();
}
Y_vec = Y_vec.cwiseSqrt() * alpha;
}
template <typename T, class Reducer>
void ReduceTensorImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const Reducer& reducer,
const T init,
const T* X,
T* Y,
CPUContext* context) {
const int X_size =
std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Set<T, CPUContext>(Y_size, init, Y, context);
std::vector<int> index(ndim, 0);
for (int X_index = 0; X_index < X_size; ++X_index) {
const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data());
Y[Y_index] = reducer(Y[Y_index], X[X_index]);
utils::IncreaseIndexInDims(ndim, X_dims, index.data());
}
}
template <typename T>
void ReduceMinImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ReduceTensorImpl(
ndim,
X_dims,
Y_dims,
[](const T a, const T b) { return std::min(a, b); },
std::numeric_limits<T>::max(),
X,
Y,
context);
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceMaxImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ReduceTensorImpl(
ndim,
X_dims,
Y_dims,
[](const T a, const T b) { return std::max(a, b); },
std::numeric_limits<T>::lowest(),
X,
Y,
context);
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceSumImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ReduceTensorImpl(ndim, X_dims, Y_dims, std::plus<T>(), T(0), X, Y, context);
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceMeanImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ReduceTensorImpl(ndim, X_dims, Y_dims, std::plus<T>(), T(0), X, Y, context);
const int X_size =
std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Scale<T, T, CPUContext>(
Y_size,
alpha * static_cast<T>(Y_size) / static_cast<T>(X_size),
Y,
Y,
context);
}
template <typename T>
void ReduceL1Impl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ReduceTensorImpl(
ndim,
X_dims,
Y_dims,
[](const T a, const T b) { return a + std::abs(b); },
T(0),
X,
Y,
context);
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceL2Impl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ReduceTensorImpl(
ndim,
X_dims,
Y_dims,
[](const T a, const T b) { return a + b * b; },
T(0),
X,
Y,
context);
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
EigenVectorArrayMap<T> Y_arr(Y, Y_size);
Y_arr = Y_arr.sqrt() * alpha;
}
template <typename T>
void RowwiseMoments(
const int rows,
const int cols,
const T* X,
T* mean,
T* var) {
ConstEigenArrayMap<T> X_arr(X, cols, rows);
EigenVectorArrayMap<T> mean_arr(mean, rows);
EigenVectorArrayMap<T> var_arr(var, rows);
@ -26,15 +411,19 @@ RowwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
}
template <typename T>
C10_EXPORT void
ColwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
std::memset(mean, 0, sizeof(T) * cols);
std::memset(var, 0, sizeof(T) * cols);
void ColwiseMoments(
const int rows,
const int cols,
const T* X,
T* mean,
T* var) {
ConstEigenArrayMap<T> X_arr(X, cols, rows);
EigenVectorArrayMap<T> mean_arr(mean, cols);
EigenVectorArrayMap<T> var_arr(var, cols);
// Eigen rowwise reduction is about 10 times slower than this for-loop.
for (int i = 0; i < rows; ++i) {
mean_arr = X_arr.col(0);
var_arr = X_arr.col(0).square();
for (int i = 1; i < rows; ++i) {
mean_arr += X_arr.col(i);
var_arr += X_arr.col(i).square();
}
@ -44,32 +433,30 @@ ColwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
}
template <typename T>
C10_EXPORT void BothEndsMoments(
const int pre,
const int mid,
const int nxt,
void BothEndsMoments(
const int M,
const int N,
const int K,
const T* X,
T* mean,
T* var) {
std::memset(mean, 0, sizeof(T) * mid);
std::memset(var, 0, sizeof(T) * mid);
EigenVectorArrayMap<T> mean_arr(mean, mid);
EigenVectorArrayMap<T> var_arr(var, mid);
ConstEigenArrayMap<T> X_arr(X, nxt, pre * mid);
for (int i = 0; i < pre; ++i) {
for (int j = 0; j < mid; ++j) {
const int c = i * mid + j;
mean_arr(j) += X_arr.col(c).sum();
var_arr(j) += X_arr.col(c).square().sum();
}
EigenVectorArrayMap<T> mean_arr(mean, N);
EigenVectorArrayMap<T> var_arr(var, N);
ConstEigenArrayMap<T> X0_arr(X, K, N);
mean_arr = X0_arr.colwise().sum();
var_arr = X0_arr.square().colwise().sum();
for (int i = 1; i < M; ++i) {
ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
mean_arr += X_arr.colwise().sum();
var_arr += X_arr.square().colwise().sum();
}
const T scale = T(1) / static_cast<T>(pre * nxt);
const T scale = T(1) / static_cast<T>(M * K);
mean_arr *= scale;
var_arr = var_arr * scale - mean_arr.square();
}
template <typename T>
C10_EXPORT void MomentsImpl(
void MomentsImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
@ -126,6 +513,128 @@ C10_EXPORT void MomentsImpl(
} // namespace
#define DELEGATE_GLOBAL_REDUCE_FUNCTION(T, Func, EigenFunc) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
const int N, \
const T* X, \
T* Y, \
Tensor* /* scratch_ptr */, \
CPUContext* /* context */) { \
*Y = ConstEigenVectorArrayMap<T>(X, N).EigenFunc(); \
}
DELEGATE_GLOBAL_REDUCE_FUNCTION(float, ReduceMin, minCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int32_t, ReduceMin, minCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int64_t, ReduceMin, minCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(float, ReduceMax, maxCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int32_t, ReduceMax, maxCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int64_t, ReduceMax, maxCoeff)
#undef DELEGATE_GLOBAL_REDUCE_FUNCTION
#define DELEGATE_REDUCE_FUNCTION(T, Func, kInit, kIsNorm) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
const int ndim, \
const int* X_dims, \
const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
const int X_size = \
std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>()); \
const int Y_size = \
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>()); \
if (X_size == 0) { \
Set<T, CPUContext>(Y_size, alpha * kInit, Y, context); \
return; \
} \
if (alpha == T(0)) { \
std::memset(Y, 0, sizeof(T) * Y_size); \
return; \
} \
if (std::equal(X_dims, X_dims + ndim, Y_dims)) { \
if (kIsNorm) { \
EigenVectorArrayMap<T>(Y, Y_size) = \
ConstEigenVectorArrayMap<T>(X, X_size).abs() * alpha; \
} else { \
Scale<T, T, CPUContext>(Y_size, alpha, X, Y, context); \
} \
return; \
} \
int rows; \
int cols; \
if (utils::IsRowwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) { \
Rowwise##Func<T>(rows, cols, alpha, X, Y, context); \
return; \
} \
if (utils::IsColwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) { \
Colwise##Func<T>(rows, cols, alpha, X, Y, context); \
return; \
} \
int M; \
int N; \
int K; \
if (utils::IsBothEndsReduce(ndim, X_dims, Y_dims, &M, &N, &K)) { \
BothEnds##Func<T>(M, N, K, alpha, X, Y, context); \
return; \
} \
Func##Impl<T>(ndim, X_dims, Y_dims, alpha, X, Y, context); \
}
DELEGATE_REDUCE_FUNCTION(
float,
ReduceMin,
std::numeric_limits<float>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
double,
ReduceMin,
std::numeric_limits<double>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceMin,
std::numeric_limits<std::int32_t>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceMin,
std::numeric_limits<std::int64_t>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
float,
ReduceMax,
std::numeric_limits<float>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
double,
ReduceMax,
std::numeric_limits<double>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceMax,
std::numeric_limits<std::int32_t>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceMax,
std::numeric_limits<std::int64_t>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(float, ReduceSum, 0.0f, false)
DELEGATE_REDUCE_FUNCTION(double, ReduceSum, 0.0, false)
DELEGATE_REDUCE_FUNCTION(std::int32_t, ReduceSum, 0, false)
DELEGATE_REDUCE_FUNCTION(std::int64_t, ReduceSum, 0LL, false)
DELEGATE_REDUCE_FUNCTION(float, ReduceMean, 0.0f, false)
DELEGATE_REDUCE_FUNCTION(double, ReduceMean, 0.0, false)
DELEGATE_REDUCE_FUNCTION(float, ReduceL1, 0.0f, true)
DELEGATE_REDUCE_FUNCTION(double, ReduceL1, 0.0, true)
DELEGATE_REDUCE_FUNCTION(std::int32_t, ReduceL1, 0, true)
DELEGATE_REDUCE_FUNCTION(std::int64_t, ReduceL1, 0LL, true)
DELEGATE_REDUCE_FUNCTION(float, ReduceL2, 0.0f, true)
DELEGATE_REDUCE_FUNCTION(double, ReduceL2, 0.0, true)
#undef DELEGATE_REDUCE_FUNCTION
#define CAFFE2_SPECIALIZED_MOMENTS(T) \
template <> \
C10_EXPORT void Moments<T, CPUContext>( \
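The generic fallback at the bottom of the dispatch (ReduceTensorImpl) walks X linearly and accumulates into Y by re-deriving the output index from a running multi-index; dims of 1 in Y_dims are what collapse the reduced axes. A standalone sketch of that idea, assuming row-major layout; the helpers below are simplified stand-ins for utils::GetIndexFromDims / IncreaseIndexInDims, not the library's code.

#include <algorithm>
#include <functional>
#include <numeric>
#include <vector>

// Row-major linear index of a multi-index under dims; dims of 1 contribute 0.
static int IndexFromDims(const int ndim, const int* dims, const int* index) {
  int linear = 0;
  for (int i = 0; i < ndim; ++i) {
    linear = linear * dims[i] + (dims[i] == 1 ? 0 : index[i]);
  }
  return linear;
}

// Advance a multi-index to the next position in row-major order over dims.
static void NextIndex(const int ndim, const int* dims, int* index) {
  for (int i = ndim - 1; i >= 0; --i) {
    if (++index[i] < dims[i]) {
      return;
    }
    index[i] = 0;
  }
}

// Generic sum-reduction of X (shape X_dims) into Y (shape Y_dims).
void ReduceSumSketch(
    const int ndim, const int* X_dims, const int* Y_dims,
    const float* X, float* Y) {
  const int X_size =
      std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
  const int Y_size =
      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
  std::fill(Y, Y + Y_size, 0.0f);
  std::vector<int> index(ndim, 0);
  for (int X_index = 0; X_index < X_size; ++X_index) {
    Y[IndexFromDims(ndim, Y_dims, index.data())] += X[X_index];
    NextIndex(ndim, X_dims, index.data());
  }
}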

View File

@ -5,8 +5,90 @@
#include "caffe2/core/types.h"
namespace caffe2 {
class Tensor;
namespace math {
template <typename T, class Context>
CAFFE2_API void
ReduceMin(const int N, const T* X, T* y, Tensor* scratch_ptr, Context* context);
template <typename T, class Context>
CAFFE2_API void
ReduceMax(const int N, const T* X, T* y, Tensor* scratch_ptr, Context* context);
// In all of the reduce functions, X_dims and Y_dims should have ndim elements.
// Each dimension of Y_dims must match the corresponding dimension of X_dims or
// must be equal to 1. The dimensions equal to 1 indicate the dimensions of X to
// be reduced.
// Y = alpha * ReduceMin(X)
template <typename T, class Context>
CAFFE2_API void ReduceMin(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
Context* context);
// Y = alpha * ReduceMax(X)
template <typename T, class Context>
CAFFE2_API void ReduceMax(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
Context* context);
// Y = alpha * ReduceSum(X)
template <typename T, class Context>
CAFFE2_API void ReduceSum(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
Context* context);
// Y = alpha * ReduceMean(X)
template <typename T, class Context>
CAFFE2_API void ReduceMean(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
Context* context);
// Y = alpha * ReduceL1(X)
template <typename T, class Context>
CAFFE2_API void ReduceL1(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
Context* context);
// Y = alpha * ReduceL2(X)
template <typename T, class Context>
CAFFE2_API void ReduceL2(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
Context* context);
// Computes mean and variance over axes.
template <typename T, class Context>
CAFFE2_API void Moments(
@ -19,6 +101,7 @@ CAFFE2_API void Moments(
Context* context);
} // namespace math
} // namespace caffe2
#endif // CAFFE2_UTILS_MATH_REDUCE_H_
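A usage sketch of the documented contract (hypothetical values; include paths assumed): reducing a 2x3x4 tensor over its middle axis means passing Y_dims = {2, 1, 4}, and Y must hold 2 * 1 * 4 elements.

#include <array>
#include "caffe2/core/context.h"
#include "caffe2/utils/math/reduce.h"

void Example(caffe2::CPUContext* context) {
  const std::array<int, 3> X_dims = {2, 3, 4};
  const std::array<int, 3> Y_dims = {2, 1, 4};  // axis 1 is reduced
  std::array<float, 24> X;
  X.fill(1.0f);
  std::array<float, 8> Y;  // 2 * 1 * 4 outputs
  caffe2::math::ReduceMean<float, caffe2::CPUContext>(
      3, X_dims.data(), Y_dims.data(), 1.0f, X.data(), Y.data(), context);
  // Every element of Y is 1.0f: the mean over the reduced axis, scaled by alpha.
}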

View File

@ -375,40 +375,6 @@ C10_EXPORT void Gemv<float, CPUContext>(
cblas_sgemv(CblasRowMajor, trans_A, M, N, alpha, A, N, x, 1, beta, y, 1);
}
#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData, prefix) \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int n, \
const TAlpha alpha, \
const TData* x, \
TData* y, \
CPUContext*) { \
if (y != x) { \
cblas_##prefix##copy(n, x, 1, y, 1); \
} \
if (alpha != TAlpha(1)) { \
cblas_##prefix##scal(n, static_cast<TData>(alpha), y, 1); \
} \
} \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int n, \
const TAlpha* alpha, \
const TData* x, \
TData* y, \
CPUContext*) { \
if (y != x) { \
cblas_##prefix##copy(n, x, 1, y, 1); \
} \
if (*alpha != TAlpha(1)) { \
cblas_##prefix##scal(n, static_cast<TData>(*alpha), y, 1); \
} \
}
CAFFE2_SPECIALIZED_SCALE(float, float, s)
CAFFE2_SPECIALIZED_SCALE(double, double, d)
CAFFE2_SPECIALIZED_SCALE(float, double, d)
#undef CAFFE2_SPECIALIZED_SCALE
#define CAFFE2_SPECIALIZED_DOT(T, prefix) \
template <> \
C10_EXPORT void Dot<T, CPUContext>( \
@ -486,36 +452,6 @@ CAFFE2_SPECIALIZED_AXPBY(float, s)
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int n, \
const TAlpha alpha, \
const TData* x, \
TData* y, \
CPUContext* /* context */) { \
EigenVectorMap<TData>(y, n) = \
ConstEigenVectorMap<TData>(x, n) * static_cast<TData>(alpha); \
} \
template <> \
C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \
const int n, \
const TAlpha* alpha, \
const TData* x, \
TData* y, \
CPUContext* /* context */) { \
EigenVectorMap<TData>(y, n) = \
ConstEigenVectorMap<TData>(x, n) * static_cast<TData>(*alpha); \
}
#ifdef CAFFE2_USE_EIGEN_FOR_BLAS
CAFFE2_SPECIALIZED_SCALE(float, float)
CAFFE2_SPECIALIZED_SCALE(double, double)
CAFFE2_SPECIALIZED_SCALE(float, double)
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
CAFFE2_SPECIALIZED_SCALE(std::int32_t, std::int32_t)
CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t)
#undef CAFFE2_SPECIALIZED_SCALE
template <>
C10_EXPORT void GemmBatched<float, CPUContext>(
const CBLAS_TRANSPOSE trans_A,
@ -628,563 +564,6 @@ C10_EXPORT void GemmStridedBatched<float, CPUContext>(
// Eigen or via custom code.
////////////////////////////////////////////////////////////////////////////////
#define CAFFE2_SPECIALIZED_SET(T) \
template <> \
C10_EXPORT void Set<T, CPUContext>( \
const size_t N, const T alpha, T* Y, CPUContext*) { \
if (N == 0) { \
return; \
} \
if (alpha == (T)0) { \
if (Y != nullptr) { \
std::memset(Y, 0, N * sizeof(T)); \
} \
} else { \
EigenVectorMap<T>(Y, N).setConstant(alpha); \
} \
}
CAFFE2_SPECIALIZED_SET(float);
CAFFE2_SPECIALIZED_SET(double);
CAFFE2_SPECIALIZED_SET(int8_t);
CAFFE2_SPECIALIZED_SET(int16_t);
CAFFE2_SPECIALIZED_SET(int);
CAFFE2_SPECIALIZED_SET(int64_t);
CAFFE2_SPECIALIZED_SET(bool);
CAFFE2_SPECIALIZED_SET(char);
CAFFE2_SPECIALIZED_SET(uint8_t);
CAFFE2_SPECIALIZED_SET(uint16_t);
#undef CAFFE2_SPECIALIZED_SET
#define CAFFE2_SPECIALIZED_REDUCEMIN(T) \
template <> \
C10_EXPORT void ReduceMin<T, CPUContext>( \
const int N, \
const T* x, \
T* y, \
Tensor* /*scratch_ptr*/, \
CPUContext* /*context*/) { \
*y = ConstEigenVectorArrayMap<T>(x, N).minCoeff(); \
}
CAFFE2_SPECIALIZED_REDUCEMIN(float)
#undef CAFFE2_SPECIALIZED_REDUCEMIN
#define CAFFE2_SPECIALIZED_REDUCEMAX(T) \
template <> \
C10_EXPORT void ReduceMax<T, CPUContext>( \
const int N, \
const T* x, \
T* y, \
Tensor* /*scratch_ptr*/, \
CPUContext* /*context*/) { \
*y = ConstEigenVectorArrayMap<T>(x, N).maxCoeff(); \
}
CAFFE2_SPECIALIZED_REDUCEMAX(float)
CAFFE2_SPECIALIZED_REDUCEMAX(int32_t)
CAFFE2_SPECIALIZED_REDUCEMAX(int64_t)
#undef CAFFE2_SPECIALIZED_REDUCEMAX
namespace {
template <typename T>
struct MinFunctor {
inline T operator()(const T a, const T b) const {
return std::min(a, b);
}
};
template <typename T>
struct MaxFunctor {
inline T operator()(const T a, const T b) const {
return std::max(a, b);
}
};
template <typename T>
struct L1NormFunctor {
inline T operator()(const T a, const T b) const {
return a + std::abs(b);
}
};
template <typename T>
struct SquaredL2NormFunctor {
inline T operator()(const T a, const T b) const {
return a + b * b;
}
};
#define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenOp) \
template <typename T> \
C10_EXPORT void Rowwise##Func( \
const int rows, const int cols, const T alpha, const T* X, T* Y) { \
EigenVectorMap<T>(Y, rows) = \
ConstEigenMatrixMap<T>(X, cols, rows).colwise().EigenOp() * alpha; \
}
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceSum, sum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMean, mean)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>);
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm)
#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
#define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, EigenOp) \
template <typename T> \
C10_EXPORT void Colwise##Func( \
const int rows, const int cols, const T alpha, const T* X, T* Y) { \
EigenVectorMap<T>(Y, cols) = \
ConstEigenMatrixMap<T>(X, cols, rows).rowwise().EigenOp() * alpha; \
}
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceSum, sum)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMean, mean)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>);
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL2, norm)
#undef DELEGATE_COLWISE_REDUCE_FUNCTION
template <typename T>
C10_EXPORT void BothEndsReduceMin(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().minCoeff();
const T* X_ptr = X + mid * nxt;
// There seems to be a bug in Eigen's Array::min(), so this cannot be
// implemented the same way as BothEndsReduceSum below.
for (int i = 1; i < pre; ++i) {
for (int j = 0; j < mid; ++j) {
Y[j] = std::min(Y[j], ConstEigenVectorArrayMap<T>(X_ptr, nxt).minCoeff());
X_ptr += nxt;
}
}
if (alpha != T(1)) {
Y_arr *= alpha;
}
}
template <typename T>
C10_EXPORT void BothEndsReduceMax(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().maxCoeff();
const T* X_ptr = X + mid * nxt;
for (int i = 1; i < pre; ++i) {
for (int j = 0; j < mid; ++j) {
Y[j] = std::max(Y[j], ConstEigenVectorArrayMap<T>(X_ptr, nxt).maxCoeff());
X_ptr += nxt;
}
}
if (alpha != T(1)) {
Y_arr *= alpha;
}
}
template <typename T>
C10_EXPORT void BothEndsReduceSum(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().sum();
const int stride = mid * nxt;
const T* X_ptr = X + stride;
for (int i = 1; i < pre; ++i) {
Y_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().sum();
X_ptr += stride;
}
if (alpha != T(1)) {
Y_arr *= alpha;
}
}
template <typename T>
C10_EXPORT void BothEndsReduceMean(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().mean();
const int stride = mid * nxt;
const T* X_ptr = X + stride;
for (int i = 1; i < pre; ++i) {
Y_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().mean();
X_ptr += stride;
}
if (alpha / static_cast<T>(pre) != 1) {
Y_arr *= alpha / static_cast<T>(pre);
}
}
template <typename T>
C10_EXPORT void BothEndsReduceL1(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenMatrixMap<T>(X, nxt, mid)
.colwise()
.template lpNorm<1>()
.array();
const int stride = mid * nxt;
const T* X_ptr = X + stride;
for (int i = 1; i < pre; ++i) {
Y_arr += ConstEigenMatrixMap<T>(X_ptr, nxt, mid)
.colwise()
.template lpNorm<1>()
.array();
X_ptr += stride;
}
if (alpha != T(1)) {
Y_arr *= alpha;
}
}
template <typename T>
C10_EXPORT void BothEndsReduceL2(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenMatrixMap<T>(X, nxt, mid).colwise().squaredNorm().array();
const int stride = mid * nxt;
const T* X_ptr = X + stride;
for (int i = 1; i < pre; ++i) {
Y_arr +=
ConstEigenMatrixMap<T>(X_ptr, nxt, mid).colwise().squaredNorm().array();
X_ptr += stride;
}
Y_arr = Y_arr.sqrt() * alpha;
}
template <typename T, class Reducer>
C10_EXPORT void ReduceTensor(
const int ndim,
const int* X_dims,
const int* Y_dims,
const Reducer& reducer,
const T init,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
const int X_size =
std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Set<T, CPUContext>(Y_size, init, Y, context);
std::vector<int> index(ndim, 0);
for (int X_index = 0; X_index < X_size; ++X_index) {
const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data());
Y[Y_index] = reducer(Y[Y_index], X[X_index]);
utils::IncreaseIndexInDims(ndim, X_dims, index.data());
}
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
} // namespace
#define DELEGATE_REDUCE_FUNCTION(T, Func, reducer, init, is_norm) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
CAFFE_ENFORCE_LE(num_axes, num_dims); \
std::vector<int> Y_dims_vector(dims, dims + num_dims); \
for (int i = 0; i < num_axes; ++i) { \
Y_dims_vector[axes[i]] = 1; \
} \
const int* X_dims = dims; \
const int* Y_dims = Y_dims_vector.data(); \
const int X_size = \
std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
const int Y_size = \
std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
if (X_size == 0) { \
Set<T, CPUContext>(Y_size, alpha * init, Y, context); \
return; \
} \
if (alpha == T(0)) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
if (is_norm) { \
Abs<T, CPUContext>(X_size, X, Y, context); \
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
} else { \
Scale<T, T, CPUContext>(Y_size, alpha, X, Y, context); \
} \
return; \
} \
int rows; \
int cols; \
if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
Rowwise##Func<T>(rows, cols, alpha, X, Y); \
return; \
} \
if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
Colwise##Func<T>(rows, cols, alpha, X, Y); \
return; \
} \
int pre; \
int mid; \
int nxt; \
if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
BothEnds##Func<T>(pre, mid, nxt, alpha, X, Y); \
return; \
} \
ReduceTensor( \
num_dims, X_dims, Y_dims, reducer, init, alpha, X, Y, context); \
}
DELEGATE_REDUCE_FUNCTION(
float,
ReduceMin,
MinFunctor<float>(),
std::numeric_limits<float>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
double,
ReduceMin,
MinFunctor<double>(),
std::numeric_limits<double>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceMin,
MinFunctor<std::int32_t>(),
std::numeric_limits<std::int32_t>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceMin,
MinFunctor<std::int64_t>(),
std::numeric_limits<std::int64_t>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
float,
ReduceMax,
MaxFunctor<float>(),
std::numeric_limits<float>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
double,
ReduceMax,
MaxFunctor<double>(),
std::numeric_limits<double>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceMax,
MaxFunctor<std::int32_t>(),
std::numeric_limits<std::int32_t>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceMax,
MaxFunctor<std::int64_t>(),
std::numeric_limits<std::int64_t>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(float, ReduceSum, std::plus<float>(), 0.0f, false)
DELEGATE_REDUCE_FUNCTION(double, ReduceSum, std::plus<double>(), 0.0, false)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceSum,
std::plus<std::int32_t>(),
0,
false)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceSum,
std::plus<std::int64_t>(),
std::int64_t(0),
false)
DELEGATE_REDUCE_FUNCTION(float, ReduceL1, L1NormFunctor<float>(), 0.0f, true)
DELEGATE_REDUCE_FUNCTION(double, ReduceL1, L1NormFunctor<double>(), 0.0, true)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceL1,
L1NormFunctor<std::int32_t>(),
0,
true)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceL1,
L1NormFunctor<std::int64_t>(),
std::int64_t(0),
true)
#undef DELEGATE_REDUCE_FUNCTION
#define CAFFE2_SPECIALIZED_REDUCE_MEAN(T) \
template <> \
C10_EXPORT void ReduceMean<T, CPUContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
CAFFE_ENFORCE_LE(num_axes, num_dims); \
std::vector<int> Y_dims_vector(dims, dims + num_dims); \
for (int i = 0; i < num_axes; ++i) { \
Y_dims_vector[axes[i]] = 1; \
} \
const int* X_dims = dims; \
const int* Y_dims = Y_dims_vector.data(); \
const int X_size = \
std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
const int Y_size = \
std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
if (X_size == 0) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (alpha == T(0)) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
Scale<T, T, CPUContext>(X_size, alpha, X, Y, context); \
return; \
} \
int rows; \
int cols; \
if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
RowwiseReduceMean<T>(rows, cols, alpha, X, Y); \
return; \
} \
if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
ColwiseReduceMean<T>(rows, cols, alpha, X, Y); \
return; \
} \
int pre; \
int mid; \
int nxt; \
if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
BothEndsReduceMean<T>(pre, mid, nxt, alpha, X, Y); \
return; \
} \
const int scale = X_size / Y_size; \
ReduceTensor( \
num_dims, \
X_dims, \
Y_dims, \
std::plus<T>(), \
T(0), \
alpha / static_cast<T>(scale), \
X, \
Y, \
context); \
}
CAFFE2_SPECIALIZED_REDUCE_MEAN(float)
CAFFE2_SPECIALIZED_REDUCE_MEAN(double)
#undef CAFFE2_SPECIALIZED_REDUCE_MEAN
#define CAFFE2_SPECIALIZED_REDUCE_L2(T) \
template <> \
C10_EXPORT void ReduceL2<T, CPUContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
CAFFE_ENFORCE_LE(num_axes, num_dims); \
std::vector<int> Y_dims_vector(dims, dims + num_dims); \
for (int i = 0; i < num_axes; ++i) { \
Y_dims_vector[axes[i]] = 1; \
} \
const int* X_dims = dims; \
const int* Y_dims = Y_dims_vector.data(); \
const int X_size = \
std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
const int Y_size = \
std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
if (X_size == 0) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (alpha == T(0)) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
Abs<T, CPUContext>(X_size, X, Y, context); \
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
return; \
} \
int rows; \
int cols; \
if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
RowwiseReduceL2<T>(rows, cols, alpha, X, Y); \
return; \
} \
if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
ColwiseReduceL2<T>(rows, cols, alpha, X, Y); \
return; \
} \
int pre; \
int mid; \
int nxt; \
if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
BothEndsReduceL2<T>(pre, mid, nxt, alpha, X, Y); \
return; \
} \
ReduceTensor( \
num_dims, \
X_dims, \
Y_dims, \
SquaredL2NormFunctor<T>(), \
T(0), \
T(1), \
X, \
Y, \
context); \
Sqrt<T, CPUContext>(Y_size, Y, Y, context); \
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
}
CAFFE2_SPECIALIZED_REDUCE_L2(float)
CAFFE2_SPECIALIZED_REDUCE_L2(double)
#undef CAFFE2_SPECIALIZED_REDUCE_L2
namespace {
template <typename T>

View File

@ -1482,7 +1482,7 @@ __global__ void SetKernel(const int N, const T alpha, T* Y) {
#define CAFFE2_SPECIALIZED_CUDA_SET(T) \
template <> \
CAFFE2_CUDA_API void Set<T, CUDAContext>( \
const size_t N, const T alpha, T* Y, CUDAContext* context) { \
const int N, const T alpha, T* Y, CUDAContext* context) { \
if (N == 0) { \
return; \
} \
@ -1510,7 +1510,7 @@ CAFFE2_SPECIALIZED_CUDA_SET(uint16_t);
template <>
CAFFE2_CUDA_EXPORT void Set<at::Half, CUDAContext>(
const size_t N,
const int N,
const at::Half alpha,
at::Half* Y,
CUDAContext* context) {
@ -3356,27 +3356,19 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDAImpl(
template <typename T, class Reducer>
CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const int ndim,
const int* X_dims,
const int* Y_dims,
const Reducer& reducer,
const T init,
const T alpha,
const T* X,
T* Y,
CUDAContext* context) {
CAFFE_ENFORCE_LE(num_axes, num_dims);
std::vector<int> Y_dims_vector(dims, dims + num_dims);
for (int i = 0; i < num_axes; ++i) {
Y_dims_vector[axes[i]] = 1;
}
const int* X_dims = dims;
const int* Y_dims = Y_dims_vector.data();
const int X_size =
std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>());
std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
const int Y_size =
std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>());
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
if (X_size == 0) {
Set<T, CUDAContext>(Y_size, alpha * init, Y, context);
return;
@ -3385,13 +3377,13 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
Set<T, CUDAContext>(Y_size, T(0), Y, context);
return;
}
if (std::equal(X_dims, X_dims + num_dims, Y_dims)) {
if (std::equal(X_dims, X_dims + ndim, Y_dims)) {
Scale<T, T, CUDAContext>(X_size, alpha, X, Y, context);
return;
}
int rows;
int cols;
if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {
if (utils::IsRowwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {
RowwiseReduceKernel<T>
<<<std::min(rows, CAFFE_MAXIMUM_NUM_BLOCKS),
CAFFE_CUDA_NUM_THREADS,
@ -3399,7 +3391,7 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
context->cuda_stream()>>>(rows, cols, reducer, init, alpha, X, Y);
return;
}
if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {
if (utils::IsColwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {
ColwiseReduceKernel<T>
<<<std::min(cols, CAFFE_MAXIMUM_NUM_BLOCKS),
CAFFE_CUDA_NUM_THREADS,
@ -3407,20 +3399,19 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
context->cuda_stream()>>>(rows, cols, reducer, init, alpha, X, Y);
return;
}
std::vector<int> transpose_axes(num_dims);
utils::ComputeTransposeAxesForReduceOp(
num_dims, num_axes, axes, transpose_axes.data());
std::vector<int> axes(ndim);
utils::ComputeTransposeAxesForReduceOp(ndim, Y_dims, axes.data());
const int outer_size = Y_size;
const int inner_size = X_size / Y_size;
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(
num_dims,
ndim,
ReduceTensorCUDAImpl,
T,
Reducer,
outer_size,
inner_size,
dims,
transpose_axes.data(),
X_dims,
axes.data(),
reducer,
init,
alpha,
@ -3434,19 +3425,17 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(T) \
template <> \
CAFFE2_CUDA_EXPORT void ReduceMin<T, CUDAContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const int ndim, \
const int* X_dims, \
const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CUDAContext* context) { \
ReduceTensorCUDA( \
num_dims, \
dims, \
num_axes, \
axes, \
ndim, \
X_dims, \
Y_dims, \
cub::Min(), \
std::numeric_limits<T>::max(), \
alpha, \
@ -3463,19 +3452,17 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(double)
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(T) \
template <> \
CAFFE2_CUDA_EXPORT void ReduceMax<T, CUDAContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const int ndim, \
const int* X_dims, \
const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CUDAContext* context) { \
ReduceTensorCUDA( \
num_dims, \
dims, \
num_axes, \
axes, \
ndim, \
X_dims, \
Y_dims, \
cub::Max(), \
std::numeric_limits<T>::lowest(), \
alpha, \
@ -3489,28 +3476,18 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(float)
CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(double)
#undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \
template <> \
CAFFE2_CUDA_EXPORT void ReduceSum<T, CUDAContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const T alpha, \
const T* X, \
T* Y, \
CUDAContext* context) { \
ReduceTensorCUDA( \
num_dims, \
dims, \
num_axes, \
axes, \
cub::Sum(), \
T(0), \
alpha, \
X, \
Y, \
context); \
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \
template <> \
CAFFE2_CUDA_EXPORT void ReduceSum<T, CUDAContext>( \
const int ndim, \
const int* X_dims, \
const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CUDAContext* context) { \
ReduceTensorCUDA( \
ndim, X_dims, Y_dims, cub::Sum(), T(0), alpha, X, Y, context); \
}
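// Illustrative only: a minimal caller-side sketch of the dims-based ReduceSum
// form defined above, reducing the inner axis of a 2 x 3 tensor. The device
// buffers (d_X, d_Y) and the CUDAContext instance are assumed to already
// exist.
//
//   const int X_dims[] = {2, 3};
//   const int Y_dims[] = {2, 1};  // the reduced axis has extent 1
//   ReduceSum<float, CUDAContext>(
//       2, X_dims, Y_dims, 1.0f, d_X, d_Y, &cuda_context);
//   // For X = {1, 2, 3, 4, 5, 6} this yields Y = {6, 15}.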
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int32_t)
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int64_t)
@ -3521,23 +3498,23 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(double)
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(T) \
template <> \
CAFFE2_CUDA_EXPORT void ReduceMean<T, CUDAContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const int ndim, \
const int* X_dims, \
const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CUDAContext* context) { \
int scale = 1; \
for (int i = 0; i < num_axes; ++i) { \
scale *= dims[axes[i]]; \
for (int i = 0; i < ndim; ++i) { \
if (Y_dims[i] == 1) { \
scale *= X_dims[i]; \
} \
} \
ReduceTensorCUDA( \
num_dims, \
dims, \
num_axes, \
axes, \
ndim, \
X_dims, \
Y_dims, \
cub::Sum(), \
T(0), \
alpha / static_cast<T>(scale), \

View File

@ -351,288 +351,6 @@ INSTANTIATE_TEST_CASE_P(
GemmBatchedGPUTest,
testing::Combine(testing::Bool(), testing::Bool()));
class ReduceTensorGPUTest : public testing::Test {
protected:
void SetUp() override {
if (!HasCudaGPU()) {
return;
}
option_.set_device_type(PROTO_CUDA);
cuda_context_ = make_unique<CUDAContext>(option_);
Blob* blob_x = ws_.CreateBlob("X");
Blob* blob_y = ws_.CreateBlob("Y");
X_ = BlobGetMutableTensor(blob_x, CUDA);
Y_ = BlobGetMutableTensor(blob_y, CUDA);
}
void SetUpData(
const std::vector<int>& X_dims,
const std::vector<int>& axes,
const std::vector<float>& X_data) {
std::vector<int> Y_dims = X_dims;
for (const int axis : axes) {
Y_dims[axis] = 1;
}
X_->Resize(X_dims);
Y_->Resize(Y_dims);
ASSERT_EQ(X_data.size(), X_->numel());
cuda_context_->CopyFromCPU<float>(
X_data.size(), X_data.data(), X_->mutable_data<float>());
}
void VerifyResult(const std::vector<float>& expected_output) {
Blob* blob_y_host = ws_.CreateBlob("Y_host");
auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
Y_host->CopyFrom(*Y_);
ASSERT_EQ(expected_output.size(), Y_host->numel());
for (std::size_t i = 0; i < expected_output.size(); ++i) {
EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
}
}
template <class ReduceFunc>
void RunRedcueTensorTest(
const ReduceFunc& reduce_func,
const std::vector<int>& X_dims,
const std::vector<int>& axes,
const std::vector<float>& X_data,
const std::vector<float>& Y_data) {
SetUpData(X_dims, axes, X_data);
reduce_func(
X_dims.size(),
X_dims.data(),
axes.size(),
axes.data(),
1.0f,
X_->data<float>(),
Y_->mutable_data<float>(),
cuda_context_.get());
VerifyResult(Y_data);
}
Workspace ws_;
DeviceOption option_;
std::unique_ptr<CUDAContext> cuda_context_;
Tensor* X_ = nullptr;
Tensor* Y_ = nullptr;
};
TEST_F(ReduceTensorGPUTest, ReduceMinGPUTest) {
if (!HasCudaGPU()) {
return;
}
const auto& reduce_min = [](const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const float alpha,
const float* X,
float* Y,
CUDAContext* context) {
return math::ReduceMin<float, CUDAContext>(
num_dims, dims, num_axes, axes, alpha, X, Y, context);
};
// Test for 1D tensor.
RunRedcueTensorTest(reduce_min, {3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
reduce_min,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{1.0f, 4.0f});
RunRedcueTensorTest(
reduce_min,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{1.0f, 2.0f, 3.0f});
RunRedcueTensorTest(
reduce_min, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f});
// Test for 3D tensor.
RunRedcueTensorTest(
reduce_min,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{1.0f, 5.0f});
RunRedcueTensorTest(
reduce_min,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{1.0f, 2.0f});
RunRedcueTensorTest(
reduce_min,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{1.0f, 3.0f});
}
TEST_F(ReduceTensorGPUTest, ReduceMaxGPUTest) {
if (!HasCudaGPU()) {
return;
}
const auto& reduce_max = [](const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const float alpha,
const float* X,
float* Y,
CUDAContext* context) {
return math::ReduceMax<float, CUDAContext>(
num_dims, dims, num_axes, axes, alpha, X, Y, context);
};
// Test for 1D tensor.
RunRedcueTensorTest(reduce_max, {3}, {0}, {1.0f, 2.0f, 3.0f}, {3.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
reduce_max,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{3.0f, 6.0f});
RunRedcueTensorTest(
reduce_max,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{4.0f, 5.0f, 6.0f});
RunRedcueTensorTest(
reduce_max, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {6.0f});
// Test for 3D tensor.
RunRedcueTensorTest(
reduce_max,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{4.0f, 8.0f});
RunRedcueTensorTest(
reduce_max,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{7.0f, 8.0f});
RunRedcueTensorTest(
reduce_max,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{6.0f, 8.0f});
}
TEST_F(ReduceTensorGPUTest, ReduceSumGPUTest) {
if (!HasCudaGPU()) {
return;
}
// Test for 1D tensor.
RunRedcueTensorTest(
math::ReduceSum<float, CUDAContext>,
{3},
{0},
{1.0f, 2.0f, 3.0f},
{6.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
math::ReduceSum<float, CUDAContext>,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{6.0f, 15.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CUDAContext>,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{5.0f, 7.0f, 9.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CUDAContext>,
{2, 3},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{21.0f});
// Test for 3D tensor.
RunRedcueTensorTest(
math::ReduceSum<float, CUDAContext>,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{10.0f, 26.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CUDAContext>,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{16.0f, 20.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CUDAContext>,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{14.0f, 22.0f});
}
TEST_F(ReduceTensorGPUTest, ReduceMeanGPUTest) {
if (!HasCudaGPU()) {
return;
}
// Test for 1D tensor.
RunRedcueTensorTest(
math::ReduceMean<float, CUDAContext>,
{3},
{0},
{1.0f, 2.0f, 3.0f},
{2.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
math::ReduceMean<float, CUDAContext>,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{2.0f, 5.0f});
RunRedcueTensorTest(
math::ReduceMean<float, CUDAContext>,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{2.5f, 3.5f, 4.5f});
RunRedcueTensorTest(
math::ReduceMean<float, CUDAContext>,
{2, 3},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{3.5f});
// Test for 3D tensor.
RunRedcueTensorTest(
math::ReduceMean<float, CUDAContext>,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{2.5f, 6.5f});
RunRedcueTensorTest(
math::ReduceMean<float, CUDAContext>,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{4.0f, 5.0f});
RunRedcueTensorTest(
math::ReduceMean<float, CUDAContext>,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{3.5f, 5.5f});
}
class BroadcastGPUTest : public testing::Test {
protected:
void SetUp() override {

View File

@ -426,253 +426,6 @@ TEST(MathTest, FloatToHalfConversion) {
namespace {
class ReduceTensorTest : public testing::Test {
protected:
void SetUp() override {
cpu_context_ = make_unique<CPUContext>(option_);
}
template <class ReduceFunc>
void RunRedcueTensorTest(
const ReduceFunc& reduce_func,
const std::vector<int>& X_dims,
const std::vector<int>& axes,
const std::vector<float>& X_data,
const std::vector<float>& Y_data) {
std::vector<int> Y_dims = X_dims;
for (const int axis : axes) {
Y_dims[axis] = 1;
}
std::vector<int64_t> X_dims_64;
std::vector<int64_t> Y_dims_64;
std::copy(X_dims.cbegin(), X_dims.cend(), std::back_inserter(X_dims_64));
std::copy(Y_dims.cbegin(), Y_dims.cend(), std::back_inserter(Y_dims_64));
ReinitializeTensor(&X_, X_dims_64, at::dtype<float>().device(CPU));
ReinitializeTensor(&Y_, Y_dims_64, at::dtype<float>().device(CPU));
ASSERT_EQ(X_data.size(), X_.numel());
cpu_context_->CopyFromCPU<float>(
X_data.size(), X_data.data(), X_.mutable_data<float>());
reduce_func(
X_dims.size(),
X_dims.data(),
axes.size(),
axes.data(),
1.0f,
X_.data<float>(),
Y_.mutable_data<float>(),
cpu_context_.get());
ASSERT_EQ(Y_data.size(), Y_.numel());
for (int i = 0; i < Y_.numel(); ++i) {
EXPECT_FLOAT_EQ(Y_data[i], Y_.data<float>()[i]);
}
}
DeviceOption option_;
std::unique_ptr<CPUContext> cpu_context_;
Tensor X_;
Tensor Y_;
};
TEST_F(ReduceTensorTest, ReduceMinTest) {
const auto& reduce_min = [](const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const float alpha,
const float* X,
float* Y,
CPUContext* context) {
return math::ReduceMin<float, CPUContext>(
num_dims, dims, num_axes, axes, alpha, X, Y, context);
};
// Test for 1D tensor.
RunRedcueTensorTest(reduce_min, {3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
reduce_min,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{1.0f, 4.0f});
RunRedcueTensorTest(
reduce_min,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{1.0f, 2.0f, 3.0f});
RunRedcueTensorTest(
reduce_min, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f});
// Test for 3D tensor.
RunRedcueTensorTest(
reduce_min,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{1.0f, 5.0f});
RunRedcueTensorTest(
reduce_min,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{1.0f, 2.0f});
RunRedcueTensorTest(
reduce_min,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{1.0f, 3.0f});
}
TEST_F(ReduceTensorTest, ReduceMaxTest) {
const auto& reduce_max = [](const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const float alpha,
const float* X,
float* Y,
CPUContext* context) {
return math::ReduceMax<float, CPUContext>(
num_dims, dims, num_axes, axes, alpha, X, Y, context);
};
// Test for 1D tensor.
RunRedcueTensorTest(reduce_max, {3}, {0}, {1.0f, 2.0f, 3.0f}, {3.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
reduce_max,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{3.0f, 6.0f});
RunRedcueTensorTest(
reduce_max,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{4.0f, 5.0f, 6.0f});
RunRedcueTensorTest(
reduce_max, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {6.0f});
// Test for 3D tensor.
RunRedcueTensorTest(
reduce_max,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{4.0f, 8.0f});
RunRedcueTensorTest(
reduce_max,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{7.0f, 8.0f});
RunRedcueTensorTest(
reduce_max,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{6.0f, 8.0f});
}
TEST_F(ReduceTensorTest, ReduceSumTest) {
// Test for 1D tensor.
RunRedcueTensorTest(
math::ReduceSum<float, CPUContext>, {3}, {0}, {1.0f, 2.0f, 3.0f}, {6.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
math::ReduceSum<float, CPUContext>,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{6.0f, 15.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CPUContext>,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{5.0f, 7.0f, 9.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CPUContext>,
{2, 3},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{21.0f});
// Test for 3D tensor.
RunRedcueTensorTest(
math::ReduceSum<float, CPUContext>,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{10.0f, 26.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CPUContext>,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{16.0f, 20.0f});
RunRedcueTensorTest(
math::ReduceSum<float, CPUContext>,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{14.0f, 22.0f});
}
TEST_F(ReduceTensorTest, ReduceMeanTest) {
// Test for 1D tensor.
RunRedcueTensorTest(
math::ReduceMean<float, CPUContext>,
{3},
{0},
{1.0f, 2.0f, 3.0f},
{2.0f});
// Test for 2D Tensor.
RunRedcueTensorTest(
math::ReduceMean<float, CPUContext>,
{2, 3},
{1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{2.0f, 5.0f});
RunRedcueTensorTest(
math::ReduceMean<float, CPUContext>,
{2, 3},
{0},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{2.5f, 3.5f, 4.5f});
RunRedcueTensorTest(
math::ReduceMean<float, CPUContext>,
{2, 3},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
{3.5f});
// Test for 3D tensor.
RunRedcueTensorTest(
math::ReduceMean<float, CPUContext>,
{2, 2, 2},
{1, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{2.5f, 6.5f});
RunRedcueTensorTest(
math::ReduceMean<float, CPUContext>,
{2, 2, 2},
{0, 1},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{4.0f, 5.0f});
RunRedcueTensorTest(
math::ReduceMean<float, CPUContext>,
{2, 2, 2},
{0, 2},
{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
{3.5f, 5.5f});
}
class BroadcastTest : public testing::Test {
protected:
void SetUp() override {