Mirror of https://github.com/zebrajr/pytorch.git (synced 2025-12-06 12:20:52 +01:00)
Separate reduce functions from math (#16929)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/16929

Separate CPU reduce functions from math.

i-am-not-moving-c2-to-c10

Reviewed By: houseroad
Differential Revision: D13999469
fbshipit-source-id: bd628b15a6e3c1f04cc62aefffb0110690e1c0d1
This commit is contained in:
parent 9b7f3da74b
commit 3a34f443c5
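The heart of this change is a calling-convention migration for the CPU reduce functions. The old entry points took the reduced axes explicitly, ReduceSum(num_dims, dims, num_axes, axes, alpha, X, Y, context); the new ones take the output shape instead, ReduceSum(ndim, X_dims, Y_dims, alpha, X, Y, context), where every entry of Y_dims either matches X_dims or is 1 to mark a reduced dimension. A standalone sketch of what the new convention computes (illustrative code only, not the library implementation):

// Sketch: sum-reduce a 2x3 array over its second dimension under the new
// convention, where Y_dims = {2, 1} marks dimension 1 as reduced.
#include <cstdio>

int main() {
  const int X_dims[2] = {2, 3};
  const int Y_dims[2] = {2, 1};
  const float X[6] = {1, 2, 3, 4, 5, 6};
  float Y[2] = {0, 0};
  for (int i = 0; i < X_dims[0]; ++i) {
    for (int j = 0; j < X_dims[1]; ++j) {
      Y[i * Y_dims[1]] += X[i * X_dims[1] + j];  // reduced dim contributes 0
    }
  }
  std::printf("Y = [%g, %g]\n", Y[0], Y[1]);  // alpha = 1: Y = [6, 15]
  return 0;
}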
@@ -14,7 +14,7 @@ OPERATOR_SCHEMA(ATen);
 namespace math {
 template <>
 void Set<at::Half, CPUContext>(
-    const size_t /*N*/,
+    const int /*N*/,
     const at::Half h,
     at::Half* v,
     CPUContext* c) {
@@ -47,15 +47,14 @@ struct AddFunctor {
     const std::vector<int> C_dims =
         elementwise_ops_utils::ComputeBinaryBroadcastForwardDims(
             A_dims, B_dims);
-    std::vector<int> A_axes;
-    std::vector<int> B_axes;
-    elementwise_ops_utils::ComputeBinaryBroadcastBackwardAxes(
-        A_dims, B_dims, &A_axes, &B_axes);
+    std::vector<int> A_back_dims;
+    std::vector<int> B_back_dims;
+    elementwise_ops_utils::ComputeBinaryBroadcastBackwardDims(
+        A_dims, B_dims, &A_back_dims, &B_back_dims);
     math::ReduceSum(
         C_dims.size(),
         C_dims.data(),
-        A_axes.size(),
-        A_axes.data(),
+        A_back_dims.data(),
         TGrad(1),
         dC,
         dA,
@@ -63,8 +62,7 @@ struct AddFunctor {
     math::ReduceSum(
         C_dims.size(),
         C_dims.data(),
-        B_axes.size(),
-        B_axes.data(),
+        B_back_dims.data(),
         TGrad(1),
         dC,
         dB,
@@ -108,5 +108,17 @@ void ComputeBinaryBroadcastBackwardAxes(
   std::reverse(B_axes->begin(), B_axes->end());
 }
 
+void ComputeBinaryBroadcastBackwardDims(
+    const std::vector<int>& A_dims,
+    const std::vector<int>& B_dims,
+    std::vector<int>* A_back_dims,
+    std::vector<int>* B_back_dims) {
+  const int ndim = std::max(A_dims.size(), B_dims.size());
+  A_back_dims->assign(ndim, 1);
+  B_back_dims->assign(ndim, 1);
+  std::copy(A_dims.crbegin(), A_dims.crend(), A_back_dims->rbegin());
+  std::copy(B_dims.crbegin(), B_dims.crend(), B_back_dims->rbegin());
+}
+
 } // namespace elementwise_ops_utils
 } // namespace caffe2
@@ -23,6 +23,12 @@ CAFFE2_API void ComputeBinaryBroadcastBackwardAxes(
     std::vector<int>* A_axes,
     std::vector<int>* B_axes);
 
+CAFFE2_API void ComputeBinaryBroadcastBackwardDims(
+    const std::vector<int>& A_dims,
+    const std::vector<int>& B_dims,
+    std::vector<int>* A_back_dims,
+    std::vector<int>* B_back_dims);
+
 } // namespace elementwise_ops_utils
 } // namespace caffe2
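The new ComputeBinaryBroadcastBackwardDims helper right-aligns both operand shapes against the broadcast output and pads with 1s, so the 1-entries mark exactly the dimensions a gradient must be sum-reduced over. A hand-checked standalone example (same body as the helper declared above):

#include <algorithm>
#include <cstdio>
#include <vector>

void ComputeBinaryBroadcastBackwardDims(
    const std::vector<int>& A_dims,
    const std::vector<int>& B_dims,
    std::vector<int>* A_back_dims,
    std::vector<int>* B_back_dims) {
  const int ndim = std::max(A_dims.size(), B_dims.size());
  A_back_dims->assign(ndim, 1);
  B_back_dims->assign(ndim, 1);
  std::copy(A_dims.crbegin(), A_dims.crend(), A_back_dims->rbegin());
  std::copy(B_dims.crbegin(), B_dims.crend(), B_back_dims->rbegin());
}

int main() {
  std::vector<int> A_back, B_back;
  // A: 2x3x4 broadcast with B: 3x1 gives C: 2x3x4.
  ComputeBinaryBroadcastBackwardDims({2, 3, 4}, {3, 1}, &A_back, &B_back);
  // A_back = {2, 3, 4} (nothing to reduce for dA); B_back = {1, 3, 1},
  // i.e. dB is obtained by sum-reducing dC over dimensions 0 and 2.
  std::printf("%d %d %d\n", B_back[0], B_back[1], B_back[2]);  // 1 3 1
  return 0;
}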
@@ -47,15 +47,14 @@ struct SubFunctor {
     const std::vector<int> C_dims =
         elementwise_ops_utils::ComputeBinaryBroadcastForwardDims(
             A_dims, B_dims);
-    std::vector<int> A_axes;
-    std::vector<int> B_axes;
-    elementwise_ops_utils::ComputeBinaryBroadcastBackwardAxes(
-        A_dims, B_dims, &A_axes, &B_axes);
+    std::vector<int> A_back_dims;
+    std::vector<int> B_back_dims;
+    elementwise_ops_utils::ComputeBinaryBroadcastBackwardDims(
+        A_dims, B_dims, &A_back_dims, &B_back_dims);
     math::ReduceSum(
         C_dims.size(),
         C_dims.data(),
-        A_axes.size(),
-        A_axes.data(),
+        A_back_dims.data(),
         TGrad(1),
         dC,
         dA,
@@ -63,8 +62,7 @@ struct SubFunctor {
     math::ReduceSum(
         C_dims.size(),
         C_dims.data(),
-        B_axes.size(),
-        B_axes.data(),
+        B_back_dims.data(),
         TGrad(-1),
         dC,
         dB,
@@ -94,11 +94,14 @@ class ExpandGradientOp final : public Operator<Context> {
         axes.push_back(i);
       }
     }
+    std::vector<int> X_dims = dY_dims;
+    for (const int axis : axes) {
+      X_dims[axis] = 1;
+    }
     math::ReduceSum<T, Context>(
         dY_dims.size(),
         dY_dims.data(),
-        axes.size(),
-        axes.data(),
+        X_dims.data(),
         T(1),
         dY.template data<T>(),
         dX->template mutable_data<T>(),
@@ -572,10 +572,10 @@ bool AveragePoolFunctor<CPUContext>::
     const float* X,
     float* Y,
     CPUContext* context) const {
-  const std::array<int, 2> dims = {N * C, HxW};
-  const int axis = 1;
+  const std::array<int, 2> X_dims = {N * C, HxW};
+  const std::array<int, 2> Y_dims = {N * C, 1};
   math::ReduceMean<float, CPUContext>(
-      2, dims.data(), 1, &axis, 1.0f, X, Y, context);
+      2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
   return true;
 }
@@ -720,10 +720,10 @@ bool MaxPoolFunctor<CPUContext>::
     const float* X,
     float* Y,
     CPUContext* context) const {
-  const std::array<int, 2> dims = {N * C, HxW};
-  const int axis = 1;
+  const std::array<int, 2> X_dims = {N * C, HxW};
+  const std::array<int, 2> Y_dims = {N * C, 1};
   math::ReduceMax<float, CPUContext>(
-      2, dims.data(), 1, &axis, 1.0f, X, Y, context);
+      2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
   return true;
 }
@@ -698,10 +698,10 @@ bool AveragePoolFunctor<CUDAContext>::
     const float* X,
     float* Y,
     CUDAContext* context) const {
-  const std::array<int, 2> dims = {N * C, HxW};
-  const int axis = 1;
+  const std::array<int, 2> X_dims = {N * C, HxW};
+  const std::array<int, 2> Y_dims = {N * C, 1};
   math::ReduceMean<float, CUDAContext>(
-      2, dims.data(), 1, &axis, 1.0f, X, Y, context);
+      2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
   return true;
 }
@@ -1756,10 +1756,10 @@ bool MaxPoolFunctor<CUDAContext>::
     const float* X,
     float* Y,
     CUDAContext* context) const {
-  const std::array<int, 2> dims = {N * C, HxW};
-  const int axis = 1;
+  const std::array<int, 2> X_dims = {N * C, HxW};
+  const std::array<int, 2> Y_dims = {N * C, 1};
   math::ReduceMax<float, CUDAContext>(
-      2, dims.data(), 1, &axis, 1.0f, X, Y, context);
+      2, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
   return true;
 }
@@ -1773,10 +1773,10 @@ bool MaxPoolFunctor<CUDAContext>::
     const float* X,
     float* Y,
     CUDAContext* context) const {
-  const std::array<int, 3> dims = {N, HxW, C};
-  const int axis = 1;
+  const std::array<int, 3> X_dims = {N, HxW, C};
+  const std::array<int, 3> Y_dims = {N, 1, C};
   math::ReduceMax<float, CUDAContext>(
-      3, dims.data(), 1, &axis, 1.0f, X, Y, context);
+      3, X_dims.data(), Y_dims.data(), 1.0f, X, Y, context);
   return true;
 }
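All five pooling hunks are the same mechanical rewrite: the global-pooling fast path is a reduction of a {N * C, HxW} matrix (or {N, HxW, C} for NHWC) down to one value per channel, and only the way the reduced dimension is named changed. A small sketch of the computation these calls perform, assuming NCHW and alpha = 1 (illustrative, not the functor code):

#include <cstdio>

int main() {
  const int N = 1, C = 2, HxW = 4;
  // X viewed as an (N*C) x HxW matrix; Y_dims = {N * C, 1}.
  const float X[2 * 4] = {1, 2, 3, 4, 10, 20, 30, 40};
  float Y[2];
  for (int i = 0; i < N * C; ++i) {
    float sum = 0.0f;
    for (int j = 0; j < HxW; ++j) {
      sum += X[i * HxW + j];
    }
    Y[i] = sum / HxW;  // ReduceMean; ReduceMax would take the max instead
  }
  std::printf("Y = [%g, %g]\n", Y[0], Y[1]);  // Y = [2.5, 25]
  return 0;
}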
@@ -29,13 +29,13 @@ class ReduceOp final : public Operator<Context> {
   template <typename T>
   bool DoRunWithType() {
     const auto& X = Input(0);
-
     const int ndim = X.dim();
+    const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
     if (axes_.empty()) {
       axes_.resize(ndim);
       std::iota(axes_.begin(), axes_.end(), 0);
     } else {
-      for (auto& axis: axes_) {
+      for (auto& axis : axes_) {
         axis = X.canonical_axis_index(axis);
       }
       std::sort(axes_.begin(), axes_.end());
@@ -45,24 +45,29 @@ class ReduceOp final : public Operator<Context> {
           ndim,
           "Axes ids must be smaller than the dimensions of input.");
     }
-    const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
-    std::vector<int64_t> Y_dims;
-    Y_dims.reserve(ndim);
+    std::vector<int64_t> output_dims;
+    output_dims.reserve(ndim);
     std::size_t cur_axis = 0;
     for (int i = 0; i < ndim; ++i) {
       if (cur_axis < axes_.size() && i == axes_[cur_axis]) {
         if (keep_dims_) {
-          Y_dims.push_back(1);
+          output_dims.push_back(1);
         }
         ++cur_axis;
       } else {
-        Y_dims.push_back(X_dims[i]);
+        output_dims.push_back(X_dims[i]);
       }
     }
-    auto* Y = Output(0, Y_dims, at::dtype<T>());
+    auto* Y = Output(0, output_dims, at::dtype<T>());
 
+    std::vector<int> Y_dims = X_dims;
+    for (const int axis : axes_) {
+      Y_dims[axis] = 1;
+    }
+
     return reducer_.template Forward<T>(
         X_dims,
-        axes_,
+        Y_dims,
         X.template data<T>(),
         Y->template mutable_data<T>(),
         &context_);
@@ -71,7 +76,7 @@ class ReduceOp final : public Operator<Context> {
  private:
   std::vector<int> axes_;
   const int keep_dims_;
-  Reducer reducer_{};
+  const Reducer reducer_{};
 };
 
 template <typename InputTypes, class Context, class Reducer>
@@ -98,7 +103,7 @@ class ReduceGradientOp final : public Operator<Context> {
       axes_.resize(ndim);
       std::iota(axes_.begin(), axes_.end(), 0);
     } else {
-      for (auto& axis: axes_) {
+      for (auto& axis : axes_) {
         axis = X.canonical_axis_index(axis);
       }
       std::sort(axes_.begin(), axes_.end());
@@ -126,23 +131,22 @@ class ReduceGradientOp final : public Operator<Context> {
 
  private:
   std::vector<int> axes_;
-  Reducer reducer_{};
+  const Reducer reducer_{};
 };
 
 template <class Context>
 struct MinReducer {
   template <typename T>
   bool Forward(
-      const std::vector<int>& dims,
-      const std::vector<int>& axes,
+      const std::vector<int>& X_dims,
+      const std::vector<int>& Y_dims,
       const T* X_data,
       T* Y_data,
       Context* context) const {
     math::ReduceMin<T, Context>(
-        dims.size(),
-        dims.data(),
-        axes.size(),
-        axes.data(),
+        X_dims.size(),
+        X_dims.data(),
+        Y_dims.data(),
         T(1),
         X_data,
         Y_data,
@@ -165,16 +169,15 @@ template <class Context>
 struct MaxReducer {
   template <typename T>
   bool Forward(
-      const std::vector<int>& dims,
-      const std::vector<int>& axes,
+      const std::vector<int>& X_dims,
+      const std::vector<int>& Y_dims,
       const T* X_data,
       T* Y_data,
       Context* context) const {
     math::ReduceMax<T, Context>(
-        dims.size(),
-        dims.data(),
-        axes.size(),
-        axes.data(),
+        X_dims.size(),
+        X_dims.data(),
+        Y_dims.data(),
         T(1),
         X_data,
         Y_data,
@@ -197,16 +200,15 @@ template <class Context>
 struct SumReducer {
   template <typename T>
   bool Forward(
-      const std::vector<int>& dims,
-      const std::vector<int>& axes,
+      const std::vector<int>& X_dims,
+      const std::vector<int>& Y_dims,
       const T* X_data,
       T* Y_data,
       Context* context) const {
     math::ReduceSum<T, Context>(
-        dims.size(),
-        dims.data(),
-        axes.size(),
-        axes.data(),
+        X_dims.size(),
+        X_dims.data(),
+        Y_dims.data(),
         T(1),
         X_data,
         Y_data,
@@ -240,16 +242,15 @@ template <class Context>
 struct MeanReducer {
   template <typename T>
   bool Forward(
-      const std::vector<int>& dims,
-      const std::vector<int>& axes,
+      const std::vector<int>& X_dims,
+      const std::vector<int>& Y_dims,
       const T* X_data,
       T* Y_data,
       Context* context) const {
     math::ReduceMean<T, Context>(
-        dims.size(),
-        dims.data(),
-        axes.size(),
-        axes.data(),
+        X_dims.size(),
+        X_dims.data(),
+        Y_dims.data(),
         T(1),
         X_data,
         Y_data,
@@ -287,16 +288,15 @@ template <class Context>
 struct L1Reducer {
   template <typename T>
   bool Forward(
-      const std::vector<int>& dims,
-      const std::vector<int>& axes,
+      const std::vector<int>& X_dims,
+      const std::vector<int>& Y_dims,
       const T* X_data,
       T* Y_data,
       Context* context) const {
     math::ReduceL1<T, Context>(
-        dims.size(),
-        dims.data(),
-        axes.size(),
-        axes.data(),
+        X_dims.size(),
+        X_dims.data(),
+        Y_dims.data(),
         T(1),
         X_data,
         Y_data,
@@ -319,16 +319,15 @@ template <class Context>
 struct L2Reducer {
   template <typename T>
   bool Forward(
-      const std::vector<int>& dims,
-      const std::vector<int>& axes,
+      const std::vector<int>& X_dims,
+      const std::vector<int>& Y_dims,
       const T* X_data,
       T* Y_data,
       Context* context) const {
     math::ReduceL2<T, Context>(
-        dims.size(),
-        dims.data(),
-        axes.size(),
-        axes.data(),
+        X_dims.size(),
+        X_dims.data(),
+        Y_dims.data(),
         T(1),
         X_data,
         Y_data,
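After these hunks, every reducer shares one Forward contract: same-rank X_dims and Y_dims plus raw data pointers and a context. A hypothetical reducer written against that contract would slot into ReduceOp unchanged; note that ProdReducer below is not part of this commit, and its body inlines the reduction rather than calling a caffe2::math routine:

#include <vector>

// Hypothetical: shows the Forward contract only; not added by this commit.
template <class Context>
struct ProdReducer {
  template <typename T>
  bool Forward(
      const std::vector<int>& X_dims,
      const std::vector<int>& Y_dims,
      const T* X_data,
      T* Y_data,
      Context* /* context */) const {
    const int ndim = X_dims.size();
    int X_size = 1;
    int Y_size = 1;
    for (int d = 0; d < ndim; ++d) {
      X_size *= X_dims[d];
      Y_size *= Y_dims[d];
    }
    for (int i = 0; i < Y_size; ++i) {
      Y_data[i] = T(1);  // multiplicative identity as the init value
    }
    std::vector<int> index(ndim, 0);
    for (int xi = 0; xi < X_size; ++xi) {
      // Collapse the coordinate against Y_dims: reduced dims contribute 0.
      int yi = 0;
      for (int d = 0; d < ndim; ++d) {
        yi = yi * Y_dims[d] + (Y_dims[d] == 1 ? 0 : index[d]);
      }
      Y_data[yi] *= X_data[xi];
      // Advance the row-major coordinate over X_dims.
      for (int d = ndim - 1; d >= 0; --d) {
        if (++index[d] < X_dims[d]) {
          break;
        }
        index[d] = 0;
      }
    }
    return true;
  }
};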
@@ -31,34 +31,34 @@ class CAFFE2_API DefaultEngine {};
 
 namespace math {
 
 #define C10_DECLARE_COMPARE_OP(Comp)                               \
   template <typename T, class Context, bool kBroadcast1st = false> \
   void Rowwise##Comp(                                              \
       const int rows,                                              \
       const int cols,                                              \
       const T* A,                                                  \
       const T* B,                                                  \
       bool* C,                                                     \
       Context* context);                                           \
                                                                    \
   template <typename T, class Context, bool kBroadcast1st = false> \
   void Colwise##Comp(                                              \
       const int rows,                                              \
       const int cols,                                              \
       const T* A,                                                  \
       const T* B,                                                  \
       bool* C,                                                     \
       Context* context);                                           \
                                                                    \
   template <typename T, class Context>                             \
   void Comp(                                                       \
       const int A_ndim,                                            \
       const int* A_dims,                                           \
       const int B_ndim,                                            \
       const int* B_dims,                                           \
       const T* A,                                                  \
       const T* B,                                                  \
       bool* C,                                                     \
       Context* context);
 
 C10_DECLARE_COMPARE_OP(EQ)
@@ -115,80 +115,6 @@ C10_DECLARE_BINARY_OP(BitwiseXor)
 
 #undef C10_DECLARE_BINARY_OP
 
-template <typename T, class Context>
-CAFFE2_API void
-ReduceMin(const int N, const T* x, T* y, Tensor* scratch_ptr, Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void
-ReduceMax(const int N, const T* x, T* y, Tensor* scratch_ptr, Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceMin(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
-    const T alpha,
-    const T* X,
-    T* Y,
-    Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceMax(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
-    const T alpha,
-    const T* X,
-    T* Y,
-    Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceSum(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
-    const T alpha,
-    const T* X,
-    T* Y,
-    Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceMean(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
-    const T alpha,
-    const T* X,
-    T* Y,
-    Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceL1(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
-    const T alpha,
-    const T* X,
-    T* Y,
-    Context* context);
-
-template <typename T, class Context>
-CAFFE2_API void ReduceL2(
-    const int num_dims,
-    const int* dims,
-    const int num_axes,
-    const int* axes,
-    const T alpha,
-    const T* X,
-    T* Y,
-    Context* context);
-
 // Broadcasts X with X_dims to Y with Y_dims.
 template <typename T, class Context>
 CAFFE2_API void Broadcast(
@@ -337,9 +263,6 @@ CAFFE2_API void Gemv(
     Context* context,
     TensorProto::DataType math_type = TensorProto_DataType_FLOAT);
 
-template <typename T, class Context>
-CAFFE2_API void Set(const size_t N, const T alpha, T* X, Context* context);
-
 template <typename T, class Context>
 CAFFE2_API void
 RandUniform(const size_t n, const T a, const T b, T* r, Context* context);
@@ -409,25 +332,6 @@ CAFFE2_API void Select(
     T* y,
     Context* context);
 
-template <typename TAlpha, typename TData, class Context>
-CAFFE2_API void Scale(
-    const int N,
-    const TAlpha alpha,
-    const TData* x,
-    TData* y,
-    Context* context);
-
-// Different from the Scale function above, if alpha is passed in
-// as a pointer, we will assume that it lives on the Context device,
-// for example on GPU.
-template <typename TAlpha, typename TData, class Context>
-CAFFE2_API void Scale(
-    const int N,
-    const TAlpha* alpha,
-    const TData* x,
-    TData* y,
-    Context* context);
-
 template <typename T, class Context>
 CAFFE2_API void
 Axpy(const int N, const float alpha, const T* x, T* y, Context* context);
@@ -3,6 +3,10 @@
 #include <algorithm>
 #include <functional>
 
+#ifdef CAFFE2_USE_ACCELERATE
+#include <Accelerate/Accelerate.h>
+#endif // CAFFE2_USE_ACCELERATE
+
 #ifdef CAFFE2_USE_MKL
 #include <mkl.h>
 #endif // CAFFE2_USE_MKL
@@ -73,25 +77,25 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(float, Erf, vsErf)
 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Erf, vdErf)
 #undef DELEGATE_SIMPLE_UNARY_FUNCTION
 
-#define DELEGATE_SINCOS_FUNCTION(T, MKLFunc)                            \
+#define DELEGATE_SINCOS(T, MKLFunc)                                     \
   template <>                                                           \
   C10_EXPORT void SinCos<T, CPUContext>(                                \
       const int N, const T* X, T* S, T* C, CPUContext* /* context */) { \
     MKLFunc(N, X, S, C);                                                \
   }
-DELEGATE_SINCOS_FUNCTION(float, vsSinCos)
-DELEGATE_SINCOS_FUNCTION(double, vdSinCos)
-#undef DELEGATE_SINCOS_FUNCTION
+DELEGATE_SINCOS(float, vsSinCos)
+DELEGATE_SINCOS(double, vdSinCos)
+#undef DELEGATE_SINCOS
 
-#define DELEGATE_POWX_FUNCTION(T, MKLFunc)                                   \
+#define DELEGATE_POWX(T, MKLFunc)                                            \
   template <>                                                                \
   C10_EXPORT void Powx<T, CPUContext>(                                       \
       const int N, const T* A, const T b, T* Y, CPUContext* /* context */) { \
     MKLFunc(N, A, b, Y);                                                     \
   }
-DELEGATE_POWX_FUNCTION(float, vsPowx)
-DELEGATE_POWX_FUNCTION(double, vdPowx)
-#undef DELEGATE_POWX_FUNCTION
+DELEGATE_POWX(float, vsPowx)
+DELEGATE_POWX(double, vdPowx)
+#undef DELEGATE_POWX
 
 #define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Func, MKLFunc) \
   template <>                                             \
@@ -228,6 +232,155 @@ DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(double, Div, /)
 
 #endif // CAFFE2_USE_MKL
 
+////////////////////////////////////////////////////////////////////////////////
+// BLAS alternatives.
+// Depending on whether we have specified an external BLAS library or not, we
+// will delegate the Caffe math functions that are BLAS-related to either the
+// CBLAS call or the Eigen implementation.
+////////////////////////////////////////////////////////////////////////////////
+#ifdef CAFFE2_USE_EIGEN_FOR_BLAS
+
+#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData)                           \
+  template <>                                                             \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(                       \
+      const int N,                                                        \
+      const TAlpha alpha,                                                 \
+      const TData* X,                                                     \
+      TData* Y,                                                           \
+      CPUContext* /* context */) {                                        \
+    if (X == Y) {                                                         \
+      EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(alpha);      \
+    } else {                                                              \
+      EigenVectorArrayMap<TData>(Y, N) =                                  \
+          ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \
+    }                                                                     \
+  }                                                                       \
+  template <>                                                             \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(                       \
+      const int N,                                                        \
+      const TAlpha* alpha,                                                \
+      const TData* X,                                                     \
+      TData* Y,                                                           \
+      CPUContext* /* context */) {                                        \
+    if (X == Y) {                                                         \
+      EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(*alpha);     \
+    } else {                                                              \
+      EigenVectorArrayMap<TData>(Y, N) =                                  \
+          ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \
+    }                                                                     \
+  }
+CAFFE2_SPECIALIZED_SCALE(float, float)
+CAFFE2_SPECIALIZED_SCALE(double, double)
+CAFFE2_SPECIALIZED_SCALE(float, double)
+#undef CAFFE2_SPECIALIZED_SCALE
+
+#else // CAFFE2_USE_EIGEN_FOR_BLAS
+
+#ifdef CAFFE2_USE_MKL
+
+#define DELEGATE_SCALE(TAlpha, TData, MKLFunc1, MKLFunc2)           \
+  template <>                                                       \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(                 \
+      const int N,                                                  \
+      const TAlpha alpha,                                           \
+      const TData* X,                                               \
+      TData* Y,                                                     \
+      CPUContext* /* context */) {                                  \
+    if (Y == X) {                                                   \
+      MKLFunc1(N, static_cast<TData>(alpha), Y, 1);                 \
+    } else {                                                        \
+      MKLFunc2(N, static_cast<TData>(alpha), X, 1, TData(0), Y, 1); \
+    }                                                               \
+  }                                                                 \
+  template <>                                                       \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(                 \
+      const int N,                                                  \
+      const TAlpha* alpha,                                          \
+      const TData* X,                                               \
+      TData* Y,                                                     \
+      CPUContext* /* context */) {                                  \
+    if (Y == X) {                                                   \
+      MKLFunc1(N, static_cast<TData>(*alpha), Y, 1);                \
+    } else {                                                        \
+      MKLFunc2(N, static_cast<TData>(*alpha), X, 1, TData(0), Y, 1); \
+    }                                                               \
+  }
+DELEGATE_SCALE(float, float, cblas_sscal, cblas_saxpby)
+DELEGATE_SCALE(double, double, cblas_dscal, cblas_daxpby)
+DELEGATE_SCALE(float, double, cblas_dscal, cblas_daxpby)
+#undef DELEGATE_SCALE
+
+#else // CAFFE2_USE_MKL
+
+#define DELEGATE_SCALE(TAlpha, TData, BLASFunc)                           \
+  template <>                                                             \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(                       \
+      const int N,                                                        \
+      const TAlpha alpha,                                                 \
+      const TData* X,                                                     \
+      TData* Y,                                                           \
+      CPUContext* /* context */) {                                        \
+    if (Y == X) {                                                         \
+      BLASFunc(N, static_cast<TData>(alpha), Y, 1);                       \
+    } else {                                                              \
+      EigenVectorArrayMap<TData>(Y, N) =                                  \
+          ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \
+    }                                                                     \
+  }                                                                       \
+  template <>                                                             \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(                       \
+      const int N,                                                        \
+      const TAlpha* alpha,                                                \
+      const TData* X,                                                     \
+      TData* Y,                                                           \
+      CPUContext* /* context */) {                                        \
+    if (Y == X) {                                                         \
+      BLASFunc(N, static_cast<TData>(*alpha), Y, 1);                      \
+    } else {                                                              \
+      EigenVectorArrayMap<TData>(Y, N) =                                  \
+          ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \
+    }                                                                     \
+  }
+DELEGATE_SCALE(float, float, cblas_sscal)
+DELEGATE_SCALE(double, double, cblas_dscal)
+DELEGATE_SCALE(float, double, cblas_dscal)
+#undef DELEGATE_SCALE
+
+#endif // CAFFE2_USE_MKL
+
+#endif // CAFFE2_USE_EIGEN_FOR_BLAS
+
+////////////////////////////////////////////////////////////////////////////////
+// Common math functions being used in Caffe that do not have a BLAS or MKL
+// equivalent. For all these functions, we will simply implement them either via
+// Eigen or via custom code.
+////////////////////////////////////////////////////////////////////////////////
+
+#define CAFFE2_SPECIALIZED_SET(T)                                   \
+  template <>                                                       \
+  C10_EXPORT void Set<T, CPUContext>(                               \
+      const int N, const T alpha, T* Y, CPUContext* /* context */) { \
+    if (N == 0) {                                                   \
+      return;                                                       \
+    }                                                               \
+    if (alpha == T(0)) {                                            \
+      std::memset(Y, 0, N * sizeof(T));                             \
+    } else {                                                        \
+      EigenVectorArrayMap<T>(Y, N).setConstant(alpha);              \
+    }                                                               \
+  }
+CAFFE2_SPECIALIZED_SET(float)
+CAFFE2_SPECIALIZED_SET(double)
+CAFFE2_SPECIALIZED_SET(int)
+CAFFE2_SPECIALIZED_SET(std::int8_t)
+CAFFE2_SPECIALIZED_SET(std::int16_t)
+CAFFE2_SPECIALIZED_SET(std::int64_t)
+CAFFE2_SPECIALIZED_SET(bool)
+CAFFE2_SPECIALIZED_SET(char)
+CAFFE2_SPECIALIZED_SET(std::uint8_t)
+CAFFE2_SPECIALIZED_SET(std::uint16_t)
+#undef CAFFE2_SPECIALIZED_SET
+
 #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Func, EigenFunc) \
   template <>                                              \
   C10_EXPORT void Func<T, CPUContext>(                     \
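The Scale specializations added above all share one behavioral detail worth noting: when the input and output pointers alias (X == Y) the scale happens in place (Eigen *=, or cblas_?scal on the BLAS paths), and otherwise the result is written as copy-times-alpha (an Eigen expression, or cblas_?axpby under MKL). A plain-loop sketch of that contract (illustrative only, not the library code):

#include <cstdio>

template <typename TAlpha, typename TData>
void Scale(int N, TAlpha alpha, const TData* X, TData* Y) {
  // One loop covers both cases: writing through Y is safe when X == Y.
  for (int i = 0; i < N; ++i) {
    Y[i] = X[i] * static_cast<TData>(alpha);
  }
}

int main() {
  float a[3] = {1.0f, 2.0f, 3.0f};
  float b[3];
  Scale(3, 2.0, a, b);    // out-of-place, TAlpha = double, TData = float
  Scale(3, 10.0f, b, b);  // in-place: b *= 10
  std::printf("%g %g %g\n", b[0], b[1], b[2]);  // 20 40 60
  return 0;
}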
@@ -262,6 +415,39 @@ CAFFE2_SPECIALIZED_NEG(float)
 CAFFE2_SPECIALIZED_NEG(double)
 #undef CAFFE2_SPECIALIZED_NEG
 
+#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData)                           \
+  template <>                                                             \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(                       \
+      const int N,                                                        \
+      const TAlpha alpha,                                                 \
+      const TData* X,                                                     \
+      TData* Y,                                                           \
+      CPUContext* /* context */) {                                        \
+    if (X == Y) {                                                         \
+      EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(alpha);      \
+    } else {                                                              \
+      EigenVectorArrayMap<TData>(Y, N) =                                  \
+          ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \
+    }                                                                     \
+  }                                                                       \
+  template <>                                                             \
+  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(                       \
+      const int N,                                                        \
+      const TAlpha* alpha,                                                \
+      const TData* X,                                                     \
+      TData* Y,                                                           \
+      CPUContext* /* context */) {                                        \
+    if (X == Y) {                                                         \
+      EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(*alpha);     \
+    } else {                                                              \
+      EigenVectorArrayMap<TData>(Y, N) =                                  \
+          ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \
+    }                                                                     \
+  }
+CAFFE2_SPECIALIZED_SCALE(std::int32_t, std::int32_t)
+CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t)
+#undef CAFFE2_SPECIALIZED_SCALE
+
 #define DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(T, Func, EigenOp) \
   template <>                                                               \
   C10_EXPORT void Func<T, CPUContext>(                                      \
@@ -286,8 +472,12 @@ DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, Div, /)
     EigenVectorMap<T>(C, N) = ConstEigenVectorArrayMap<T>(A, N).EigenFunc( \
         ConstEigenVectorArrayMap<T>(B, N));                                \
   }
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int32_t, Min, min)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int64_t, Min, min)
 DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(float, Min, min)
 DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(double, Min, min)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int32_t, Max, max)
+DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int64_t, Max, max)
 DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(float, Max, max)
 DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(double, Max, max)
 #undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION
@@ -56,6 +56,19 @@ CAFFE2_API void Inv(int N, const T* X, T* Y, Context* context);
 template <typename T, class Context>
 CAFFE2_API void Erf(int N, const T* X, T* Y, Context* context);
 
+template <typename T, class Context>
+CAFFE2_API void Set(int N, T alpha, T* X, Context* context);
+
+template <typename TAlpha, typename TData, class Context>
+CAFFE2_API void
+Scale(int N, TAlpha alpha, const TData* X, TData* Y, Context* context);
+
+// Different from the Scale function above, if alpha is passed in as a pointer,
+// we will assume that it lives on the Context device, for example on GPU.
+template <typename TAlpha, typename TData, class Context>
+CAFFE2_API void
+Scale(int N, const TAlpha* alpha, const TData* X, TData* Y, Context* context);
+
 template <typename T, class Context>
 CAFFE2_API void Add(int N, const T* A, const T* B, T* C, Context* context);
 template <typename T, class Context>
@@ -6,8 +6,17 @@
 #include <numeric>
 #include <vector>
 
+#ifdef CAFFE2_USE_ACCELERATE
+#include <Accelerate/Accelerate.h>
+#endif // CAFFE2_USE_ACCELERATE
+
+#ifdef CAFFE2_USE_MKL
+#include <mkl.h>
+#endif // CAFFE2_USE_MKL
+
 #include "caffe2/core/context.h"
 #include "caffe2/utils/eigen_utils.h"
+#include "caffe2/utils/math/elementwise.h"
 #include "caffe2/utils/math/utils.h"
 
 namespace caffe2 {
@@ -15,9 +24,385 @@ namespace math {
 
 namespace {
 
+#define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenFunc)                    \
+  template <typename T>                                                      \
+  void Rowwise##Func(                                                        \
+      const int rows,                                                        \
+      const int cols,                                                        \
+      const T alpha,                                                         \
+      const T* X,                                                            \
+      T* Y,                                                                  \
+      CPUContext* /* context */) {                                           \
+    EigenVectorMap<T>(Y, rows) =                                             \
+        ConstEigenMatrixMap<T>(X, cols, rows).colwise().EigenFunc() * alpha; \
+  }
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceSum, sum)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMean, mean)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm)
+#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
+
+#ifndef CAFFE2_USE_EIGEN_FOR_BLAS
+
+#define DELEGATE_ROWWISE_REDUCE_FUNCTION(T, Func, BLASFunc) \
+  template <>                                               \
+  void Rowwise##Func(                                       \
+      const int rows,                                       \
+      const int cols,                                       \
+      const T alpha,                                        \
+      const T* X,                                           \
+      T* Y,                                                 \
+      CPUContext* /* context */) {                          \
+    for (int i = 0; i < rows; ++i) {                        \
+      Y[i] = BLASFunc(cols, X + i * cols, 1) * alpha;       \
+    }                                                       \
+  }
+DELEGATE_ROWWISE_REDUCE_FUNCTION(float, ReduceL1, cblas_sasum)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(double, ReduceL1, cblas_dasum)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(float, ReduceL2, cblas_snrm2)
+DELEGATE_ROWWISE_REDUCE_FUNCTION(double, ReduceL2, cblas_dnrm2)
+#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
+
+#endif // CAFFE2_USE_EIGEN_FOR_BLAS
+
+#define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, MathFunc)          \
+  template <typename T>                                           \
+  void Colwise##Func(                                             \
+      const int rows,                                             \
+      const int cols,                                             \
+      const T alpha,                                              \
+      const T* X,                                                 \
+      T* Y,                                                       \
+      CPUContext* context) {                                      \
+    std::memcpy(Y, X, sizeof(T) * cols);                          \
+    for (int i = 1; i < rows; ++i) {                              \
+      MathFunc<T, CPUContext>(cols, Y, X + i * cols, Y, context); \
+    }                                                             \
+    Scale<T, T, CPUContext>(cols, alpha, Y, Y, context);          \
+  }
+DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMin, Min)
+DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMax, Max)
+DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceSum, Add)
+#undef DELEGATE_COLWISE_REDUCE_FUNCTION
+
-template <typename T>
-C10_EXPORT void
-RowwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
+template <typename T>
+void ColwiseReduceMean(
+    const int rows,
+    const int cols,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ColwiseReduceSum<T>(rows, cols, alpha / static_cast<T>(rows), X, Y, context);
+}
+
+template <typename T>
+void ColwiseReduceL1(
+    const int rows,
+    const int cols,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ConstEigenArrayMap<T> X_arr(X, cols, rows);
+  EigenVectorArrayMap<T> Y_arr(Y, cols);
+  Y_arr = X_arr.col(0).abs();
+  for (int i = 1; i < rows; ++i) {
+    Y_arr += X_arr.col(i).abs();
+  }
+  Scale<T, T, CPUContext>(cols, alpha, Y, Y, context);
+}
+
+template <typename T>
+void ColwiseReduceL2(
+    const int rows,
+    const int cols,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* /* context */) {
+  ConstEigenArrayMap<T> X_arr(X, cols, rows);
+  EigenVectorArrayMap<T> Y_arr(Y, cols);
+  Y_arr = X_arr.col(0).square();
+  for (int i = 1; i < rows; ++i) {
+    Y_arr += X_arr.col(i).square();
+  }
+  Y_arr = Y_arr.sqrt() * alpha;
+}
+
+template <typename T>
+void BothEndsReduceMin(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorArrayMap<T> Y_arr(Y, N);
+  Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().minCoeff();
+  for (int i = 1; i < M; ++i) {
+    ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
+    for (int j = 0; j < N; ++j) {
+      Y[j] = std::min(Y[j], X_arr.col(j).minCoeff());
+    }
+  }
+  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
+}
+
+template <typename T>
+void BothEndsReduceMax(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorArrayMap<T> Y_arr(Y, N);
+  Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().maxCoeff();
+  for (int i = 1; i < M; ++i) {
+    ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
+    for (int j = 0; j < N; ++j) {
+      Y[j] = std::max(Y[j], X_arr.col(j).maxCoeff());
+    }
+  }
+  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
+}
+
+template <typename T>
+void BothEndsReduceSum(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorArrayMap<T> Y_arr(Y, N);
+  Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().sum();
+  for (int i = 1; i < M; ++i) {
+    Y_arr += ConstEigenArrayMap<T>(X + i * N * K, K, N).colwise().sum();
+  }
+  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
+}
+
+template <typename T>
+void BothEndsReduceMean(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorArrayMap<T> Y_arr(Y, N);
+  Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().mean();
+  for (int i = 1; i < M; ++i) {
+    Y_arr += ConstEigenArrayMap<T>(X + i * N * K, K, N).colwise().mean();
+  }
+  Scale<T, T, CPUContext>(N, alpha / static_cast<T>(M), Y, Y, context);
+}
+
+template <typename T>
+void BothEndsReduceL1(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  EigenVectorMap<T> Y_vec(Y, N);
+  Y_vec = ConstEigenMatrixMap<T>(X, K, N).colwise().template lpNorm<1>();
+  for (int i = 1; i < M; ++i) {
+    Y_vec += ConstEigenMatrixMap<T>(X + i * N * K, K, N)
+                 .colwise()
+                 .template lpNorm<1>();
+  }
+  Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
+}
+
+template <typename T>
+void BothEndsReduceL2(
+    const int M,
+    const int N,
+    const int K,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* /* context */) {
+  EigenVectorMap<T> Y_vec(Y, N);
+  Y_vec = ConstEigenMatrixMap<T>(X, K, N).colwise().squaredNorm();
+  for (int i = 1; i < M; ++i) {
+    Y_vec +=
+        ConstEigenMatrixMap<T>(X + i * N * K, K, N).colwise().squaredNorm();
+  }
+  Y_vec = Y_vec.cwiseSqrt() * alpha;
+}
+
+template <typename T, class Reducer>
+void ReduceTensorImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const Reducer& reducer,
+    const T init,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  const int X_size =
+      std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Set<T, CPUContext>(Y_size, init, Y, context);
+  std::vector<int> index(ndim, 0);
+  for (int X_index = 0; X_index < X_size; ++X_index) {
+    const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data());
+    Y[Y_index] = reducer(Y[Y_index], X[X_index]);
+    utils::IncreaseIndexInDims(ndim, X_dims, index.data());
+  }
+}
+
+template <typename T>
+void ReduceMinImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ReduceTensorImpl(
+      ndim,
+      X_dims,
+      Y_dims,
+      [](const T a, const T b) { return std::min(a, b); },
+      std::numeric_limits<T>::max(),
+      X,
+      Y,
+      context);
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
+}
+
+template <typename T>
+void ReduceMaxImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ReduceTensorImpl(
+      ndim,
+      X_dims,
+      Y_dims,
+      [](const T a, const T b) { return std::max(a, b); },
+      std::numeric_limits<T>::lowest(),
+      X,
+      Y,
+      context);
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
+}
+
+template <typename T>
+void ReduceSumImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ReduceTensorImpl(ndim, X_dims, Y_dims, std::plus<T>(), T(0), X, Y, context);
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
+}
+
+template <typename T>
+void ReduceMeanImpl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ReduceTensorImpl(ndim, X_dims, Y_dims, std::plus<T>(), T(0), X, Y, context);
+  const int X_size =
+      std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Scale<T, T, CPUContext>(
+      Y_size,
+      alpha * static_cast<T>(Y_size) / static_cast<T>(X_size),
+      Y,
+      Y,
+      context);
+}
+
+template <typename T>
+void ReduceL1Impl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ReduceTensorImpl(
+      ndim,
+      X_dims,
+      Y_dims,
+      [](const T a, const T b) { return a + std::abs(b); },
+      T(0),
+      X,
+      Y,
+      context);
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
+}
+
+template <typename T>
+void ReduceL2Impl(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    CPUContext* context) {
+  ReduceTensorImpl(
+      ndim,
+      X_dims,
+      Y_dims,
+      [](const T a, const T b) { return a + b * b; },
+      T(0),
+      X,
+      Y,
+      context);
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
+  EigenVectorArrayMap<T> Y_arr(Y, Y_size);
+  Y_arr = Y_arr.sqrt() * alpha;
+}
+
+template <typename T>
+void RowwiseMoments(
+    const int rows,
+    const int cols,
+    const T* X,
+    T* mean,
+    T* var) {
   ConstEigenArrayMap<T> X_arr(X, cols, rows);
   EigenVectorArrayMap<T> mean_arr(mean, rows);
   EigenVectorArrayMap<T> var_arr(var, rows);
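The generic fallback above, ReduceTensorImpl, walks X linearly and folds each element into the right output slot via utils::GetIndexFromDims and utils::IncreaseIndexInDims. A standalone demo of that index arithmetic, using simplified but equivalent helpers (sum-reducing a 2x3 array over axis 0):

#include <cstdio>

// Simplified equivalents of the caffe2::math::utils helpers used above.
int GetIndexFromDims(int ndim, const int* dims, const int* index) {
  int ret = 0;
  for (int d = 0; d < ndim; ++d) {
    ret = ret * dims[d] + (dims[d] == 1 ? 0 : index[d]);
  }
  return ret;
}

void IncreaseIndexInDims(int ndim, const int* dims, int* index) {
  for (int d = ndim - 1; d >= 0; --d) {
    if (++index[d] < dims[d]) {
      return;
    }
    index[d] = 0;
  }
}

int main() {
  const int X_dims[2] = {2, 3};
  const int Y_dims[2] = {1, 3};  // reduce over axis 0
  const float X[6] = {1, 2, 3, 4, 5, 6};
  float Y[3] = {0, 0, 0};  // init = 0 for a sum reduction
  int index[2] = {0, 0};
  for (int xi = 0; xi < 6; ++xi) {
    Y[GetIndexFromDims(2, Y_dims, index)] += X[xi];
    IncreaseIndexInDims(2, X_dims, index);
  }
  std::printf("%g %g %g\n", Y[0], Y[1], Y[2]);  // 5 7 9
  return 0;
}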
@@ -26,15 +411,19 @@ RowwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var)
 }
 
 template <typename T>
-C10_EXPORT void
-ColwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var) {
-  std::memset(mean, 0, sizeof(T) * cols);
-  std::memset(var, 0, sizeof(T) * cols);
+void ColwiseMoments(
+    const int rows,
+    const int cols,
+    const T* X,
+    T* mean,
+    T* var) {
   ConstEigenArrayMap<T> X_arr(X, cols, rows);
   EigenVectorArrayMap<T> mean_arr(mean, cols);
   EigenVectorArrayMap<T> var_arr(var, cols);
   // Eigen rowwise reduction is about 10 times slower than this for-loop.
-  for (int i = 0; i < rows; ++i) {
+  mean_arr = X_arr.col(0);
+  var_arr = X_arr.col(0).square();
+  for (int i = 1; i < rows; ++i) {
     mean_arr += X_arr.col(i);
     var_arr += X_arr.col(i).square();
   }
@@ -44,32 +433,30 @@ ColwiseMoments(const int rows, const int cols, const T* X, T* mean, T* var)
 }
 
 template <typename T>
-C10_EXPORT void BothEndsMoments(
-    const int pre,
-    const int mid,
-    const int nxt,
+void BothEndsMoments(
+    const int M,
+    const int N,
+    const int K,
     const T* X,
     T* mean,
     T* var) {
-  std::memset(mean, 0, sizeof(T) * mid);
-  std::memset(var, 0, sizeof(T) * mid);
-  EigenVectorArrayMap<T> mean_arr(mean, mid);
-  EigenVectorArrayMap<T> var_arr(var, mid);
-  ConstEigenArrayMap<T> X_arr(X, nxt, pre * mid);
-  for (int i = 0; i < pre; ++i) {
-    for (int j = 0; j < mid; ++j) {
-      const int c = i * mid + j;
-      mean_arr(j) += X_arr.col(c).sum();
-      var_arr(j) += X_arr.col(c).square().sum();
-    }
+  EigenVectorArrayMap<T> mean_arr(mean, N);
+  EigenVectorArrayMap<T> var_arr(var, N);
+  ConstEigenArrayMap<T> X0_arr(X, K, N);
+  mean_arr = X0_arr.colwise().sum();
+  var_arr = X0_arr.square().colwise().sum();
+  for (int i = 1; i < M; ++i) {
+    ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
+    mean_arr += X_arr.colwise().sum();
+    var_arr += X_arr.square().colwise().sum();
   }
-  const T scale = T(1) / static_cast<T>(pre * nxt);
+  const T scale = T(1) / static_cast<T>(M * K);
   mean_arr *= scale;
   var_arr = var_arr * scale - mean_arr.square();
 }
 
 template <typename T>
-C10_EXPORT void MomentsImpl(
+void MomentsImpl(
     const int ndim,
     const int* X_dims,
     const int* Y_dims,
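The BothEndsMoments rewrite accumulates per-column sums and squared sums across the M outer slices and then normalizes once, which is the standard one-pass moments identity, computed independently for each of the N middle coordinates. In LaTeX:

\mu_j = \frac{1}{MK} \sum_{i=1}^{M} \sum_{k=1}^{K} X_{i,j,k},
\qquad
\sigma_j^2 = \frac{1}{MK} \sum_{i=1}^{M} \sum_{k=1}^{K} X_{i,j,k}^2 - \mu_j^2

i.e. Var(X) = E[X^2] - (E[X])^2, which is exactly the final line var_arr = var_arr * scale - mean_arr.square() above.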
@@ -126,6 +513,128 @@ C10_EXPORT void MomentsImpl(
 
 } // namespace
 
+#define DELEGATE_GLOBAL_REDUCE_FUNCTION(T, Func, EigenFunc) \
+  template <>                                               \
+  C10_EXPORT void Func<T, CPUContext>(                      \
+      const int N,                                          \
+      const T* X,                                           \
+      T* Y,                                                 \
+      Tensor* /* scratch_ptr */,                            \
+      CPUContext* /* context */) {                          \
+    *Y = ConstEigenVectorArrayMap<T>(X, N).EigenFunc();     \
+  }
+DELEGATE_GLOBAL_REDUCE_FUNCTION(float, ReduceMin, minCoeff)
+DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int32_t, ReduceMin, minCoeff)
+DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int64_t, ReduceMin, minCoeff)
+DELEGATE_GLOBAL_REDUCE_FUNCTION(float, ReduceMax, maxCoeff)
+DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int32_t, ReduceMax, maxCoeff)
+DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int64_t, ReduceMax, maxCoeff)
+#undef DELEGATE_GLOBAL_REDUCE_FUNCTION
+
+#define DELEGATE_REDUCE_FUNCTION(T, Func, kInit, kIsNorm)                  \
+  template <>                                                              \
+  C10_EXPORT void Func<T, CPUContext>(                                     \
+      const int ndim,                                                      \
+      const int* X_dims,                                                   \
+      const int* Y_dims,                                                   \
+      const T alpha,                                                       \
+      const T* X,                                                          \
+      T* Y,                                                                \
+      CPUContext* context) {                                               \
+    const int X_size =                                                     \
+        std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>()); \
+    const int Y_size =                                                     \
+        std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>()); \
+    if (X_size == 0) {                                                     \
+      Set<T, CPUContext>(Y_size, alpha * kInit, Y, context);               \
+      return;                                                              \
+    }                                                                      \
+    if (alpha == T(0)) {                                                   \
+      std::memset(Y, 0, sizeof(T) * Y_size);                               \
+      return;                                                              \
+    }                                                                      \
+    if (std::equal(X_dims, X_dims + ndim, Y_dims)) {                       \
+      if (kIsNorm) {                                                       \
+        EigenVectorArrayMap<T>(Y, Y_size) =                                \
+            ConstEigenVectorArrayMap<T>(X, X_size).abs() * alpha;          \
+      } else {                                                             \
+        Scale<T, T, CPUContext>(Y_size, alpha, X, Y, context);             \
+      }                                                                    \
+      return;                                                              \
+    }                                                                      \
+    int rows;                                                              \
+    int cols;                                                              \
+    if (utils::IsRowwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {      \
+      Rowwise##Func<T>(rows, cols, alpha, X, Y, context);                  \
+      return;                                                              \
+    }                                                                      \
+    if (utils::IsColwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {      \
+      Colwise##Func<T>(rows, cols, alpha, X, Y, context);                  \
+      return;                                                              \
+    }                                                                      \
+    int M;                                                                 \
+    int N;                                                                 \
+    int K;                                                                 \
+    if (utils::IsBothEndsReduce(ndim, X_dims, Y_dims, &M, &N, &K)) {       \
+      BothEnds##Func<T>(M, N, K, alpha, X, Y, context);                    \
+      return;                                                              \
+    }                                                                      \
+    Func##Impl<T>(ndim, X_dims, Y_dims, alpha, X, Y, context);             \
+  }
+DELEGATE_REDUCE_FUNCTION(
+    float,
+    ReduceMin,
+    std::numeric_limits<float>::max(),
+    false)
+DELEGATE_REDUCE_FUNCTION(
+    double,
+    ReduceMin,
+    std::numeric_limits<double>::max(),
+    false)
+DELEGATE_REDUCE_FUNCTION(
+    std::int32_t,
+    ReduceMin,
+    std::numeric_limits<std::int32_t>::max(),
+    false)
+DELEGATE_REDUCE_FUNCTION(
+    std::int64_t,
+    ReduceMin,
+    std::numeric_limits<std::int64_t>::max(),
+    false)
+DELEGATE_REDUCE_FUNCTION(
+    float,
+    ReduceMax,
+    std::numeric_limits<float>::lowest(),
+    false)
+DELEGATE_REDUCE_FUNCTION(
+    double,
+    ReduceMax,
+    std::numeric_limits<double>::lowest(),
+    false)
+DELEGATE_REDUCE_FUNCTION(
+    std::int32_t,
+    ReduceMax,
+    std::numeric_limits<std::int32_t>::lowest(),
+    false)
+DELEGATE_REDUCE_FUNCTION(
+    std::int64_t,
+    ReduceMax,
+    std::numeric_limits<std::int64_t>::lowest(),
+    false)
+DELEGATE_REDUCE_FUNCTION(float, ReduceSum, 0.0f, false)
+DELEGATE_REDUCE_FUNCTION(double, ReduceSum, 0.0, false)
+DELEGATE_REDUCE_FUNCTION(std::int32_t, ReduceSum, 0, false)
+DELEGATE_REDUCE_FUNCTION(std::int64_t, ReduceSum, 0LL, false)
+DELEGATE_REDUCE_FUNCTION(float, ReduceMean, 0.0f, false)
+DELEGATE_REDUCE_FUNCTION(double, ReduceMean, 0.0, false)
+DELEGATE_REDUCE_FUNCTION(float, ReduceL1, 0.0f, true)
+DELEGATE_REDUCE_FUNCTION(double, ReduceL1, 0.0, true)
+DELEGATE_REDUCE_FUNCTION(std::int32_t, ReduceL1, 0, true)
+DELEGATE_REDUCE_FUNCTION(std::int64_t, ReduceL1, 0LL, true)
+DELEGATE_REDUCE_FUNCTION(float, ReduceL2, 0.0f, true)
+DELEGATE_REDUCE_FUNCTION(double, ReduceL2, 0.0, true)
+#undef DELEGATE_REDUCE_FUNCTION
+
 #define CAFFE2_SPECIALIZED_MOMENTS(T)       \
   template <>                               \
   C10_EXPORT void Moments<T, CPUContext>(   \
@@ -5,8 +5,90 @@
 #include "caffe2/core/types.h"
 
 namespace caffe2 {
 
+class Tensor;
+
 namespace math {
 
+template <typename T, class Context>
+CAFFE2_API void
+ReduceMin(const int N, const T* X, T* y, Tensor* scratch_ptr, Context* context);
+
+template <typename T, class Context>
+CAFFE2_API void
+ReduceMax(const int N, const T* X, T* y, Tensor* scratch_ptr, Context* context);
+
+// In all of the reduce functions, X_dims and Y_dims should have ndim elements.
+// Each dimension of Y_dims must match the corresponding dimension of X_dims or
+// must be equal to 1. The dimensions equal to 1 indicate the dimensions of X to
+// be reduced.
+
+// Y = alpha * ReduceMin(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceMin(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    Context* context);
+
+// Y = alpha * ReduceMax(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceMax(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    Context* context);
+
+// Y = alpha * ReduceSum(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceSum(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    Context* context);
+
+// Y = alpha * ReduceMean(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceMean(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    Context* context);
+
+// Y = alpha * ReduceL1(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceL1(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    Context* context);
+
+// Y = alpha * ReduceL2(X)
+template <typename T, class Context>
+CAFFE2_API void ReduceL2(
+    const int ndim,
+    const int* X_dims,
+    const int* Y_dims,
+    const T alpha,
+    const T* X,
+    T* Y,
+    Context* context);
+
 // Computes mean and variance over axes.
 template <typename T, class Context>
 CAFFE2_API void Moments(
@@ -19,6 +101,7 @@ CAFFE2_API void Moments(
     Context* context);
 
 } // namespace math
+
 } // namespace caffe2
 
 #endif // CAFFE2_UTILS_MATH_REDUCE_H_
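To tie the header contract together: a hedged usage sketch of the dims-based API it declares, specialized to the 2-D case (this re-implements the semantics for illustration; real callers pass a CPUContext and link against caffe2::math):

#include <cstdio>

// Mirrors ReduceSum(ndim, X_dims, Y_dims, alpha, X, Y, context) for ndim == 2,
// assuming exactly one of the two dimensions is reduced.
void ReduceSum2D(const int* X_dims, const int* Y_dims, float alpha,
                 const float* X, float* Y) {
  const int rows = X_dims[0];
  const int cols = X_dims[1];
  const bool reduce_cols = (Y_dims[1] == 1);  // {rows, 1}: row-wise sums
  const int Y_size = Y_dims[0] * Y_dims[1];
  for (int i = 0; i < Y_size; ++i) {
    Y[i] = 0.0f;
  }
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      Y[reduce_cols ? i : j] += X[i * cols + j];
    }
  }
  for (int i = 0; i < Y_size; ++i) {
    Y[i] *= alpha;  // Y = alpha * ReduceSum(X), as documented above
  }
}

int main() {
  const int X_dims[2] = {2, 3};
  const int Y_dims[2] = {2, 1};  // reduce the column dimension
  const float X[6] = {1, 2, 3, 4, 5, 6};
  float Y[2];
  ReduceSum2D(X_dims, Y_dims, 0.5f, X, Y);
  std::printf("%g %g\n", Y[0], Y[1]);  // 3 7.5
  return 0;
}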
@@ -375,40 +375,6 @@ C10_EXPORT void Gemv<float, CPUContext>(
   cblas_sgemv(CblasRowMajor, trans_A, M, N, alpha, A, N, x, 1, beta, y, 1);
 }
 
-#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData, prefix)          \
-  template <>                                                    \
-  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(              \
-      const int n,                                               \
-      const TAlpha alpha,                                        \
-      const TData* x,                                            \
-      TData* y,                                                  \
-      CPUContext*) {                                             \
-    if (y != x) {                                                \
-      cblas_##prefix##copy(n, x, 1, y, 1);                       \
-    }                                                            \
-    if (alpha != TAlpha(1)) {                                    \
-      cblas_##prefix##scal(n, static_cast<TData>(alpha), y, 1);  \
-    }                                                            \
-  }                                                              \
-  template <>                                                    \
-  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(              \
-      const int n,                                               \
-      const TAlpha* alpha,                                       \
-      const TData* x,                                            \
-      TData* y,                                                  \
-      CPUContext*) {                                             \
-    if (y != x) {                                                \
-      cblas_##prefix##copy(n, x, 1, y, 1);                       \
-    }                                                            \
-    if (*alpha != TAlpha(1)) {                                   \
-      cblas_##prefix##scal(n, static_cast<TData>(*alpha), y, 1); \
-    }                                                            \
-  }
-CAFFE2_SPECIALIZED_SCALE(float, float, s)
-CAFFE2_SPECIALIZED_SCALE(double, double, d)
-CAFFE2_SPECIALIZED_SCALE(float, double, d)
-#undef CAFFE2_SPECIALIZED_SCALE
-
 #define CAFFE2_SPECIALIZED_DOT(T, prefix) \
   template <>                             \
   C10_EXPORT void Dot<T, CPUContext>(     \
@@ -486,36 +452,6 @@ CAFFE2_SPECIALIZED_AXPBY(float, s)
 
 #endif // CAFFE2_USE_EIGEN_FOR_BLAS
 
-#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData)                        \
-  template <>                                                          \
-  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(                    \
-      const int n,                                                     \
-      const TAlpha alpha,                                              \
-      const TData* x,                                                  \
-      TData* y,                                                        \
-      CPUContext* /* context */) {                                     \
-    EigenVectorMap<TData>(y, n) =                                      \
-        ConstEigenVectorMap<TData>(x, n) * static_cast<TData>(alpha);  \
-  }                                                                    \
-  template <>                                                          \
-  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(                    \
-      const int n,                                                     \
-      const TAlpha* alpha,                                             \
-      const TData* x,                                                  \
-      TData* y,                                                        \
-      CPUContext* /* context */) {                                     \
-    EigenVectorMap<TData>(y, n) =                                      \
-        ConstEigenVectorMap<TData>(x, n) * static_cast<TData>(*alpha); \
-  }
-#ifdef CAFFE2_USE_EIGEN_FOR_BLAS
-CAFFE2_SPECIALIZED_SCALE(float, float)
-CAFFE2_SPECIALIZED_SCALE(double, double)
-CAFFE2_SPECIALIZED_SCALE(float, double)
-#endif // CAFFE2_USE_EIGEN_FOR_BLAS
-CAFFE2_SPECIALIZED_SCALE(std::int32_t, std::int32_t)
-CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t)
-#undef CAFFE2_SPECIALIZED_SCALE
-
 template <>
 C10_EXPORT void GemmBatched<float, CPUContext>(
     const CBLAS_TRANSPOSE trans_A,
@ -628,563 +564,6 @@ C10_EXPORT void GemmStridedBatched<float, CPUContext>(
|
|||
// Eigen or via custom code.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define CAFFE2_SPECIALIZED_SET(T) \
|
||||
template <> \
|
||||
C10_EXPORT void Set<T, CPUContext>( \
|
||||
const size_t N, const T alpha, T* Y, CPUContext*) { \
|
||||
if (N == 0) { \
|
||||
return; \
|
||||
} \
|
||||
if (alpha == (T)0) { \
|
||||
if (Y != nullptr) { \
|
||||
std::memset(Y, 0, N * sizeof(T)); \
|
||||
} \
|
||||
} else { \
|
||||
EigenVectorMap<T>(Y, N).setConstant(alpha); \
|
||||
} \
|
||||
}
|
||||
|
||||
CAFFE2_SPECIALIZED_SET(float);
|
||||
CAFFE2_SPECIALIZED_SET(double);
|
||||
CAFFE2_SPECIALIZED_SET(int8_t);
|
||||
CAFFE2_SPECIALIZED_SET(int16_t);
|
||||
CAFFE2_SPECIALIZED_SET(int);
|
||||
CAFFE2_SPECIALIZED_SET(int64_t);
|
||||
CAFFE2_SPECIALIZED_SET(bool);
|
||||
CAFFE2_SPECIALIZED_SET(char);
|
||||
CAFFE2_SPECIALIZED_SET(uint8_t);
|
||||
CAFFE2_SPECIALIZED_SET(uint16_t);
|
||||
#undef CAFFE2_SPECIALIZED_SET
|
||||
|
||||
#define CAFFE2_SPECIALIZED_REDUCEMIN(T) \
|
||||
template <> \
|
||||
C10_EXPORT void ReduceMin<T, CPUContext>( \
|
||||
const int N, \
|
||||
const T* x, \
|
||||
T* y, \
|
||||
Tensor* /*scratch_ptr*/, \
|
||||
CPUContext* /*context*/) { \
|
||||
*y = ConstEigenVectorArrayMap<T>(x, N).minCoeff(); \
|
||||
}
|
||||
CAFFE2_SPECIALIZED_REDUCEMIN(float)
|
||||
#undef CAFFE2_SPECIALIZED_REDUCEMIN
|
||||
|
||||
#define CAFFE2_SPECIALIZED_REDUCEMAX(T) \
|
||||
template <> \
|
||||
C10_EXPORT void ReduceMax<T, CPUContext>( \
|
||||
const int N, \
|
||||
const T* x, \
|
||||
T* y, \
|
||||
Tensor* /*scratch_ptr*/, \
|
||||
CPUContext* /*context*/) { \
|
||||
*y = ConstEigenVectorArrayMap<T>(x, N).maxCoeff(); \
|
||||
}
|
||||
CAFFE2_SPECIALIZED_REDUCEMAX(float)
|
||||
CAFFE2_SPECIALIZED_REDUCEMAX(int32_t)
|
||||
CAFFE2_SPECIALIZED_REDUCEMAX(int64_t)
|
||||
|
||||
#undef CAFFE2_SPECIALIZED_REDUCEMAX
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
struct MinFunctor {
|
||||
inline T operator()(const T a, const T b) const {
|
||||
return std::min(a, b);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct MaxFunctor {
|
||||
inline T operator()(const T a, const T b) const {
|
||||
return std::max(a, b);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct L1NormFunctor {
|
||||
inline T operator()(const T a, const T b) const {
|
||||
return a + std::abs(b);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct SquaredL2NormFunctor {
|
||||
inline T operator()(const T a, const T b) const {
|
||||
return a + b * b;
|
||||
}
|
||||
};

#define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenOp)                     \
  template <typename T>                                                     \
  C10_EXPORT void Rowwise##Func(                                            \
      const int rows, const int cols, const T alpha, const T* X, T* Y) {    \
    EigenVectorMap<T>(Y, rows) =                                            \
        ConstEigenMatrixMap<T>(X, cols, rows).colwise().EigenOp() * alpha;  \
  }
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceSum, sum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMean, mean)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>);
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm)
#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
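
The mapping in Rowwise##Func relies on caffe2 buffers being row-major while Eigen maps default to column-major: viewing the rows x cols buffer as a cols x rows Eigen matrix turns each original row into an Eigen column, so .colwise() reduces over the original rows. A standalone sketch of the trick, assuming only Eigen/Core is available:

#include <Eigen/Core>

// A 2x3 row-major buffer mapped as a 3x2 column-major Eigen matrix, so
// colwise().sum() yields the per-row sums of the original data.
int main() {
  const float X[6] = {1, 2, 3, 4, 5, 6};  // rows = 2, cols = 3, row-major
  Eigen::Map<const Eigen::MatrixXf> M(X, 3, 2);
  Eigen::RowVectorXf Y = M.colwise().sum();  // Y = (6, 15)
  return (Y(0) == 6.0f && Y(1) == 15.0f) ? 0 : 1;
}

The Colwise##Func variant below is the mirror image: .rowwise() over the same map reduces over the original columns.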

#define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, EigenOp)                     \
  template <typename T>                                                     \
  C10_EXPORT void Colwise##Func(                                            \
      const int rows, const int cols, const T alpha, const T* X, T* Y) {    \
    EigenVectorMap<T>(Y, cols) =                                            \
        ConstEigenMatrixMap<T>(X, cols, rows).rowwise().EigenOp() * alpha;  \
  }
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceSum, sum)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMean, mean)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>);
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL2, norm)
#undef DELEGATE_COLWISE_REDUCE_FUNCTION

template <typename T>
C10_EXPORT void BothEndsReduceMin(
    const int pre,
    const int mid,
    const int nxt,
    const T alpha,
    const T* X,
    T* Y) {
  EigenVectorArrayMap<T> Y_arr(Y, mid);
  Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().minCoeff();
  const T* X_ptr = X + mid * nxt;
  // There appears to be a bug in Eigen's array min, so this cannot be
  // implemented the same way as BothEndsReduceSum below.
  for (int i = 1; i < pre; ++i) {
    for (int j = 0; j < mid; ++j) {
      Y[j] = std::min(Y[j], ConstEigenVectorArrayMap<T>(X_ptr, nxt).minCoeff());
      X_ptr += nxt;
    }
  }
  if (alpha != T(1)) {
    Y_arr *= alpha;
  }
}

template <typename T>
C10_EXPORT void BothEndsReduceMax(
    const int pre,
    const int mid,
    const int nxt,
    const T alpha,
    const T* X,
    T* Y) {
  EigenVectorArrayMap<T> Y_arr(Y, mid);
  Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().maxCoeff();
  const T* X_ptr = X + mid * nxt;
  for (int i = 1; i < pre; ++i) {
    for (int j = 0; j < mid; ++j) {
      Y[j] = std::max(Y[j], ConstEigenVectorArrayMap<T>(X_ptr, nxt).maxCoeff());
      X_ptr += nxt;
    }
  }
  if (alpha != T(1)) {
    Y_arr *= alpha;
  }
}

template <typename T>
C10_EXPORT void BothEndsReduceSum(
    const int pre,
    const int mid,
    const int nxt,
    const T alpha,
    const T* X,
    T* Y) {
  EigenVectorArrayMap<T> Y_arr(Y, mid);
  Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().sum();
  const int stride = mid * nxt;
  const T* X_ptr = X + stride;
  for (int i = 1; i < pre; ++i) {
    Y_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().sum();
    X_ptr += stride;
  }
  if (alpha != T(1)) {
    Y_arr *= alpha;
  }
}

template <typename T>
C10_EXPORT void BothEndsReduceMean(
    const int pre,
    const int mid,
    const int nxt,
    const T alpha,
    const T* X,
    T* Y) {
  EigenVectorArrayMap<T> Y_arr(Y, mid);
  Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().mean();
  const int stride = mid * nxt;
  const T* X_ptr = X + stride;
  for (int i = 1; i < pre; ++i) {
    Y_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().mean();
    X_ptr += stride;
  }
  if (alpha / static_cast<T>(pre) != 1) {
    Y_arr *= alpha / static_cast<T>(pre);
  }
}

template <typename T>
C10_EXPORT void BothEndsReduceL1(
    const int pre,
    const int mid,
    const int nxt,
    const T alpha,
    const T* X,
    T* Y) {
  EigenVectorArrayMap<T> Y_arr(Y, mid);
  Y_arr = ConstEigenMatrixMap<T>(X, nxt, mid)
              .colwise()
              .template lpNorm<1>()
              .array();
  const int stride = mid * nxt;
  const T* X_ptr = X + stride;
  for (int i = 1; i < pre; ++i) {
    Y_arr += ConstEigenMatrixMap<T>(X_ptr, nxt, mid)
                 .colwise()
                 .template lpNorm<1>()
                 .array();
    X_ptr += stride;
  }
  if (alpha != T(1)) {
    Y_arr *= alpha;
  }
}

template <typename T>
C10_EXPORT void BothEndsReduceL2(
    const int pre,
    const int mid,
    const int nxt,
    const T alpha,
    const T* X,
    T* Y) {
  EigenVectorArrayMap<T> Y_arr(Y, mid);
  Y_arr = ConstEigenMatrixMap<T>(X, nxt, mid).colwise().squaredNorm().array();
  const int stride = mid * nxt;
  const T* X_ptr = X + stride;
  for (int i = 1; i < pre; ++i) {
    Y_arr +=
        ConstEigenMatrixMap<T>(X_ptr, nxt, mid).colwise().squaredNorm().array();
    X_ptr += stride;
  }
  Y_arr = Y_arr.sqrt() * alpha;
}
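
The pre/mid/nxt decomposition above covers reductions that keep only a middle band of dimensions, with X viewed as a pre x mid x nxt volume reduced over its first and last parts. Worked example matching the ReduceSumTest case further down: for X of shape {2, 2, 2} holding 1..8 and reduced over axes {0, 2}, pre = mid = nxt = 2, and BothEndsReduceSum computes Y[0] = 1 + 2 + 5 + 6 = 14 and Y[1] = 3 + 4 + 7 + 8 = 22.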

template <typename T, class Reducer>
C10_EXPORT void ReduceTensor(
    const int ndim,
    const int* X_dims,
    const int* Y_dims,
    const Reducer& reducer,
    const T init,
    const T alpha,
    const T* X,
    T* Y,
    CPUContext* context) {
  const int X_size =
      std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
  const int Y_size =
      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
  Set<T, CPUContext>(Y_size, init, Y, context);
  std::vector<int> index(ndim, 0);
  for (int X_index = 0; X_index < X_size; ++X_index) {
    const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data());
    Y[Y_index] = reducer(Y[Y_index], X[X_index]);
    utils::IncreaseIndexInDims(ndim, X_dims, index.data());
  }
  Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}

} // namespace

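ReduceTensor above is the generic fallback: it walks X once in row-major order and maps each multi-index to its slot in Y, where reduced dimensions have size 1 and therefore contribute nothing to the offset. GetIndexFromDims and IncreaseIndexInDims are caffe2 utilities; the loops below are a hedged standalone sketch of the same bookkeeping, not the library code:

#include <cstddef>
#include <vector>

// Row-major offset of `index` under `dims`; size-1 dims collapse to 0.
int GetIndexSketch(const std::vector<int>& dims, const std::vector<int>& index) {
  int offset = 0;
  for (std::size_t i = 0; i < dims.size(); ++i) {
    offset = offset * dims[i] + (dims[i] == 1 ? 0 : index[i]);
  }
  return offset;
}

// Advance `index` to the next row-major position under `dims` (an odometer).
void IncreaseIndexSketch(const std::vector<int>& dims, std::vector<int>* index) {
  for (int i = static_cast<int>(dims.size()) - 1; i >= 0; --i) {
    if (++(*index)[i] < dims[i]) {
      return;
    }
    (*index)[i] = 0;  // carry into the next-more-significant dimension
  }
}
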
#define DELEGATE_REDUCE_FUNCTION(T, Func, reducer, init, is_norm)              \
  template <>                                                                  \
  C10_EXPORT void Func<T, CPUContext>(                                         \
      const int num_dims,                                                      \
      const int* dims,                                                         \
      const int num_axes,                                                      \
      const int* axes,                                                         \
      const T alpha,                                                           \
      const T* X,                                                              \
      T* Y,                                                                    \
      CPUContext* context) {                                                   \
    CAFFE_ENFORCE_LE(num_axes, num_dims);                                      \
    std::vector<int> Y_dims_vector(dims, dims + num_dims);                     \
    for (int i = 0; i < num_axes; ++i) {                                       \
      Y_dims_vector[axes[i]] = 1;                                              \
    }                                                                          \
    const int* X_dims = dims;                                                  \
    const int* Y_dims = Y_dims_vector.data();                                  \
    const int X_size =                                                         \
        std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
    const int Y_size =                                                         \
        std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
    if (X_size == 0) {                                                         \
      Set<T, CPUContext>(Y_size, alpha * init, Y, context);                    \
      return;                                                                  \
    }                                                                          \
    if (alpha == T(0)) {                                                       \
      Set<T, CPUContext>(Y_size, 0, Y, context);                               \
      return;                                                                  \
    }                                                                          \
    if (std::equal(X_dims, X_dims + num_dims, Y_dims)) {                       \
      if (is_norm) {                                                           \
        Abs<T, CPUContext>(X_size, X, Y, context);                             \
        Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);                 \
      } else {                                                                 \
        Scale<T, T, CPUContext>(Y_size, alpha, X, Y, context);                 \
      }                                                                        \
      return;                                                                  \
    }                                                                          \
    int rows;                                                                  \
    int cols;                                                                  \
    if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {      \
      Rowwise##Func<T>(rows, cols, alpha, X, Y);                               \
      return;                                                                  \
    }                                                                          \
    if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {      \
      Colwise##Func<T>(rows, cols, alpha, X, Y);                               \
      return;                                                                  \
    }                                                                          \
    int pre;                                                                   \
    int mid;                                                                   \
    int nxt;                                                                   \
    if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
      BothEnds##Func<T>(pre, mid, nxt, alpha, X, Y);                           \
      return;                                                                  \
    }                                                                          \
    ReduceTensor(                                                              \
        num_dims, X_dims, Y_dims, reducer, init, alpha, X, Y, context);        \
  }

DELEGATE_REDUCE_FUNCTION(
    float,
    ReduceMin,
    MinFunctor<float>(),
    std::numeric_limits<float>::max(),
    false)
DELEGATE_REDUCE_FUNCTION(
    double,
    ReduceMin,
    MinFunctor<double>(),
    std::numeric_limits<double>::max(),
    false)
DELEGATE_REDUCE_FUNCTION(
    std::int32_t,
    ReduceMin,
    MinFunctor<std::int32_t>(),
    std::numeric_limits<std::int32_t>::max(),
    false)
DELEGATE_REDUCE_FUNCTION(
    std::int64_t,
    ReduceMin,
    MinFunctor<std::int64_t>(),
    std::numeric_limits<std::int64_t>::max(),
    false)

DELEGATE_REDUCE_FUNCTION(
    float,
    ReduceMax,
    MaxFunctor<float>(),
    std::numeric_limits<float>::lowest(),
    false)
DELEGATE_REDUCE_FUNCTION(
    double,
    ReduceMax,
    MaxFunctor<double>(),
    std::numeric_limits<double>::lowest(),
    false)
DELEGATE_REDUCE_FUNCTION(
    std::int32_t,
    ReduceMax,
    MaxFunctor<std::int32_t>(),
    std::numeric_limits<std::int32_t>::lowest(),
    false)
DELEGATE_REDUCE_FUNCTION(
    std::int64_t,
    ReduceMax,
    MaxFunctor<std::int64_t>(),
    std::numeric_limits<std::int64_t>::lowest(),
    false)

DELEGATE_REDUCE_FUNCTION(float, ReduceSum, std::plus<float>(), 0.0f, false)
DELEGATE_REDUCE_FUNCTION(double, ReduceSum, std::plus<double>(), 0.0, false)
DELEGATE_REDUCE_FUNCTION(
    std::int32_t,
    ReduceSum,
    std::plus<std::int32_t>(),
    0,
    false)
DELEGATE_REDUCE_FUNCTION(
    std::int64_t,
    ReduceSum,
    std::plus<std::int64_t>(),
    std::int64_t(0),
    false)

DELEGATE_REDUCE_FUNCTION(float, ReduceL1, L1NormFunctor<float>(), 0.0f, true)
DELEGATE_REDUCE_FUNCTION(double, ReduceL1, L1NormFunctor<double>(), 0.0, true)
DELEGATE_REDUCE_FUNCTION(
    std::int32_t,
    ReduceL1,
    L1NormFunctor<std::int32_t>(),
    0,
    true)
DELEGATE_REDUCE_FUNCTION(
    std::int64_t,
    ReduceL1,
    L1NormFunctor<std::int64_t>(),
    std::int64_t(0),
    true)

#undef DELEGATE_REDUCE_FUNCTION
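
The dispatch order inside DELEGATE_REDUCE_FUNCTION is: trivial cases first (empty input, alpha == 0, reduction over nothing), then the rowwise, colwise, and both-ends fast paths, and only then the generic ReduceTensor walk. A hedged usage sketch of the resulting public entry point, assuming a live CPUContext named cpu_context:

// Sum a 2x3 tensor over axis 1, producing the per-row sums {6, 15}.
const int dims[2] = {2, 3};
const int axes[1] = {1};
const float X[6] = {1, 2, 3, 4, 5, 6};
float Y[2];
math::ReduceSum<float, CPUContext>(2, dims, 1, axes, 1.0f, X, Y, &cpu_context);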

#define CAFFE2_SPECIALIZED_REDUCE_MEAN(T)                                      \
  template <>                                                                  \
  C10_EXPORT void ReduceMean<T, CPUContext>(                                   \
      const int num_dims,                                                      \
      const int* dims,                                                         \
      const int num_axes,                                                      \
      const int* axes,                                                         \
      const T alpha,                                                           \
      const T* X,                                                              \
      T* Y,                                                                    \
      CPUContext* context) {                                                   \
    CAFFE_ENFORCE_LE(num_axes, num_dims);                                      \
    std::vector<int> Y_dims_vector(dims, dims + num_dims);                     \
    for (int i = 0; i < num_axes; ++i) {                                       \
      Y_dims_vector[axes[i]] = 1;                                              \
    }                                                                          \
    const int* X_dims = dims;                                                  \
    const int* Y_dims = Y_dims_vector.data();                                  \
    const int X_size =                                                         \
        std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
    const int Y_size =                                                         \
        std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
    if (X_size == 0) {                                                         \
      Set<T, CPUContext>(Y_size, 0, Y, context);                               \
      return;                                                                  \
    }                                                                          \
    if (alpha == T(0)) {                                                       \
      Set<T, CPUContext>(Y_size, 0, Y, context);                               \
      return;                                                                  \
    }                                                                          \
    if (std::equal(X_dims, X_dims + num_dims, Y_dims)) {                       \
      Scale<T, T, CPUContext>(X_size, alpha, X, Y, context);                   \
      return;                                                                  \
    }                                                                          \
    int rows;                                                                  \
    int cols;                                                                  \
    if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {      \
      RowwiseReduceMean<T>(rows, cols, alpha, X, Y);                           \
      return;                                                                  \
    }                                                                          \
    if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {      \
      ColwiseReduceMean<T>(rows, cols, alpha, X, Y);                           \
      return;                                                                  \
    }                                                                          \
    int pre;                                                                   \
    int mid;                                                                   \
    int nxt;                                                                   \
    if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
      BothEndsReduceMean<T>(pre, mid, nxt, alpha, X, Y);                       \
      return;                                                                  \
    }                                                                          \
    const int scale = X_size / Y_size;                                         \
    ReduceTensor(                                                              \
        num_dims,                                                              \
        X_dims,                                                                \
        Y_dims,                                                                \
        std::plus<T>(),                                                        \
        T(0),                                                                  \
        alpha / static_cast<T>(scale),                                         \
        X,                                                                     \
        Y,                                                                     \
        context);                                                              \
  }
CAFFE2_SPECIALIZED_REDUCE_MEAN(float)
CAFFE2_SPECIALIZED_REDUCE_MEAN(double)
#undef CAFFE2_SPECIALIZED_REDUCE_MEAN
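
ReduceMean is expressed as a scaled sum: scale = X_size / Y_size is the number of input elements collapsed into each output, so passing alpha / scale to ReduceTensor turns the running sum into a mean. For example, reducing a {2, 3} tensor over both axes gives scale = 6 / 1 = 6 and (1 + 2 + 3 + 4 + 5 + 6) / 6 = 3.5, matching the ReduceMeanTest expectation below.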

#define CAFFE2_SPECIALIZED_REDUCE_L2(T)                                        \
  template <>                                                                  \
  C10_EXPORT void ReduceL2<T, CPUContext>(                                     \
      const int num_dims,                                                      \
      const int* dims,                                                         \
      const int num_axes,                                                      \
      const int* axes,                                                         \
      const T alpha,                                                           \
      const T* X,                                                              \
      T* Y,                                                                    \
      CPUContext* context) {                                                   \
    CAFFE_ENFORCE_LE(num_axes, num_dims);                                      \
    std::vector<int> Y_dims_vector(dims, dims + num_dims);                     \
    for (int i = 0; i < num_axes; ++i) {                                       \
      Y_dims_vector[axes[i]] = 1;                                              \
    }                                                                          \
    const int* X_dims = dims;                                                  \
    const int* Y_dims = Y_dims_vector.data();                                  \
    const int X_size =                                                         \
        std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
    const int Y_size =                                                         \
        std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
    if (X_size == 0) {                                                         \
      Set<T, CPUContext>(Y_size, 0, Y, context);                               \
      return;                                                                  \
    }                                                                          \
    if (alpha == T(0)) {                                                       \
      Set<T, CPUContext>(Y_size, 0, Y, context);                               \
      return;                                                                  \
    }                                                                          \
    if (std::equal(X_dims, X_dims + num_dims, Y_dims)) {                       \
      Abs<T, CPUContext>(X_size, X, Y, context);                               \
      Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);                   \
      return;                                                                  \
    }                                                                          \
    int rows;                                                                  \
    int cols;                                                                  \
    if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {      \
      RowwiseReduceL2<T>(rows, cols, alpha, X, Y);                             \
      return;                                                                  \
    }                                                                          \
    if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {      \
      ColwiseReduceL2<T>(rows, cols, alpha, X, Y);                             \
      return;                                                                  \
    }                                                                          \
    int pre;                                                                   \
    int mid;                                                                   \
    int nxt;                                                                   \
    if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
      BothEndsReduceL2<T>(pre, mid, nxt, alpha, X, Y);                         \
      return;                                                                  \
    }                                                                          \
    ReduceTensor(                                                              \
        num_dims,                                                              \
        X_dims,                                                                \
        Y_dims,                                                                \
        SquaredL2NormFunctor<T>(),                                             \
        T(0),                                                                  \
        T(1),                                                                  \
        X,                                                                     \
        Y,                                                                     \
        context);                                                              \
    Sqrt<T, CPUContext>(Y_size, Y, Y, context);                                \
    Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);                     \
  }
CAFFE2_SPECIALIZED_REDUCE_L2(float)
CAFFE2_SPECIALIZED_REDUCE_L2(double)
#undef CAFFE2_SPECIALIZED_REDUCE_L2
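
Unlike the other reductions, the generic L2 path cannot fold alpha into the ReduceTensor call: ReduceTensor applies its alpha to the accumulated sum of squares, and sqrt(alpha * s) != alpha * sqrt(s) in general. The fallback therefore runs with alpha = 1, applies Sqrt element-wise, and only then scales by alpha. For X = {3, 4} and alpha = 2 this gives sqrt(9 + 16) = 5, scaled to 10; folding alpha into the sum would instead give sqrt(2 * 25), roughly 7.07.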

namespace {

template <typename T>

@@ -1482,7 +1482,7 @@ __global__ void SetKernel(const int N, const T alpha, T* Y) {
#define CAFFE2_SPECIALIZED_CUDA_SET(T)                             \
  template <>                                                      \
  CAFFE2_CUDA_API void Set<T, CUDAContext>(                        \
      const size_t N, const T alpha, T* Y, CUDAContext* context) { \
      const int N, const T alpha, T* Y, CUDAContext* context) {    \
    if (N == 0) {                                                  \
      return;                                                      \
    }                                                              \

@@ -1510,7 +1510,7 @@ CAFFE2_SPECIALIZED_CUDA_SET(uint16_t);

template <>
CAFFE2_CUDA_EXPORT void Set<at::Half, CUDAContext>(
    const size_t N,
    const int N,
    const at::Half alpha,
    at::Half* Y,
    CUDAContext* context) {
@@ -3356,27 +3356,19 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDAImpl(

template <typename T, class Reducer>
CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
    const int num_dims,
    const int* dims,
    const int num_axes,
    const int* axes,
    const int ndim,
    const int* X_dims,
    const int* Y_dims,
    const Reducer& reducer,
    const T init,
    const T alpha,
    const T* X,
    T* Y,
    CUDAContext* context) {
  CAFFE_ENFORCE_LE(num_axes, num_dims);
  std::vector<int> Y_dims_vector(dims, dims + num_dims);
  for (int i = 0; i < num_axes; ++i) {
    Y_dims_vector[axes[i]] = 1;
  }
  const int* X_dims = dims;
  const int* Y_dims = Y_dims_vector.data();
  const int X_size =
      std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>());
      std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
  const int Y_size =
      std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>());
      std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
  if (X_size == 0) {
    Set<T, CUDAContext>(Y_size, alpha * init, Y, context);
    return;

@@ -3385,13 +3377,13 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
    Set<T, CUDAContext>(Y_size, T(0), Y, context);
    return;
  }
  if (std::equal(X_dims, X_dims + num_dims, Y_dims)) {
  if (std::equal(X_dims, X_dims + ndim, Y_dims)) {
    Scale<T, T, CUDAContext>(X_size, alpha, X, Y, context);
    return;
  }
  int rows;
  int cols;
  if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {
  if (utils::IsRowwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {
    RowwiseReduceKernel<T>
        <<<std::min(rows, CAFFE_MAXIMUM_NUM_BLOCKS),
           CAFFE_CUDA_NUM_THREADS,

@@ -3399,7 +3391,7 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
           context->cuda_stream()>>>(rows, cols, reducer, init, alpha, X, Y);
    return;
  }
  if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {
  if (utils::IsColwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {
    ColwiseReduceKernel<T>
        <<<std::min(cols, CAFFE_MAXIMUM_NUM_BLOCKS),
           CAFFE_CUDA_NUM_THREADS,

@@ -3407,20 +3399,19 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
           context->cuda_stream()>>>(rows, cols, reducer, init, alpha, X, Y);
    return;
  }
  std::vector<int> transpose_axes(num_dims);
  utils::ComputeTransposeAxesForReduceOp(
      num_dims, num_axes, axes, transpose_axes.data());
  std::vector<int> axes(ndim);
  utils::ComputeTransposeAxesForReduceOp(ndim, Y_dims, axes.data());
  const int outer_size = Y_size;
  const int inner_size = X_size / Y_size;
  DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(
      num_dims,
      ndim,
      ReduceTensorCUDAImpl,
      T,
      Reducer,
      outer_size,
      inner_size,
      dims,
      transpose_axes.data(),
      X_dims,
      axes.data(),
      reducer,
      init,
      alpha,
@@ -3434,19 +3425,17 @@ CAFFE2_CUDA_EXPORT void ReduceTensorCUDA(
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(T)        \
  template <>                                        \
  CAFFE2_CUDA_EXPORT void ReduceMin<T, CUDAContext>( \
      const int num_dims,                            \
      const int* dims,                               \
      const int num_axes,                            \
      const int* axes,                               \
      const int ndim,                                \
      const int* X_dims,                             \
      const int* Y_dims,                             \
      const T alpha,                                 \
      const T* X,                                    \
      T* Y,                                          \
      CUDAContext* context) {                        \
    ReduceTensorCUDA(                                \
        num_dims,                                    \
        dims,                                        \
        num_axes,                                    \
        axes,                                        \
        ndim,                                        \
        X_dims,                                      \
        Y_dims,                                      \
        cub::Min(),                                  \
        std::numeric_limits<T>::max(),               \
        alpha,                                       \
@@ -3463,19 +3452,17 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(double)
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(T)        \
  template <>                                        \
  CAFFE2_CUDA_EXPORT void ReduceMax<T, CUDAContext>( \
      const int num_dims,                            \
      const int* dims,                               \
      const int num_axes,                            \
      const int* axes,                               \
      const int ndim,                                \
      const int* X_dims,                             \
      const int* Y_dims,                             \
      const T alpha,                                 \
      const T* X,                                    \
      T* Y,                                          \
      CUDAContext* context) {                        \
    ReduceTensorCUDA(                                \
        num_dims,                                    \
        dims,                                        \
        num_axes,                                    \
        axes,                                        \
        ndim,                                        \
        X_dims,                                      \
        Y_dims,                                      \
        cub::Max(),                                  \
        std::numeric_limits<T>::lowest(),            \
        alpha,                                       \
@@ -3489,28 +3476,18 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(float)
CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(double)
#undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX

#define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T)        \
  template <>                                        \
  CAFFE2_CUDA_EXPORT void ReduceSum<T, CUDAContext>( \
      const int num_dims,                            \
      const int* dims,                               \
      const int num_axes,                            \
      const int* axes,                               \
      const T alpha,                                 \
      const T* X,                                    \
      T* Y,                                          \
      CUDAContext* context) {                        \
    ReduceTensorCUDA(                                \
        num_dims,                                    \
        dims,                                        \
        num_axes,                                    \
        axes,                                        \
        cub::Sum(),                                  \
        T(0),                                        \
        alpha,                                       \
        X,                                           \
        Y,                                           \
        context);                                    \
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T)                          \
  template <>                                                          \
  CAFFE2_CUDA_EXPORT void ReduceSum<T, CUDAContext>(                   \
      const int ndim,                                                  \
      const int* X_dims,                                               \
      const int* Y_dims,                                               \
      const T alpha,                                                   \
      const T* X,                                                      \
      T* Y,                                                            \
      CUDAContext* context) {                                          \
    ReduceTensorCUDA(                                                  \
        ndim, X_dims, Y_dims, cub::Sum(), T(0), alpha, X, Y, context); \
  }
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int32_t)
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int64_t)
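
This is the signature migration the diff is making: the CUDA reduce entry points move from (num_dims, dims, num_axes, axes) to (ndim, X_dims, Y_dims), so callers now pass the pre-computed output shape instead of the reduced axes. A hedged sketch of the call-site conversion (the helper name is illustrative, not part of the library):

#include <vector>

// Derive the Y_dims argument of the new signature from an old-style axes
// list: every reduced axis collapses to size 1.
std::vector<int> MakeYDims(
    const std::vector<int>& X_dims,
    const std::vector<int>& axes) {
  std::vector<int> Y_dims = X_dims;
  for (const int axis : axes) {
    Y_dims[axis] = 1;
  }
  return Y_dims;
}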

@@ -3521,23 +3498,23 @@ CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(double)
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(T)        \
  template <>                                         \
  CAFFE2_CUDA_EXPORT void ReduceMean<T, CUDAContext>( \
      const int num_dims,                             \
      const int* dims,                                \
      const int num_axes,                             \
      const int* axes,                                \
      const int ndim,                                 \
      const int* X_dims,                              \
      const int* Y_dims,                              \
      const T alpha,                                  \
      const T* X,                                     \
      T* Y,                                           \
      CUDAContext* context) {                         \
    int scale = 1;                                    \
    for (int i = 0; i < num_axes; ++i) {              \
      scale *= dims[axes[i]];                         \
    for (int i = 0; i < ndim; ++i) {                  \
      if (Y_dims[i] == 1) {                           \
        scale *= X_dims[i];                           \
      }                                               \
    }                                                 \
    ReduceTensorCUDA(                                 \
        num_dims,                                     \
        dims,                                         \
        num_axes,                                     \
        axes,                                         \
        ndim,                                         \
        X_dims,                                       \
        Y_dims,                                       \
        cub::Sum(),                                   \
        T(0),                                         \
        alpha / static_cast<T>(scale),                \

@@ -351,288 +351,6 @@ INSTANTIATE_TEST_CASE_P(
    GemmBatchedGPUTest,
    testing::Combine(testing::Bool(), testing::Bool()));
class ReduceTensorGPUTest : public testing::Test {
 protected:
  void SetUp() override {
    if (!HasCudaGPU()) {
      return;
    }
    option_.set_device_type(PROTO_CUDA);
    cuda_context_ = make_unique<CUDAContext>(option_);
    Blob* blob_x = ws_.CreateBlob("X");
    Blob* blob_y = ws_.CreateBlob("Y");
    X_ = BlobGetMutableTensor(blob_x, CUDA);
    Y_ = BlobGetMutableTensor(blob_y, CUDA);
  }

  void SetUpData(
      const std::vector<int>& X_dims,
      const std::vector<int>& axes,
      const std::vector<float>& X_data) {
    std::vector<int> Y_dims = X_dims;
    for (const int axis : axes) {
      Y_dims[axis] = 1;
    }
    X_->Resize(X_dims);
    Y_->Resize(Y_dims);
    ASSERT_EQ(X_data.size(), X_->numel());
    cuda_context_->CopyFromCPU<float>(
        X_data.size(), X_data.data(), X_->mutable_data<float>());
  }

  void VerifyResult(const std::vector<float>& expected_output) {
    Blob* blob_y_host = ws_.CreateBlob("Y_host");
    auto* Y_host = BlobGetMutableTensor(blob_y_host, CPU);
    Y_host->CopyFrom(*Y_);
    ASSERT_EQ(expected_output.size(), Y_host->numel());
    for (std::size_t i = 0; i < expected_output.size(); ++i) {
      EXPECT_FLOAT_EQ(expected_output[i], Y_host->data<float>()[i]);
    }
  }

  template <class ReduceFunc>
  void RunReduceTensorTest(
      const ReduceFunc& reduce_func,
      const std::vector<int>& X_dims,
      const std::vector<int>& axes,
      const std::vector<float>& X_data,
      const std::vector<float>& Y_data) {
    SetUpData(X_dims, axes, X_data);
    reduce_func(
        X_dims.size(),
        X_dims.data(),
        axes.size(),
        axes.data(),
        1.0f,
        X_->data<float>(),
        Y_->mutable_data<float>(),
        cuda_context_.get());
    VerifyResult(Y_data);
  }

  Workspace ws_;
  DeviceOption option_;
  std::unique_ptr<CUDAContext> cuda_context_;
  Tensor* X_ = nullptr;
  Tensor* Y_ = nullptr;
};

TEST_F(ReduceTensorGPUTest, ReduceMinGPUTest) {
  if (!HasCudaGPU()) {
    return;
  }
  const auto& reduce_min = [](const int num_dims,
                              const int* dims,
                              const int num_axes,
                              const int* axes,
                              const float alpha,
                              const float* X,
                              float* Y,
                              CUDAContext* context) {
    return math::ReduceMin<float, CUDAContext>(
        num_dims, dims, num_axes, axes, alpha, X, Y, context);
  };
  // Test for 1D tensor.
  RunReduceTensorTest(reduce_min, {3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f});

  // Test for 2D tensor.
  RunReduceTensorTest(
      reduce_min,
      {2, 3},
      {1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {1.0f, 4.0f});
  RunReduceTensorTest(
      reduce_min,
      {2, 3},
      {0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {1.0f, 2.0f, 3.0f});
  RunReduceTensorTest(
      reduce_min, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f});

  // Test for 3D tensor.
  RunReduceTensorTest(
      reduce_min,
      {2, 2, 2},
      {1, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {1.0f, 5.0f});
  RunReduceTensorTest(
      reduce_min,
      {2, 2, 2},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {1.0f, 2.0f});
  RunReduceTensorTest(
      reduce_min,
      {2, 2, 2},
      {0, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {1.0f, 3.0f});
}

TEST_F(ReduceTensorGPUTest, ReduceMaxGPUTest) {
  if (!HasCudaGPU()) {
    return;
  }
  const auto& reduce_max = [](const int num_dims,
                              const int* dims,
                              const int num_axes,
                              const int* axes,
                              const float alpha,
                              const float* X,
                              float* Y,
                              CUDAContext* context) {
    return math::ReduceMax<float, CUDAContext>(
        num_dims, dims, num_axes, axes, alpha, X, Y, context);
  };
  // Test for 1D tensor.
  RunReduceTensorTest(reduce_max, {3}, {0}, {1.0f, 2.0f, 3.0f}, {3.0f});

  // Test for 2D tensor.
  RunReduceTensorTest(
      reduce_max,
      {2, 3},
      {1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {3.0f, 6.0f});
  RunReduceTensorTest(
      reduce_max,
      {2, 3},
      {0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {4.0f, 5.0f, 6.0f});
  RunReduceTensorTest(
      reduce_max, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {6.0f});

  // Test for 3D tensor.
  RunReduceTensorTest(
      reduce_max,
      {2, 2, 2},
      {1, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {4.0f, 8.0f});
  RunReduceTensorTest(
      reduce_max,
      {2, 2, 2},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {7.0f, 8.0f});
  RunReduceTensorTest(
      reduce_max,
      {2, 2, 2},
      {0, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {6.0f, 8.0f});
}

TEST_F(ReduceTensorGPUTest, ReduceSumGPUTest) {
  if (!HasCudaGPU()) {
    return;
  }
  // Test for 1D tensor.
  RunReduceTensorTest(
      math::ReduceSum<float, CUDAContext>,
      {3},
      {0},
      {1.0f, 2.0f, 3.0f},
      {6.0f});

  // Test for 2D tensor.
  RunReduceTensorTest(
      math::ReduceSum<float, CUDAContext>,
      {2, 3},
      {1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {6.0f, 15.0f});
  RunReduceTensorTest(
      math::ReduceSum<float, CUDAContext>,
      {2, 3},
      {0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {5.0f, 7.0f, 9.0f});
  RunReduceTensorTest(
      math::ReduceSum<float, CUDAContext>,
      {2, 3},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {21.0f});

  // Test for 3D tensor.
  RunReduceTensorTest(
      math::ReduceSum<float, CUDAContext>,
      {2, 2, 2},
      {1, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {10.0f, 26.0f});
  RunReduceTensorTest(
      math::ReduceSum<float, CUDAContext>,
      {2, 2, 2},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {16.0f, 20.0f});
  RunReduceTensorTest(
      math::ReduceSum<float, CUDAContext>,
      {2, 2, 2},
      {0, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {14.0f, 22.0f});
}

TEST_F(ReduceTensorGPUTest, ReduceMeanGPUTest) {
  if (!HasCudaGPU()) {
    return;
  }
  // Test for 1D tensor.
  RunReduceTensorTest(
      math::ReduceMean<float, CUDAContext>,
      {3},
      {0},
      {1.0f, 2.0f, 3.0f},
      {2.0f});

  // Test for 2D tensor.
  RunReduceTensorTest(
      math::ReduceMean<float, CUDAContext>,
      {2, 3},
      {1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {2.0f, 5.0f});
  RunReduceTensorTest(
      math::ReduceMean<float, CUDAContext>,
      {2, 3},
      {0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {2.5f, 3.5f, 4.5f});
  RunReduceTensorTest(
      math::ReduceMean<float, CUDAContext>,
      {2, 3},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {3.5f});

  // Test for 3D tensor.
  RunReduceTensorTest(
      math::ReduceMean<float, CUDAContext>,
      {2, 2, 2},
      {1, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {2.5f, 6.5f});
  RunReduceTensorTest(
      math::ReduceMean<float, CUDAContext>,
      {2, 2, 2},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {4.0f, 5.0f});
  RunReduceTensorTest(
      math::ReduceMean<float, CUDAContext>,
      {2, 2, 2},
      {0, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {3.5f, 5.5f});
}

class BroadcastGPUTest : public testing::Test {
 protected:
  void SetUp() override {

@@ -426,253 +426,6 @@ TEST(MathTest, FloatToHalfConversion) {

namespace {

class ReduceTensorTest : public testing::Test {
 protected:
  void SetUp() override {
    cpu_context_ = make_unique<CPUContext>(option_);
  }

  template <class ReduceFunc>
  void RunReduceTensorTest(
      const ReduceFunc& reduce_func,
      const std::vector<int>& X_dims,
      const std::vector<int>& axes,
      const std::vector<float>& X_data,
      const std::vector<float>& Y_data) {
    std::vector<int> Y_dims = X_dims;
    for (const int axis : axes) {
      Y_dims[axis] = 1;
    }
    std::vector<int64_t> X_dims_64;
    std::vector<int64_t> Y_dims_64;
    std::copy(X_dims.cbegin(), X_dims.cend(), std::back_inserter(X_dims_64));
    std::copy(Y_dims.cbegin(), Y_dims.cend(), std::back_inserter(Y_dims_64));
    ReinitializeTensor(&X_, X_dims_64, at::dtype<float>().device(CPU));
    ReinitializeTensor(&Y_, Y_dims_64, at::dtype<float>().device(CPU));
    ASSERT_EQ(X_data.size(), X_.numel());
    cpu_context_->CopyFromCPU<float>(
        X_data.size(), X_data.data(), X_.mutable_data<float>());
    reduce_func(
        X_dims.size(),
        X_dims.data(),
        axes.size(),
        axes.data(),
        1.0f,
        X_.data<float>(),
        Y_.mutable_data<float>(),
        cpu_context_.get());
    ASSERT_EQ(Y_data.size(), Y_.numel());
    for (int i = 0; i < Y_.numel(); ++i) {
      EXPECT_FLOAT_EQ(Y_data[i], Y_.data<float>()[i]);
    }
  }

  DeviceOption option_;
  std::unique_ptr<CPUContext> cpu_context_;
  Tensor X_;
  Tensor Y_;
};

TEST_F(ReduceTensorTest, ReduceMinTest) {
  const auto& reduce_min = [](const int num_dims,
                              const int* dims,
                              const int num_axes,
                              const int* axes,
                              const float alpha,
                              const float* X,
                              float* Y,
                              CPUContext* context) {
    return math::ReduceMin<float, CPUContext>(
        num_dims, dims, num_axes, axes, alpha, X, Y, context);
  };
  // Test for 1D tensor.
  RunReduceTensorTest(reduce_min, {3}, {0}, {1.0f, 2.0f, 3.0f}, {1.0f});

  // Test for 2D tensor.
  RunReduceTensorTest(
      reduce_min,
      {2, 3},
      {1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {1.0f, 4.0f});
  RunReduceTensorTest(
      reduce_min,
      {2, 3},
      {0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {1.0f, 2.0f, 3.0f});
  RunReduceTensorTest(
      reduce_min, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {1.0f});

  // Test for 3D tensor.
  RunReduceTensorTest(
      reduce_min,
      {2, 2, 2},
      {1, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {1.0f, 5.0f});
  RunReduceTensorTest(
      reduce_min,
      {2, 2, 2},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {1.0f, 2.0f});
  RunReduceTensorTest(
      reduce_min,
      {2, 2, 2},
      {0, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {1.0f, 3.0f});
}

TEST_F(ReduceTensorTest, ReduceMaxTest) {
  const auto& reduce_max = [](const int num_dims,
                              const int* dims,
                              const int num_axes,
                              const int* axes,
                              const float alpha,
                              const float* X,
                              float* Y,
                              CPUContext* context) {
    return math::ReduceMax<float, CPUContext>(
        num_dims, dims, num_axes, axes, alpha, X, Y, context);
  };
  // Test for 1D tensor.
  RunReduceTensorTest(reduce_max, {3}, {0}, {1.0f, 2.0f, 3.0f}, {3.0f});

  // Test for 2D tensor.
  RunReduceTensorTest(
      reduce_max,
      {2, 3},
      {1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {3.0f, 6.0f});
  RunReduceTensorTest(
      reduce_max,
      {2, 3},
      {0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {4.0f, 5.0f, 6.0f});
  RunReduceTensorTest(
      reduce_max, {2, 3}, {0, 1}, {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {6.0f});

  // Test for 3D tensor.
  RunReduceTensorTest(
      reduce_max,
      {2, 2, 2},
      {1, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {4.0f, 8.0f});
  RunReduceTensorTest(
      reduce_max,
      {2, 2, 2},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {7.0f, 8.0f});
  RunReduceTensorTest(
      reduce_max,
      {2, 2, 2},
      {0, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {6.0f, 8.0f});
}

TEST_F(ReduceTensorTest, ReduceSumTest) {
  // Test for 1D tensor.
  RunReduceTensorTest(
      math::ReduceSum<float, CPUContext>, {3}, {0}, {1.0f, 2.0f, 3.0f}, {6.0f});

  // Test for 2D tensor.
  RunReduceTensorTest(
      math::ReduceSum<float, CPUContext>,
      {2, 3},
      {1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {6.0f, 15.0f});
  RunReduceTensorTest(
      math::ReduceSum<float, CPUContext>,
      {2, 3},
      {0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {5.0f, 7.0f, 9.0f});
  RunReduceTensorTest(
      math::ReduceSum<float, CPUContext>,
      {2, 3},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {21.0f});

  // Test for 3D tensor.
  RunReduceTensorTest(
      math::ReduceSum<float, CPUContext>,
      {2, 2, 2},
      {1, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {10.0f, 26.0f});
  RunReduceTensorTest(
      math::ReduceSum<float, CPUContext>,
      {2, 2, 2},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {16.0f, 20.0f});
  RunReduceTensorTest(
      math::ReduceSum<float, CPUContext>,
      {2, 2, 2},
      {0, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {14.0f, 22.0f});
}

TEST_F(ReduceTensorTest, ReduceMeanTest) {
  // Test for 1D tensor.
  RunReduceTensorTest(
      math::ReduceMean<float, CPUContext>,
      {3},
      {0},
      {1.0f, 2.0f, 3.0f},
      {2.0f});

  // Test for 2D tensor.
  RunReduceTensorTest(
      math::ReduceMean<float, CPUContext>,
      {2, 3},
      {1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {2.0f, 5.0f});
  RunReduceTensorTest(
      math::ReduceMean<float, CPUContext>,
      {2, 3},
      {0},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {2.5f, 3.5f, 4.5f});
  RunReduceTensorTest(
      math::ReduceMean<float, CPUContext>,
      {2, 3},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},
      {3.5f});

  // Test for 3D tensor.
  RunReduceTensorTest(
      math::ReduceMean<float, CPUContext>,
      {2, 2, 2},
      {1, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {2.5f, 6.5f});
  RunReduceTensorTest(
      math::ReduceMean<float, CPUContext>,
      {2, 2, 2},
      {0, 1},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {4.0f, 5.0f});
  RunReduceTensorTest(
      math::ReduceMean<float, CPUContext>,
      {2, 2, 2},
      {0, 2},
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f},
      {3.5f, 5.5f});
}

class BroadcastTest : public testing::Test {
 protected:
  void SetUp() override {