mirror of
https://github.com/zebrajr/pytorch.git
synced 2025-12-07 12:21:27 +01:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/34625 These templated function calls are not specifying the template args correctly. The first arg is the index type, not the array data type. That means, right now it's using `T` as the index type as well, which will break if we do a template specialization for uint8_t. If we omit both, it will correctly infer that the index type is `int` and the data type is `T`. Reviewed By: BIT-silence Differential Revision: D20358728 fbshipit-source-id: 8cbd8eeb14bce602c02eb6fce2cc141f0121fa24
268 lines
9.7 KiB
C++
268 lines
9.7 KiB
C++
#include "caffe2/utils/math/transpose.h"
|
|
|
|
#include <algorithm>
|
|
#include <functional>
|
|
#include <limits>
|
|
#include <numeric>
|
|
|
|
#ifdef CAFFE2_USE_MKL
|
|
#include <mkl.h>
|
|
#endif // CAFFE2_USE_MKL
|
|
|
|
#ifdef CAFFE2_USE_HPTT
|
|
#include <hptt.h>
|
|
#endif // CAFFE2_USE_HPTT
|
|
|
|
#include "caffe2/core/context.h"
|
|
#include "caffe2/utils/eigen_utils.h"
|
|
#include "caffe2/utils/math/utils.h"
|
|
|
|
namespace caffe2 {
|
|
namespace math {
|
|
|
|
namespace {
|
|
|
|
template <typename TIndex, typename TData>
|
|
void Transpose2D(
|
|
const TIndex rows,
|
|
const TIndex cols,
|
|
const TData* X,
|
|
TData* Y) {
|
|
EigenMatrixMap<TData>(Y, rows, cols) =
|
|
ConstEigenMatrixMap<TData>(X, cols, rows).transpose();
|
|
}
|
|
|
|
#ifdef CAFFE2_USE_MKL
|
|
|
|
#define DELEGATE_TRANSPOSE_2D(TIndex, TData, MKLFunc) \
|
|
template <> \
|
|
void Transpose2D<TIndex, TData>( \
|
|
const TIndex rows, const TIndex cols, const TData* X, TData* Y) { \
|
|
MKLFunc('R', 'T', rows, cols, TData(1), X, cols, Y, rows); \
|
|
}
|
|
DELEGATE_TRANSPOSE_2D(std::int32_t, float, mkl_somatcopy);
|
|
DELEGATE_TRANSPOSE_2D(std::int64_t, float, mkl_somatcopy);
|
|
DELEGATE_TRANSPOSE_2D(std::int32_t, double, mkl_domatcopy);
|
|
DELEGATE_TRANSPOSE_2D(std::int64_t, double, mkl_domatcopy);
|
|
#undef DELEGATE_TRANSPOSE_2D
|
|
|
|
#endif // CAFFE2_USE_MKL
|
|
|
|
#ifdef CAFFE2_USE_HPTT
|
|
|
|
template <typename TIndex, typename TData>
|
|
bool TransposeByHPTT(
|
|
const int ndim,
|
|
const TIndex* dims,
|
|
const int* axes,
|
|
const TData* X,
|
|
TData* Y) {
|
|
for (int i = 0; i < ndim; ++i) {
|
|
if (dims[i] <= 0 || dims[i] > std::numeric_limits<int>::max()) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
std::vector<int> axes_cm(ndim);
|
|
std::vector<int> dims_cm(ndim);
|
|
// Convert row-major index to column-major.
|
|
const auto cm_fn = [ndim](const int i) { return ndim - i - 1; };
|
|
for (int i = 0; i < ndim; ++i) {
|
|
axes_cm[i] = cm_fn(axes[cm_fn(i)]);
|
|
dims_cm[i] = dims[cm_fn(i)];
|
|
}
|
|
auto plan = hptt::create_plan(
|
|
axes_cm.data(),
|
|
ndim,
|
|
TData(1),
|
|
X,
|
|
dims_cm.data(),
|
|
nullptr,
|
|
TData(0),
|
|
Y,
|
|
nullptr,
|
|
hptt::ESTIMATE,
|
|
1 /* num_threads */);
|
|
if (plan == nullptr) {
|
|
return false;
|
|
}
|
|
plan->execute();
|
|
return true;
|
|
}
|
|
|
|
#endif // CAFFE2_USE_HPTT
|
|
|
|
template <typename TIndex, typename TData>
|
|
void TransposeND(
|
|
const int ndim,
|
|
const TIndex* dims,
|
|
const int* axes,
|
|
const TData* X,
|
|
TData* Y) {
|
|
std::vector<TIndex> Y_dims(ndim);
|
|
for (int i = 0; i < ndim; ++i) {
|
|
Y_dims[i] = dims[axes[i]];
|
|
}
|
|
// Measure amount of contiguous data we can copy at once
|
|
int pivot = ndim - 1;
|
|
TIndex block_size = 1;
|
|
for (; pivot >= 0 && axes[pivot] == pivot; --pivot) {
|
|
block_size *= Y_dims[pivot];
|
|
}
|
|
++pivot;
|
|
const TIndex num_blocks = std::accumulate(
|
|
Y_dims.cbegin(),
|
|
Y_dims.cbegin() + pivot,
|
|
TIndex(1),
|
|
std::multiplies<TIndex>());
|
|
std::vector<TIndex> X_strides(pivot);
|
|
utils::ComputeTransposedStrides<TIndex>(pivot, dims, axes, X_strides.data());
|
|
std::vector<TIndex> index(pivot, 0);
|
|
for (TIndex Y_index = 0; Y_index < num_blocks; ++Y_index) {
|
|
const TIndex X_index = std::inner_product(
|
|
X_strides.cbegin(), X_strides.cend(), index.cbegin(), TIndex(0));
|
|
if (block_size == 1) {
|
|
Y[Y_index] = X[X_index];
|
|
} else {
|
|
std::memcpy(
|
|
Y + block_size * Y_index,
|
|
X + block_size * X_index,
|
|
block_size * sizeof(TData));
|
|
}
|
|
utils::IncreaseIndexInDims<TIndex>(pivot, Y_dims.data(), index.data());
|
|
}
|
|
}
|
|
|
|
template <typename TIndex, typename TData>
|
|
void TransposeImpl(
|
|
const int ndim,
|
|
const TIndex* dims,
|
|
const int* axes,
|
|
const TData* X,
|
|
TData* Y) {
|
|
const TIndex size =
|
|
std::accumulate(dims, dims + ndim, TIndex(1), std::multiplies<TIndex>());
|
|
if (size == 0) {
|
|
return;
|
|
}
|
|
if (utils::IsIdentityPermutation(ndim, axes)) {
|
|
std::memcpy(Y, X, size * sizeof(TData));
|
|
return;
|
|
}
|
|
if (utils::IsBatchTranspose2D(ndim, axes)) {
|
|
const TIndex H = dims[ndim - 2];
|
|
const TIndex W = dims[ndim - 1];
|
|
const TIndex N = size / (H * W);
|
|
for (TIndex i = 0; i < N; ++i) {
|
|
Transpose2D<TIndex, TData>(H, W, X + i * H * W, Y + i * H * W);
|
|
}
|
|
return;
|
|
}
|
|
TransposeND<TIndex, TData>(ndim, dims, axes, X, Y);
|
|
}
|
|
|
|
#ifdef CAFFE2_USE_HPTT
|
|
|
|
#define CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(TIndex, TData) \
|
|
template <> \
|
|
void TransposeImpl<TIndex, TData>( \
|
|
const int ndim, \
|
|
const TIndex* dims, \
|
|
const int* axes, \
|
|
const TData* X, \
|
|
TData* T) { \
|
|
const TIndex size = std::accumulate( \
|
|
dims, dims + ndim, TIndex(1), std::multiplies<TIndex>()); \
|
|
if (size == 0) { \
|
|
return; \
|
|
} \
|
|
if (utils::IsIdentityPermutation(ndim, axes)) { \
|
|
std::memcpy(Y, X, size * sizeof(TData)); \
|
|
return; \
|
|
} \
|
|
if (TransposeByHPTT(ndim, dims, axes, X, Y)) { \
|
|
return; \
|
|
} \
|
|
if (utils::IsBatchTranspose2D(ndim, axes)) { \
|
|
const TIndex H = dims[ndim - 2]; \
|
|
const TIndex W = dims[ndim - 1]; \
|
|
const TIndex N = size / (H * W); \
|
|
for (TIndex i = 0; i < N; ++i) { \
|
|
Transpose2D<TIndex, TData>(H, W, X + i * H * W, Y + i * H * W); \
|
|
} \
|
|
return; \
|
|
} \
|
|
TransposeND<TIndex, TData>(ndim, dims, axes, X, Y); \
|
|
}
|
|
CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t, float)
|
|
CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int64_t, float)
|
|
CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t, double)
|
|
CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int64_t, double)
|
|
#undef CAFFE2_SPECIALIZED_TRANSPOSE_IMPL
|
|
|
|
#endif // CAFFE2_USE_HPTT
|
|
|
|
} // namespace
|
|
|
|
#define CAFFE2_SPECIALIZED_TRANSPOSE(TIndex, TData) \
|
|
template <> \
|
|
C10_EXPORT void Transpose<TIndex, TData, CPUContext>( \
|
|
const int ndim, \
|
|
const TIndex* dims, \
|
|
const int* axes, \
|
|
const TData* X, \
|
|
TData* Y, \
|
|
CPUContext* /* context */) { \
|
|
TransposeImpl<TIndex, TData>(ndim, dims, axes, X, Y); \
|
|
}
|
|
CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, float)
|
|
CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, float)
|
|
CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, double)
|
|
CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, double)
|
|
CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::int32_t)
|
|
CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::int32_t)
|
|
CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::int64_t)
|
|
CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::int64_t)
|
|
CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::uint8_t)
|
|
CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::uint8_t)
|
|
CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::uint16_t)
|
|
CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::uint16_t)
|
|
#undef CAFFE2_SPECIALIZED_TRANSPOSE
|
|
|
|
#define CAFFE2_SPECIALIZED_NCHW2NHWC(T) \
|
|
template <> \
|
|
C10_EXPORT void NCHW2NHWC<T, CPUContext>( \
|
|
const int N, \
|
|
const int C, \
|
|
const int HxW, \
|
|
const T* X, \
|
|
T* Y, \
|
|
CPUContext* /* context */) { \
|
|
const int stride = C * HxW; \
|
|
for (int i = 0; i < N; ++i) { \
|
|
Transpose2D(C, HxW, X + i * stride, Y + i * stride); \
|
|
} \
|
|
}
|
|
CAFFE2_SPECIALIZED_NCHW2NHWC(float)
|
|
#undef CAFFE2_SPECIALIZED_NCHW2NHWC
|
|
|
|
#define CAFFE2_SPECIALIZED_NHWC2NCHW(T) \
|
|
template <> \
|
|
C10_EXPORT void NHWC2NCHW<T, CPUContext>( \
|
|
const int N, \
|
|
const int C, \
|
|
const int HxW, \
|
|
const T* X, \
|
|
T* Y, \
|
|
CPUContext* /* context */) { \
|
|
const int stride = HxW * C; \
|
|
for (int i = 0; i < N; ++i) { \
|
|
Transpose2D(HxW, C, X + i * stride, Y + i * stride); \
|
|
} \
|
|
}
|
|
CAFFE2_SPECIALIZED_NHWC2NCHW(float)
|
|
#undef CAFFE2_SPECIALIZED_NHWC2NCHW
|
|
|
|
} // namespace math
|
|
} // namespace caffe2
|