// pytorch/caffe2/utils/math/transpose.cc

#include "caffe2/utils/math/transpose.h"

#include <algorithm>
#include <functional>
#include <limits>
#include <numeric>

#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL

#ifdef CAFFE2_USE_HPTT
#include <hptt.h>
#endif // CAFFE2_USE_HPTT

#include "caffe2/core/context.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math/utils.h"

namespace caffe2 {
namespace math {

namespace {

template <typename TIndex, typename TData>
void Transpose2D(
    const TIndex rows,
    const TIndex cols,
    const TData* X,
    TData* Y) {
  EigenMatrixMap<TData>(Y, rows, cols) =
      ConstEigenMatrixMap<TData>(X, cols, rows).transpose();
}

#ifdef CAFFE2_USE_MKL

// Specializes Transpose2D for float/double so that the work is delegated to
// MKL's out-of-place matrix copy routines. The arguments request row-major
// ordering ('R') and a transpose ('T') with scale factor 1; X is given with
// leading dimension `cols` and Y with leading dimension `rows`.
//
// The macro expands to a complete function definition, so the invocations
// below deliberately carry no trailing ';' (which would be a stray empty
// declaration), matching the other specialization macros in this file.
#define DELEGATE_TRANSPOSE_2D(TIndex, TData, MKLFunc)                   \
  template <>                                                           \
  void Transpose2D<TIndex, TData>(                                      \
      const TIndex rows, const TIndex cols, const TData* X, TData* Y) { \
    MKLFunc('R', 'T', rows, cols, TData(1), X, cols, Y, rows);          \
  }
DELEGATE_TRANSPOSE_2D(std::int32_t, float, mkl_somatcopy)
DELEGATE_TRANSPOSE_2D(std::int64_t, float, mkl_somatcopy)
DELEGATE_TRANSPOSE_2D(std::int32_t, double, mkl_domatcopy)
DELEGATE_TRANSPOSE_2D(std::int64_t, double, mkl_domatcopy)
#undef DELEGATE_TRANSPOSE_2D

#endif // CAFFE2_USE_MKL

#ifdef CAFFE2_USE_HPTT

// Attempts to perform the N-d transpose with the HPTT library.
//
// Returns false without touching Y when any extent in `dims` is non-positive
// or larger than the maximum `int`, or when HPTT does not return a plan, so
// the caller can fall back to another implementation. Returns true after the
// plan has been executed.
template <typename TIndex, typename TData>
bool TransposeByHPTT(
    const int ndim,
    const TIndex* dims,
    const int* axes,
    const TData* X,
    TData* Y) {
  // The extents are passed to HPTT as `int`, so every one of them has to be
  // positive and representable in that type.
  const bool extents_ok =
      std::all_of(dims, dims + ndim, [](const TIndex extent) {
        return extent > 0 && extent <= std::numeric_limits<int>::max();
      });
  if (!extents_ok) {
    return false;
  }

  // Convert the row-major description to column-major: position i in one
  // ordering is position (ndim - 1 - i) in the other, which applies both to
  // where an entry is stored and to the axis number it refers to.
  std::vector<int> perm_cm(ndim);
  std::vector<int> extents_cm(ndim);
  for (int cm = 0; cm < ndim; ++cm) {
    const int rm = ndim - 1 - cm;
    perm_cm[cm] = ndim - 1 - axes[rm];
    extents_cm[cm] = dims[rm];
  }

  auto plan = hptt::create_plan(
      perm_cm.data(),
      ndim,
      TData(1),
      X,
      extents_cm.data(),
      nullptr,
      TData(0),
      Y,
      nullptr,
      hptt::ESTIMATE,
      1 /* num_threads */);
  if (plan == nullptr) {
    return false;
  }
  plan->execute();
  return true;
}

#endif // CAFFE2_USE_HPTT

// Generic N-d transpose: output axis d of Y corresponds to input axis axes[d]
// of X, where X has shape `dims`.
//
// Trailing axes that the permutation leaves in place are copied as whole
// chunks; only the leading, actually permuted axes are iterated one index at
// a time.
template <typename TIndex, typename TData>
void TransposeND(
    const int ndim,
    const TIndex* dims,
    const int* axes,
    const TData* X,
    TData* Y) {
  // Shape of the output tensor.
  std::vector<TIndex> out_dims(ndim);
  for (int d = 0; d < ndim; ++d) {
    out_dims[d] = dims[axes[d]];
  }

  // Measure the amount of contiguous data that can be copied at once:
  // `chunk` is the product of the extents of the trailing axes with
  // axes[d] == d, and `outer` is the number of leading axes that remain.
  int outer = ndim;
  TIndex chunk = 1;
  while (outer > 0 && axes[outer - 1] == outer - 1) {
    --outer;
    chunk *= out_dims[outer];
  }

  // One chunk is copied per combination of the leading output indices.
  TIndex num_chunks = 1;
  for (int d = 0; d < outer; ++d) {
    num_chunks *= out_dims[d];
  }

  // NOTE(review): ComputeTransposedStrides presumably yields, for each leading
  // output axis, the stride of the matching input axis measured in chunks --
  // confirm against caffe2/utils/math/utils.h.
  std::vector<TIndex> src_strides(outer);
  utils::ComputeTransposedStrides<TIndex>(
      outer, dims, axes, src_strides.data());

  // `coord` is the multi-index of the current output chunk over the leading
  // axes; it is advanced once per iteration.
  std::vector<TIndex> coord(outer, 0);
  for (TIndex dst_chunk = 0; dst_chunk < num_chunks; ++dst_chunk) {
    const TIndex src_chunk = std::inner_product(
        src_strides.cbegin(), src_strides.cend(), coord.cbegin(), TIndex(0));
    if (chunk != 1) {
      std::memcpy(
          Y + chunk * dst_chunk,
          X + chunk * src_chunk,
          chunk * sizeof(TData));
    } else {
      // Single-element chunks are assigned directly instead of via memcpy.
      Y[dst_chunk] = X[src_chunk];
    }
    utils::IncreaseIndexInDims<TIndex>(outer, out_dims.data(), coord.data());
  }
}

// Dispatches a transpose of X (shape `dims`, permutation `axes`) into Y:
//   - empty tensors are left untouched,
//   - an identity permutation is a plain copy,
//   - a permutation that only swaps the last two axes is handled as a batch
//     of 2D transposes,
//   - everything else goes through TransposeND.
template <typename TIndex, typename TData>
void TransposeImpl(
    const int ndim,
    const TIndex* dims,
    const int* axes,
    const TData* X,
    TData* Y) {
  const TIndex numel =
      std::accumulate(dims, dims + ndim, TIndex(1), std::multiplies<TIndex>());
  if (numel == 0) {
    return;
  }

  if (utils::IsIdentityPermutation(ndim, axes)) {
    std::memcpy(Y, X, numel * sizeof(TData));
  } else if (utils::IsBatchTranspose2D(ndim, axes)) {
    const TIndex rows = dims[ndim - 2];
    const TIndex cols = dims[ndim - 1];
    const TIndex plane = rows * cols;
    // numel != 0 here, so plane is non-zero and the division is safe.
    const TIndex batch = numel / plane;
    for (TIndex n = 0; n < batch; ++n) {
      const TIndex offset = n * plane;
      Transpose2D<TIndex, TData>(rows, cols, X + offset, Y + offset);
    }
  } else {
    TransposeND<TIndex, TData>(ndim, dims, axes, X, Y);
  }
}

#ifdef CAFFE2_USE_HPTT

// Specializes TransposeImpl for float/double when HPTT is available. The
// dispatch order is:
//   1. empty tensor         -> nothing to do,
//   2. identity permutation -> plain memcpy,
//   3. HPTT                 -> used whenever TransposeByHPTT succeeds,
//   4. batched 2D transpose -> fallback when only the last two axes swap,
//   5. TransposeND          -> generic fallback.
//
// The output pointer parameter must be named Y: the body refers to it by that
// name in every branch.
#define CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(TIndex, TData)                \
  template <>                                                           \
  void TransposeImpl<TIndex, TData>(                                    \
      const int ndim,                                                   \
      const TIndex* dims,                                               \
      const int* axes,                                                  \
      const TData* X,                                                   \
      TData* Y) {                                                       \
    const TIndex size = std::accumulate(                                \
        dims, dims + ndim, TIndex(1), std::multiplies<TIndex>());       \
    if (size == 0) {                                                    \
      return;                                                           \
    }                                                                   \
    if (utils::IsIdentityPermutation(ndim, axes)) {                     \
      std::memcpy(Y, X, size * sizeof(TData));                          \
      return;                                                           \
    }                                                                   \
    if (TransposeByHPTT(ndim, dims, axes, X, Y)) {                      \
      return;                                                           \
    }                                                                   \
    if (utils::IsBatchTranspose2D(ndim, axes)) {                        \
      const TIndex H = dims[ndim - 2];                                  \
      const TIndex W = dims[ndim - 1];                                  \
      const TIndex N = size / (H * W);                                  \
      for (TIndex i = 0; i < N; ++i) {                                  \
        Transpose2D<TIndex, TData>(H, W, X + i * H * W, Y + i * H * W); \
      }                                                                 \
      return;                                                           \
    }                                                                   \
    TransposeND<TIndex, TData>(ndim, dims, axes, X, Y);                 \
  }
CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t, float)
CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int64_t, float)
CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t, double)
CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int64_t, double)
#undef CAFFE2_SPECIALIZED_TRANSPOSE_IMPL

#endif // CAFFE2_USE_HPTT

} // namespace

// Defines the exported CPUContext specializations of math::Transpose for each
// (index type, data type) pair listed below. Every specialization simply
// forwards to the file-local TransposeImpl; the CPUContext argument is not
// used.
#define CAFFE2_SPECIALIZED_TRANSPOSE(TIndex, TData)       \
  template <>                                             \
  C10_EXPORT void Transpose<TIndex, TData, CPUContext>(   \
      const int ndim,                                     \
      const TIndex* dims,                                 \
      const int* axes,                                    \
      const TData* X,                                     \
      TData* Y,                                           \
      CPUContext* /* context */) {                        \
    TransposeImpl<TIndex, TData>(ndim, dims, axes, X, Y); \
  }
CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, float)
CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, float)
CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, double)
CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, double)
CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::int32_t)
CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::int32_t)
CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::int64_t)
CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::int64_t)
CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::uint8_t)
CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::uint8_t)
CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::uint16_t)
CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::uint16_t)
#undef CAFFE2_SPECIALIZED_TRANSPOSE

// Defines the exported CPUContext specialization of math::NCHW2NHWC. For each
// of the N batch entries, the C x HxW matrix at X is transposed into Y with
// Transpose2D; the CPUContext argument is not used.
//
// The per-entry element offset is computed in 64-bit arithmetic: with `int`
// the product i * C * HxW overflows (undefined behaviour) as soon as the
// whole tensor holds more than INT_MAX elements.
#define CAFFE2_SPECIALIZED_NCHW2NHWC(T)                    \
  template <>                                              \
  C10_EXPORT void NCHW2NHWC<T, CPUContext>(                \
      const int N,                                         \
      const int C,                                         \
      const int HxW,                                       \
      const T* X,                                          \
      T* Y,                                                \
      CPUContext* /* context */) {                         \
    const std::int64_t stride =                            \
        static_cast<std::int64_t>(C) * HxW;                \
    for (int i = 0; i < N; ++i) {                          \
      Transpose2D(C, HxW, X + i * stride, Y + i * stride); \
    }                                                      \
  }
CAFFE2_SPECIALIZED_NCHW2NHWC(float)
#undef CAFFE2_SPECIALIZED_NCHW2NHWC

// Defines the exported CPUContext specialization of math::NHWC2NCHW. For each
// of the N batch entries, the HxW x C matrix at X is transposed into Y with
// Transpose2D; the CPUContext argument is not used.
//
// The per-entry element offset is computed in 64-bit arithmetic: with `int`
// the product i * HxW * C overflows (undefined behaviour) as soon as the
// whole tensor holds more than INT_MAX elements.
#define CAFFE2_SPECIALIZED_NHWC2NCHW(T)                    \
  template <>                                              \
  C10_EXPORT void NHWC2NCHW<T, CPUContext>(                \
      const int N,                                         \
      const int C,                                         \
      const int HxW,                                       \
      const T* X,                                          \
      T* Y,                                                \
      CPUContext* /* context */) {                         \
    const std::int64_t stride =                            \
        static_cast<std::int64_t>(HxW) * C;                \
    for (int i = 0; i < N; ++i) {                          \
      Transpose2D(HxW, C, X + i * stride, Y + i * stride); \
    }                                                      \
  }
CAFFE2_SPECIALIZED_NHWC2NCHW(float)
#undef CAFFE2_SPECIALIZED_NHWC2NCHW

} // namespace math
} // namespace caffe2