Mirror of https://github.com/zebrajr/pytorch.git, synced 2025-12-07 12:21:27 +01:00
Summary:
1) Add an MKL/AVX2-based implementation to perfkernels. This implementation is similar to caffe2/operators/batch_box_cox_op.cc.
2) Migrate caffe2's batch_box_cox_op to use this implementation.

Test Plan: CI

Differential Revision: D40208074

Pull Request resolved: https://github.com/pytorch/pytorch/pull/86569
Approved by: https://github.com/hyuen
114 lines
2.5 KiB
C++
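For reference, the transform that both the naive and the AVX2/MKL kernels compute is the two-parameter Box-Cox transform; this is read directly off BoxCoxNaive below, with the clamp eps = 1e-6 taken from the code (std::log is the natural logarithm):

\[
t = \max(x + \lambda_2,\ \varepsilon), \qquad
y = \begin{cases} \ln t, & \lambda_1 = 0 \\[4pt] \dfrac{t^{\lambda_1} - 1}{\lambda_1}, & \lambda_1 \neq 0 \end{cases}
\]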
#include "caffe2/perfkernels/common.h"
|
|
|
|
#include <algorithm>
|
|
#include <cstdint>
|
|
#include <cmath>
|
|
|
|
namespace caffe2 {

namespace {

// Scalar fallback: applies the two-parameter Box-Cox transform element-wise
// over a row-major N x D batch, using per-column lambda1/lambda2 parameters.
template <typename T>
void BoxCoxNaive(
    std::size_t N,
    std::size_t D,
    const T* data_ptr,
    const T* __restrict lambda1_ptr,
    const T* __restrict lambda2_ptr,
    T* output_ptr) {
  constexpr T k_eps = static_cast<T>(1e-6);

  for (std::size_t i = 0; i < N; i++) {
    for (std::size_t j = 0; j < D; j++, data_ptr++, output_ptr++) {
      T lambda1_v = lambda1_ptr[j];
      T lambda2_v = lambda2_ptr[j];
      // Shift by lambda2 and clamp away from zero before the power/log.
      T tmp = std::max(*data_ptr + lambda2_v, k_eps);
      if (lambda1_v == 0) {
        *output_ptr = std::log(tmp);
      } else {
        T lambda_1 = 1 / lambda1_v;
        T pow = std::pow(tmp, lambda1_v);
        *output_ptr = lambda_1 * pow - lambda_1;
      }
    }
  }
}

} // namespace

#if defined(CAFFE2_PERF_WITH_AVX2) && defined(CAFFE2_PERF_USE_MKL)
namespace details {

// Vectorized implementation, compiled in a separate translation unit with
// AVX2/FMA enabled and backed by MKL; only declared here.
template <typename T>
void compute_batch_box_cox__avx2_fma(
    std::size_t N,
    std::size_t D,
    std::size_t block_size,
    const T* data_ptr,
    const T* __restrict lambda1_ptr,
    const T* __restrict lambda2_ptr,
    T* output_ptr);

extern template
void compute_batch_box_cox__avx2_fma<float>(
    std::size_t N,
    std::size_t D,
    std::size_t block_size,
    const float* self_data,
    const float* __restrict lambda1_data,
    const float* __restrict lambda2_data,
    float* output_data);

extern template
void compute_batch_box_cox__avx2_fma<double>(
    std::size_t N,
    std::size_t D,
    std::size_t block_size,
    const double* self_data,
    const double* __restrict lambda1_data,
    const double* __restrict lambda2_data,
    double* output_data);

} // namespace details
#endif

// Public entry point: dispatch to the AVX2/FMA + MKL kernel when the build
// provides it and the CPU supports it, otherwise fall back to BoxCoxNaive.
template <typename T>
void compute_batch_box_cox(
    std::size_t N,
    std::size_t D,
    std::size_t block_size,
    const T* data,
    const T* lambda1_data,
    const T* lambda2_data,
    T* output_data) {
// The guard matches the declaration block above so the __avx2_fma symbol is
// only referenced when it has actually been declared.
#if defined(CAFFE2_PERF_WITH_AVX2) && defined(CAFFE2_PERF_USE_MKL)
  // AVX2_FMA_DO (perfkernels/common.h) returns early with the __avx2_fma
  // variant when the running CPU supports AVX2 and FMA.
  AVX2_FMA_DO(
      details::compute_batch_box_cox,
      N,
      D,
      block_size,
      data,
      lambda1_data,
      lambda2_data,
      output_data);
#endif
  BoxCoxNaive<T>(N, D, data, lambda1_data, lambda2_data, output_data);
}

// Explicit instantiations for the element types used by the operator.
template void compute_batch_box_cox<float>(
    std::size_t N,
    std::size_t D,
    std::size_t block_size,
    const float* data,
    const float* lambda1_data,
    const float* lambda2_data,
    float* output_data);

template void compute_batch_box_cox<double>(
    std::size_t N,
    std::size_t D,
    std::size_t block_size,
    const double* data,
    const double* lambda1_data,
    const double* lambda2_data,
    double* output_data);

} // namespace caffe2
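A minimal caller sketch for the API this file defines (hypothetical, not part of the file): it mirrors the compute_batch_box_cox declaration rather than assuming a particular perfkernels header, and assumes the program is linked against the library containing this translation unit; all names and sizes below are illustrative.

// Hypothetical standalone caller; assumes linkage against the caffe2
// perfkernels library that provides compute_batch_box_cox<float>.
#include <cstddef>
#include <vector>

namespace caffe2 {
// Declaration mirrored from this translation unit to keep the sketch
// self-contained (a project header would normally provide it).
template <typename T>
void compute_batch_box_cox(
    std::size_t N,
    std::size_t D,
    std::size_t block_size,
    const T* data,
    const T* lambda1_data,
    const T* lambda2_data,
    T* output_data);
} // namespace caffe2

int main() {
  constexpr std::size_t N = 4; // rows in the batch
  constexpr std::size_t D = 3; // per-row feature count
  std::vector<float> data(N * D, 2.0f); // row-major N x D input
  std::vector<float> lambda1(D, 0.5f);  // per-column exponent lambda1
  std::vector<float> lambda2(D, 0.0f);  // per-column shift lambda2
  std::vector<float> output(N * D);
  // block_size is forwarded to the vectorized kernel; the naive fallback in
  // this file ignores it.
  caffe2::compute_batch_box_cox<float>(
      N, D, /*block_size=*/N,
      data.data(), lambda1.data(), lambda2.data(), output.data());
  return 0;
}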