Summary: Use of fast math in the BatchBoxCox kernel produced different numerical results between the dev and optimized builds, which caused a few internal tests to fail. For now, the compiler-optimized version is disabled and the kernel relies on ATen vectors.
Differential Revision: D41211784
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88875
Approved by: https://github.com/hyuen
#include <immintrin.h>
#ifdef CAFFE2_PERF_USE_MKL
#include <c10/util/irange.h>
#include <caffe2/perfkernels/common.h>
#include <folly/SingletonThreadLocal.h>

#include "vectorizer.h"

// Enable the compiler-vectorized version only if numerical consistency is not
// required between dev and opt versions - disabled for now.
#ifndef FAST_VECTORIZED_KERNEL
#define CPU_CAPABILITY_AVX2
#include <ATen/cpu/vec/vec.h>
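// Note: CPU_CAPABILITY_AVX2 is defined before including ATen's vec.h so that
// at::vec::Vectorized<T> resolves to the AVX2 implementation in this file.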
namespace at::vec {

// Implements a vectorized version of the std::max() operation
// which does NOT propagate NaN for the second argument.
template <typename scalar_t>
Vectorized<scalar_t> max(const Vectorized<scalar_t>& a, const Vectorized<scalar_t>& b);

template <>
Vectorized<double> max(const Vectorized<double>& a, const Vectorized<double>& b) {
  // std::max(NaN, nonNan) -> NaN
  return _mm256_max_pd(b, a);
}

template <>
Vectorized<float> max(const Vectorized<float>& a, const Vectorized<float>& b) {
  // std::max(NaN, nonNan) -> NaN
  return _mm256_max_ps(b, a);
}
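// Note: _mm256_max_ps/_mm256_max_pd return the second source operand when
// either input is NaN, so passing the data vector as the second operand above
// makes a NaN input propagate to the result, matching std::max(NaN, x) -> NaN.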
// Implements a fast reciprocal based on the Newton-Raphson method:
// 1. use the hardware RCP approximation
// 2. refine with RCP = RCP * (2 - X * RCP)
template <typename scalar_t>
Vectorized<scalar_t> fast_recieprocal(const Vectorized<scalar_t>& b);
template <typename scalar_t>
scalar_t fast_recieprocal(scalar_t b);
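// Note: _mm256_fnmsub_ps(a, b, c) computes -(a * b) - c, so with c == -2.f
// each step below evaluates rcp * (2 - b * rcp), i.e. one Newton-Raphson
// refinement of the initial RCP estimate; two refinements recover close to
// full single-precision accuracy.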
template<>
Vectorized<float> fast_recieprocal(const Vectorized<float>& b) {
  auto minus2 = _mm256_set1_ps(-2.f);
  auto rcp = _mm256_rcp_ps(b);
  rcp = _mm256_mul_ps(rcp, _mm256_fnmsub_ps(rcp, b, minus2));
  rcp = _mm256_mul_ps(rcp, _mm256_fnmsub_ps(rcp, b, minus2));
  return rcp;
}

template <>
float fast_recieprocal(float b) {
  auto minus2 = _mm_set_ss(-2.f);
  auto b_reg = _mm_set_ss(b);
  auto rcp = _mm_rcp_ss(b_reg);
  rcp = _mm_mul_ss(rcp, _mm_fnmsub_ss(rcp, b_reg, minus2));
  rcp = _mm_mul_ss(rcp, _mm_fnmsub_ss(rcp, b_reg, minus2));
  return _mm_cvtss_f32(rcp);
}
template<>
Vectorized<double> fast_recieprocal(const Vectorized<double>& b) {
  return b.reciprocal();
}

template <>
double fast_recieprocal(double b) {
  return 1. / b;
}
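// Note: doubles fall back to Vectorized<double>::reciprocal() and an exact
// scalar division; AVX2 has no double-precision reciprocal approximation
// instruction.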
} // namespace at::vec
#endif

#include <cstdint>
#include <cmath>
#include <vector>

#include <mkl.h>
namespace caffe2::details {

// MKL VML function templates.
template <typename T>
void PackV(const int N, const T* a, const int* ia, T* y);
template <typename T>
void UnpackV(const int N, const T* a, T* y, const int* iy);

#define DELEGATE_PACKV_FUNCTION(T, OriginalFunc)                \
  template <>                                                   \
  void PackV<T>(const int N, const T* a, const int* ia, T* y) { \
    OriginalFunc(N, a, ia, y);                                  \
  }
DELEGATE_PACKV_FUNCTION(float, vsPackV)
DELEGATE_PACKV_FUNCTION(double, vdPackV)
#undef DELEGATE_PACKV_FUNCTION

#define DELEGATE_UNPACKV_FUNCTION(T, OriginalFunc)                \
  template <>                                                     \
  void UnpackV<T>(const int N, const T* a, T* y, const int* iy) { \
    OriginalFunc(N, a, y, iy);                                    \
  }
DELEGATE_UNPACKV_FUNCTION(float, vsUnpackV)
DELEGATE_UNPACKV_FUNCTION(double, vdUnpackV)
#undef DELEGATE_UNPACKV_FUNCTION
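// Note on MKL VML semantics: PackV gathers, y[i] = a[ia[i]], and UnpackV
// scatters, y[iy[i]] = a[i]. E.g. with ia = {0, 2, 5}, PackV(3, a, ia, y)
// yields y = {a[0], a[2], a[5]}.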
#ifndef FAST_VECTORIZED_KERNEL
template <typename T>
void box_cox_zero_lambda(
    size_t D,
    const T* const self_data,
    const T* const lambda2_data,
    T k_eps,
    T* const output_data) {
  int j = 0;
  using Vec = at::vec::Vectorized<T>;
  constexpr int64_t VLEN = Vec::size();
  auto k_eps_vec = Vec(k_eps);
  for (; j + VLEN < D; j += VLEN) {
    auto data = Vec::loadu(self_data + j);
    auto lambda2 = Vec::loadu(lambda2_data + j);
    auto sum = data + lambda2;
    auto max = at::vec::max(sum, k_eps_vec);
    auto res = max.log();
    res.store(output_data + j);
  }
  for (; j < D; ++j) {
    auto sum = self_data[j] + lambda2_data[j];
    auto max = std::max(sum, k_eps);
    output_data[j] = std::log(max);
  }
}
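// Box-Cox with nonzero lambda1: out = ((x + lambda2)^lambda1 - 1) / lambda1,
// with (x + lambda2) clamped below at k_eps. The division by lambda1 is
// performed as multiplication by a fast reciprocal.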
template <typename T>
void box_cox_nonzero_lambda(
    int64_t D,
    const T* data_ptr,
    const T* lambda1_ptr,
    const T* lambda2_ptr,
    T k_eps,
    T* out) {
  int j = 0;
  using Vec = at::vec::Vectorized<T>;
  constexpr int64_t VLEN = Vec::size();
  auto k_eps_vec = Vec(k_eps);
  for (; j + VLEN < D; j += VLEN) {
    auto data = Vec::loadu(data_ptr + j);
    auto lambda2 = Vec::loadu(lambda2_ptr + j);
    auto sum = data + lambda2;
    auto max = at::vec::max(sum, k_eps_vec);
    auto lambda1 = Vec::loadu(lambda1_ptr + j);
    auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
    auto pow = max.pow(lambda1);
    auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
    res.store(out + j);
  }
  for (; j < D; ++j) {
    auto sum = data_ptr[j] + lambda2_ptr[j];
    auto max = std::max(sum, k_eps);
    auto lambda_over_1 = at::vec::fast_recieprocal(lambda1_ptr[j]);
    auto pow = std::pow(max, lambda1_ptr[j]);
    out[j] = pow * lambda_over_1 - lambda_over_1;
  }
}
#else
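// VECTOR_LOOP and FAST_MATH below come from vectorizer.h; they are assumed to
// expand to compiler-specific hints (e.g. vectorization / fast-math pragmas)
// and to be no-ops when the compiler does not support them.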
template <typename T>
void box_cox_zero_lambda(
    size_t D,
    const T* const self_data,
    const T* const lambda2_data,
    T k_eps,
    T* const output_data) {
  VECTOR_LOOP for (auto j = 0; j < D; ++j) {
    auto sum = self_data[j] + lambda2_data[j];
    auto max = std::max(sum, k_eps);
    output_data[j] = std::log(max);
  }
}

template <typename T>
void box_cox_nonzero_lambda(
    int64_t D,
    const T* data_ptr,
    const T* lambda1_ptr,
    const T* lambda2_ptr,
    T k_eps,
    T* out) {
  VECTOR_LOOP for (auto j = 0; j < D; ++j) {
    FAST_MATH
    auto sum = data_ptr[j] + lambda2_ptr[j];
    auto max = std::max(sum, k_eps);
    auto lambda1 = lambda1_ptr[j];
    auto lambda_over_1 = 1 / lambda1;
    if constexpr (std::is_same<T, float>::value) {
      // Two Newton-Raphson refinements of the reciprocal, mirroring the
      // AVX2 fast_recieprocal path above.
      lambda_over_1 = lambda_over_1 * (T{2} - lambda_over_1 * lambda1);
      lambda_over_1 = lambda_over_1 * (T{2} - lambda_over_1 * lambda1);
    }
    auto pow = std::pow(max, lambda1);
    out[j] = pow * lambda_over_1 - lambda_over_1;
  }
}
#endif
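// Mixed case: gather the nonzero-lambda and zero-lambda columns into a
// contiguous buffer (PackV), apply the matching transform in place, and
// scatter the results back to their original positions (UnpackV).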
template <typename T>
void box_cox_mixed_lambda(
    const T* const self_data,
    const std::vector<int>& nonzeros,
    const std::vector<int>& zeros,
    const T* const lambda1,
    const T* const lambda2,
    const T* const lambda2_z_,
    T k_eps,
    T* const buffer,
    T* const output_data) {
  PackV(nonzeros.size(), self_data, nonzeros.data(), buffer);
  box_cox_nonzero_lambda<T>(
      nonzeros.size(), buffer, lambda1, lambda2, k_eps, buffer);
  UnpackV(nonzeros.size(), buffer, output_data, nonzeros.data());

  PackV(zeros.size(), self_data, zeros.data(), buffer);
  box_cox_zero_lambda<T>(
      zeros.size(), buffer, lambda2_z_, k_eps, buffer);
  UnpackV(zeros.size(), buffer, output_data, zeros.data());
}
template <typename T>
void TileArrayIntoVector(
    const T* const a,
    const size_t D,
    const int K,
    std::vector<T>& b) {
  b.resize(K * D);
  for (const auto k : c10::irange(K)) {
    std::copy(a, a + D, b.begin() + k * D);
  }
}

void TileIndicesInPlace(std::vector<int>& v, const std::size_t D, const std::size_t K) {
  auto n = v.size();
  v.resize(K * n);
  for (const auto k : c10::irange(1, K)) {
    for (const auto j : c10::irange(n)) {
      v[k * n + j] = v[j] + k * D;
    }
  }
}
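// K-row blocking: the per-column lambda arrays (and index vectors) are tiled
// K times so that K consecutive rows of the input can be handled by a single
// kernel call, keeping the vector lanes full even when D is small.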
template <typename T>
void compute_batch_box_cox__avx2_fma(
    std::size_t N,
    std::size_t D,
    std::size_t block_size,
    const T* self_data,
    const T* __restrict lambda1_data,
    const T* __restrict lambda2_data,
    T* output_data) {
  constexpr T k_eps = static_cast<T>(1e-6);

  FOLLY_DECLARE_REUSED(zeros, std::vector<int>);
  FOLLY_DECLARE_REUSED(nonzeros, std::vector<int>);
  // Don't bother calling reserve; calls after the first will get a
  // correctly-sized allocation anyway.
  for (const auto j : c10::irange(D)) {
    if (lambda1_data[j] == 0) {
      zeros.push_back(j);
    } else {
      nonzeros.push_back(j);
    }
  }

  // Process K rows at a time for effective vectorization with small rows.
  const auto K = std::min(N, (block_size + D - 1) / D);

  FOLLY_DECLARE_REUSED(lambda1_, std::vector<T>);
  FOLLY_DECLARE_REUSED(lambda2_, std::vector<T>);
  FOLLY_DECLARE_REUSED(lambda2_z_, std::vector<T>);

  if (nonzeros.size() == D) {
    // ((x + lambda2)^lambda1 - 1)/lambda1, if lambda1 != 0
    size_t i = 0;
    if (K > 1) {
      TileArrayIntoVector(lambda1_data, D, K, lambda1_);
      TileArrayIntoVector(lambda2_data, D, K, lambda2_);
      DCHECK_EQ(K * D, lambda1_.size());
      DCHECK_EQ(K * D, lambda2_.size());
      for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) {
        box_cox_nonzero_lambda<T>(
            K * D,
            self_data,
            lambda1_.data(),
            lambda2_.data(),
            k_eps,
            output_data);
      }
    }
    for (; i < N; i++, self_data += D, output_data += D) {
      box_cox_nonzero_lambda<T>(
          D, self_data, lambda1_data, lambda2_data, k_eps, output_data);
    }
  } else if (zeros.size() == D) {
    // ln(x + lambda2), if lambda1 == 0
    size_t i = 0;
    if (K > 1) {
      TileArrayIntoVector(lambda2_data, D, K, lambda2_z_);
      DCHECK_EQ(K * D, lambda2_z_.size());
      for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) {
        box_cox_zero_lambda<T>(
            K * D, self_data, lambda2_z_.data(), k_eps, output_data);
      }
    }
    for (; i < N; i++, self_data += D, output_data += D) {
      box_cox_zero_lambda<T>(
          D, self_data, lambda2_data, k_eps, output_data);
    }
  } else {
    // mix zeros and nonzeros
    const size_t n = nonzeros.size();
    if (K > 1) {
      TileIndicesInPlace(nonzeros, 0, K);
      TileIndicesInPlace(zeros, 0, K);
    }

    FOLLY_DECLARE_REUSED(buffer, std::vector<T>);

    buffer.resize(std::max(nonzeros.size(), zeros.size()));
    lambda1_.resize(nonzeros.size());
    lambda2_.resize(nonzeros.size());
    lambda2_z_.resize(zeros.size());
    PackV(nonzeros.size(), lambda1_data, nonzeros.data(), lambda1_.data());
    PackV(nonzeros.size(), lambda2_data, nonzeros.data(), lambda2_.data());
    PackV(zeros.size(), lambda2_data, zeros.data(), lambda2_z_.data());

    size_t i = 0;
    if (K > 1) {
      // Truncate to original size, and re-tile with offsets this time.
      nonzeros.resize(n);
      DCHECK_GT(D, n);
      zeros.resize(D - n);
      TileIndicesInPlace(nonzeros, D, K);
      TileIndicesInPlace(zeros, D, K);
      DCHECK_EQ(nonzeros.size(), lambda1_.size());
      DCHECK_EQ(nonzeros.size(), lambda2_.size());
      DCHECK_EQ(zeros.size(), lambda2_z_.size());

      for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) {
        box_cox_mixed_lambda<T>(
            self_data,
            nonzeros,
            zeros,
            lambda1_.data(),
            lambda2_.data(),
            lambda2_z_.data(),
            k_eps,
            buffer.data(),
            output_data);
      }
      // Truncate to original size.
      nonzeros.resize(n);
      zeros.resize(D - n);
    }
    for (; i < N; i++, self_data += D, output_data += D) {
      box_cox_mixed_lambda<T>(
          self_data,
          nonzeros,
          zeros,
          lambda1_.data(),
          lambda2_.data(),
          lambda2_z_.data(),
          k_eps,
          buffer.data(),
          output_data);
    }
  }
}
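// Explicit instantiations for float and double. The __avx2_fma suffix is what
// the perfkernels runtime dispatch (see caffe2/perfkernels/common.h) is
// assumed to key on when selecting this build.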
template
void compute_batch_box_cox__avx2_fma<float>(
    std::size_t N,
    std::size_t D,
    std::size_t block_size,
    const float* self_data,
    const float* __restrict lambda1_data,
    const float* __restrict lambda2_data,
    float* output_data);

template
void compute_batch_box_cox__avx2_fma<double>(
    std::size_t N,
    std::size_t D,
    std::size_t block_size,
    const double* self_data,
    const double* __restrict lambda1_data,
    const double* __restrict lambda2_data,
    double* output_data);

} // namespace caffe2::details
#endif