Use SLEEF functions for NEON vectors on macOS ARM64 (#70354)

Summary:
We noticed that on M1 Macs Tranformer network profiles are dominated by scalar `exp` and `erff` functions (for softmax and GELU).

The NEON `Vectorized<float>` implementation does not use SLEEF functions in order to compile on mobile platforms. However, SLEEF is already compiled on macOS ARM64 and is safe to use there. This change adds another implementation of `Vectorized<float>` that uses SLEEF functions. This implementation is only used on macOS ARM64.

This change speeds up e.g. prediction of spaCy transformer models by 20% on M1 Macs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/70354

Reviewed By: albanD

Differential Revision: D33659540

Pulled By: kimishpatel

fbshipit-source-id: b8f02a61321873fc60778190a005c466c7d0cc0c
(cherry picked from commit 71286a207c)
This commit is contained in:
Daniël de Kok 2022-02-07 13:45:59 -08:00 committed by PyTorch MergeBot
parent f0f49a1153
commit d50211860a
2 changed files with 184 additions and 66 deletions

View File

@ -277,6 +277,7 @@ if(IOS)
set(USE_BREAKPAD OFF)
endif()
option(USE_SLEEF_FOR_ARM_VEC256 "Use sleef for arm" OFF)
option(USE_SOURCE_DEBUG_ON_MOBILE "Enable " ON)
option(USE_LITE_INTERPRETER_PROFILER "Enable " ON)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
@ -690,6 +691,10 @@ if(USE_PYTORCH_QNNPACK)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
endif()
if(USE_SLEEF_FOR_ARM_VEC256)
string(APPEND CMAKE_CXX_FLAGS " -DAT_BUILD_ARM_VEC256_WITH_SLEEF")
endif()
if(USE_XNNPACK)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_XNNPACK")
endif()

View File

@ -6,6 +6,11 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/util/irange.h>
#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
#include <sleef.h>
#endif
// Sleef offers vectorized versions of some transcedentals
// such as sin, cos, tan etc..
// However for now opting for STL, since we are not building
@ -31,6 +36,12 @@ inline namespace CPU_CAPABILITY {
#error "Big endian is not supported."
#endif
#if defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
#else
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
#endif
template<int index, bool mask_val>
struct BlendRegs {
static float32x4_t impl(
@ -324,68 +335,121 @@ public:
return *this;
}
Vectorized<float> acos() const {
return map(std::acos);
return USE_SLEEF(
Vectorized<float>(Sleef_acosf4_u10(values.val[0]), Sleef_acosf4_u10(values.val[1])),
map(std::acos)
);
}
Vectorized<float> asin() const {
return map(std::asin);
return USE_SLEEF(
Vectorized<float>(Sleef_asinf4_u10(values.val[0]), Sleef_asinf4_u10(values.val[1])),
map(std::asin)
);
}
Vectorized<float> atan() const {
return map(std::atan);
return USE_SLEEF(
Vectorized<float>(Sleef_atanf4_u10(values.val[0]), Sleef_atanf4_u10(values.val[1])),
map(std::atan)
);
}
Vectorized<float> atan2(const Vectorized<float> &exp) const {
__at_align__ float tmp[size()];
__at_align__ float tmp_exp[size()];
store(tmp);
exp.store(tmp_exp);
for (const auto i : c10::irange(size())) {
tmp[i] = std::atan2(tmp[i], tmp_exp[i]);
}
return loadu(tmp);
USE_SLEEF(
{
return Vectorized<float>(Sleef_atan2f4_u10(values.val[0], exp.values.val[0]),
Sleef_atan2f4_u10(values.val[1], exp.values.val[1]));
},
{
__at_align__ float tmp[size()];
__at_align__ float tmp_exp[size()];
store(tmp);
exp.store(tmp_exp);
for (const auto i : c10::irange(size())) {
tmp[i] = std::atan2(tmp[i], tmp_exp[i]);
}
return loadu(tmp);
}
)
}
Vectorized<float> copysign(const Vectorized<float> &sign) const {
__at_align__ float tmp[size()];
__at_align__ float tmp_sign[size()];
store(tmp);
sign.store(tmp_sign);
for (size_type i = 0; i < size(); i++) {
tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
}
return loadu(tmp);
USE_SLEEF(
{
return Vectorized<float>(Sleef_copysignf4(values.val[0], sign.values.val[0]),
Sleef_copysignf4(values.val[1], sign.values.val[1]));
},
{
__at_align__ float tmp[size()];
__at_align__ float tmp_sign[size()];
store(tmp);
sign.store(tmp_sign);
for (size_type i = 0; i < size(); i++) {
tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
}
return loadu(tmp);
}
)
}
Vectorized<float> erf() const {
return map(std::erf);
return USE_SLEEF(
Vectorized<float>(Sleef_erff4_u10(values.val[0]), Sleef_erff4_u10(values.val[1])),
map(std::erf);
);
}
Vectorized<float> erfc() const {
return map(std::erfc);
return USE_SLEEF(
Vectorized<float>(Sleef_erfcf4_u15(values.val[0]), Sleef_erfcf4_u15(values.val[1])),
map(std::erfc)
);
}
Vectorized<float> erfinv() const {
return map(calc_erfinv);
}
Vectorized<float> exp() const {
return map(std::exp);
return USE_SLEEF(
Vectorized<float>(Sleef_expf4_u10(values.val[0]), Sleef_expf4_u10(values.val[1])),
map(std::exp)
);
}
Vectorized<float> expm1() const {
return map(std::expm1);
return USE_SLEEF(
Vectorized<float>(Sleef_expm1f4_u10(values.val[0]), Sleef_expm1f4_u10(values.val[1])),
map(std::expm1)
);
}
Vectorized<float> fmod(const Vectorized<float>& q) const {
__at_align__ float tmp[size()];
__at_align__ float tmp_q[size()];
store(tmp);
q.store(tmp_q);
for (const auto i : c10::irange(size())) {
tmp[i] = std::fmod(tmp[i], tmp_q[i]);
}
return loadu(tmp);
USE_SLEEF(
{
return Vectorized<float>(Sleef_fmodf4(values.val[0], q.values.val[0]),
Sleef_fmodf4(values.val[1], q.values.val[1]));
},
{
__at_align__ float tmp[size()];
__at_align__ float tmp_q[size()];
store(tmp);
q.store(tmp_q);
for (const auto i : c10::irange(size())) {
tmp[i] = std::fmod(tmp[i], tmp_q[i]);
}
return loadu(tmp);
}
)
}
Vectorized<float> hypot(const Vectorized<float> &b) const {
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (const auto i : c10::irange(size())) {
tmp[i] = std::hypot(tmp[i], tmp_b[i]);
}
return loadu(tmp);
USE_SLEEF(
{
return Vectorized<float>(Sleef_hypotf4_u05(values.val[0], b.values.val[0]),
Sleef_hypotf4_u05(values.val[1], b.values.val[1]));
},
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (const auto i : c10::irange(size())) {
tmp[i] = std::hypot(tmp[i], tmp_b[i]);
}
return loadu(tmp);
}
)
}
Vectorized<float> i0() const {
return map(calc_i0);
@ -414,39 +478,71 @@ public:
return loadu(tmp);
}
Vectorized<float> log() const {
return map(std::log);
return USE_SLEEF(
Vectorized<float>(Sleef_logf4_u10(values.val[0]), Sleef_logf4_u10(values.val[1])),
map(std::log)
);
}
Vectorized<float> log10() const {
return map(std::log10);
return USE_SLEEF(
Vectorized<float>(Sleef_log10f4_u10(values.val[0]), Sleef_log10f4_u10(values.val[1])),
map(std::log10)
);
}
Vectorized<float> log1p() const {
return map(std::log1p);
return USE_SLEEF(
Vectorized<float>(Sleef_log1pf4_u10(values.val[0]), Sleef_log1pf4_u10(values.val[1])),
map(std::log1p)
);
}
Vectorized<float> log2() const {
return map(std::log2);
return USE_SLEEF(
Vectorized<float>(Sleef_log2f4_u10(values.val[0]), Sleef_log2f4_u10(values.val[1])),
map(std::log2)
);
}
Vectorized<float> nextafter(const Vectorized<float> &b) const {
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (const auto i : c10::irange(size())) {
tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
}
return loadu(tmp);
USE_SLEEF(
{
return Vectorized<float>(Sleef_nextafterf4(values.val[0], b.values.val[0]),
Sleef_nextafterf4(values.val[1], b.values.val[1]));
},
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (const auto i : c10::irange(size())) {
tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
}
return loadu(tmp);
}
)
}
Vectorized<float> frac() const;
Vectorized<float> sin() const {
return map(std::sin);
return USE_SLEEF(
Vectorized<float>(Sleef_sinf4_u10(values.val[0]), Sleef_sinf4_u10(values.val[1])),
map(std::sin)
);
}
Vectorized<float> sinh() const {
return map(std::sinh);
return USE_SLEEF(
Vectorized<float>(Sleef_sinhf4_u10(values.val[0]), Sleef_sinhf4_u10(values.val[1])),
map(std::sinh)
);
}
Vectorized<float> cos() const {
return map(std::cos);
return USE_SLEEF(
Vectorized<float>(Sleef_cosf4_u10(values.val[0]), Sleef_cosf4_u10(values.val[1])),
map(std::cos)
);
}
Vectorized<float> cosh() const {
return map(std::cosh);
return USE_SLEEF(
Vectorized<float>(Sleef_coshf4_u10(values.val[0]), Sleef_coshf4_u10(values.val[1])),
map(std::cosh)
);
}
Vectorized<float> ceil() const {
return map(at::native::ceil_impl);
@ -464,10 +560,16 @@ public:
return map(at::native::round_impl);
}
Vectorized<float> tan() const {
return map(std::tan);
return USE_SLEEF(
Vectorized<float>(Sleef_tanf4_u10(values.val[0]), Sleef_tanf4_u10(values.val[1])),
map(std::tan)
);
}
Vectorized<float> tanh() const {
return map(std::tanh);
return USE_SLEEF(
Vectorized<float>(Sleef_tanhf4_u10(values.val[0]), Sleef_tanhf4_u10(values.val[1])),
map(std::tanh)
);
}
Vectorized<float> trunc() const {
float32x4_t r0 = vrndq_f32(values.val[0]);
@ -475,7 +577,10 @@ public:
return Vectorized<float>(r0, r1);
}
Vectorized<float> lgamma() const {
return map(std::lgamma);
return USE_SLEEF(
Vectorized<float>(Sleef_lgammaf4_u10(values.val[0]), Sleef_lgammaf4_u10(values.val[1])),
map(std::lgamma)
);
}
Vectorized<float> sqrt() const {
return Vectorized<float>(
@ -491,14 +596,22 @@ public:
return this->sqrt().reciprocal();
}
Vectorized<float> pow(const Vectorized<float> &exp) const {
__at_align__ float tmp[size()];
__at_align__ float tmp_exp[size()];
store(tmp);
exp.store(tmp_exp);
for (const auto i : c10::irange(size())) {
tmp[i] = std::pow(tmp[i], tmp_exp[i]);
}
return loadu(tmp);
USE_SLEEF(
{
return Vectorized<float>(Sleef_powf4_u10(values.val[0], exp.values.val[0]),
Sleef_powf4_u10(values.val[1], exp.values.val[1]));
},
{
__at_align__ float tmp[size()];
__at_align__ float tmp_exp[size()];
store(tmp);
exp.store(tmp_exp);
for (const auto i : c10::irange(size())) {
tmp[i] = std::pow(tmp[i], tmp_exp[i]);
}
return loadu(tmp);
}
)
}
Vectorized<float> operator==(const Vectorized<float>& other) const {
float32x4_t r0 =