Use SLEEF functions for NEON vectors on macOS ARM64 (#70354)

Summary: We noticed that on M1 Macs Transformer network profiles are dominated by scalar `exp` and `erff` functions (for softmax and GELU). The NEON `Vectorized<float>` implementation does not use SLEEF functions in order to compile on mobile platforms. However, SLEEF is already compiled on macOS ARM64 and is safe to use there. This change adds another implementation of `Vectorized<float>` that uses SLEEF functions. This implementation is only used on macOS ARM64. This change speeds up e.g. prediction of spaCy transformer models by 20% on M1 Macs. Pull Request resolved: https://github.com/pytorch/pytorch/pull/70354 Reviewed By: albanD Differential Revision: D33659540 Pulled By: kimishpatel fbshipit-source-id: b8f02a61321873fc60778190a005c466c7d0cc0c (cherry picked from commit 71286a207c)
2025-12-06 12:20:52 +01:00 · 2022-02-07 13:45:59 -08:00 · 2022-02-07 13:45:59 -08:00 · d50211860a
commit d50211860a
parent f0f49a1153
2 changed files with 184 additions and 66 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -277,6 +277,7 @@ if(IOS)
  set(USE_BREAKPAD OFF)
 endif()

+option(USE_SLEEF_FOR_ARM_VEC256 "Use sleef for arm" OFF)
 option(USE_SOURCE_DEBUG_ON_MOBILE "Enable " ON)
 option(USE_LITE_INTERPRETER_PROFILER "Enable " ON)
 option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
@ -690,6 +691,10 @@ if(USE_PYTORCH_QNNPACK)
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
 endif()

+if(USE_SLEEF_FOR_ARM_VEC256)
+  string(APPEND CMAKE_CXX_FLAGS " -DAT_BUILD_ARM_VEC256_WITH_SLEEF")
+endif()
+
 if(USE_XNNPACK)
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_XNNPACK")
 endif()
--- a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h
@ -6,6 +6,11 @@
 #include <ATen/cpu/vec/intrinsics.h>
 #include <ATen/cpu/vec/vec_base.h>
 #include <c10/util/irange.h>
+
+#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
+#include <sleef.h>
+#endif
+
 // Sleef offers vectorized versions of some transcedentals
 // such as sin, cos, tan etc..
 // However for now opting for STL, since we are not building
@ -31,6 +36,12 @@ inline namespace CPU_CAPABILITY {
 #error "Big endian is not supported."
 #endif

+#if defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
+#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
+#else
+#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
+#endif
+
 template<int index, bool mask_val>
 struct BlendRegs {
  static float32x4_t impl(
@ -324,68 +335,121 @@ public:
    return *this;
  }
  Vectorized<float> acos() const {
-    return map(std::acos);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_acosf4_u10(values.val[0]), Sleef_acosf4_u10(values.val[1])),
+      map(std::acos)
+    );
  }
  Vectorized<float> asin() const {
-    return map(std::asin);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_asinf4_u10(values.val[0]), Sleef_asinf4_u10(values.val[1])),
+      map(std::asin)
+    );
  }
  Vectorized<float> atan() const {
-    return map(std::atan);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_atanf4_u10(values.val[0]), Sleef_atanf4_u10(values.val[1])),
+      map(std::atan)
+    );
  }
  Vectorized<float> atan2(const Vectorized<float> &exp) const {
-    __at_align__ float tmp[size()];
-    __at_align__ float tmp_exp[size()];
-    store(tmp);
-    exp.store(tmp_exp);
-    for (const auto i : c10::irange(size())) {
-      tmp[i] = std::atan2(tmp[i], tmp_exp[i]);
-    }
-    return loadu(tmp);
+    USE_SLEEF(
+      {
+        return Vectorized<float>(Sleef_atan2f4_u10(values.val[0], exp.values.val[0]),
+                                 Sleef_atan2f4_u10(values.val[1], exp.values.val[1]));
+      },
+      {
+        __at_align__ float tmp[size()];
+        __at_align__ float tmp_exp[size()];
+        store(tmp);
+        exp.store(tmp_exp);
+        for (const auto i : c10::irange(size())) {
+          tmp[i] = std::atan2(tmp[i], tmp_exp[i]);
+        }
+        return loadu(tmp);
+      }
+    )
  }
  Vectorized<float> copysign(const Vectorized<float> &sign) const {
-    __at_align__ float tmp[size()];
-    __at_align__ float tmp_sign[size()];
-    store(tmp);
-    sign.store(tmp_sign);
-    for (size_type i = 0; i < size(); i++) {
-      tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
-    }
-    return loadu(tmp);
+    USE_SLEEF(
+      {
+        return Vectorized<float>(Sleef_copysignf4(values.val[0], sign.values.val[0]),
+                                 Sleef_copysignf4(values.val[1], sign.values.val[1]));
+      },
+      {
+        __at_align__ float tmp[size()];
+        __at_align__ float tmp_sign[size()];
+        store(tmp);
+        sign.store(tmp_sign);
+        for (size_type i = 0; i < size(); i++) {
+          tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
+        }
+        return loadu(tmp);
+      }
+    )
  }
  Vectorized<float> erf() const {
-    return map(std::erf);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_erff4_u10(values.val[0]), Sleef_erff4_u10(values.val[1])),
+      map(std::erf)
+    );
  }
  Vectorized<float> erfc() const {
-    return map(std::erfc);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_erfcf4_u15(values.val[0]), Sleef_erfcf4_u15(values.val[1])),
+      map(std::erfc)
+    );
  }
  Vectorized<float> erfinv() const {
    return map(calc_erfinv);
  }
  Vectorized<float> exp() const {
-    return map(std::exp);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_expf4_u10(values.val[0]), Sleef_expf4_u10(values.val[1])),
+      map(std::exp)
+    );
  }
  Vectorized<float> expm1() const {
-    return map(std::expm1);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_expm1f4_u10(values.val[0]), Sleef_expm1f4_u10(values.val[1])),
+      map(std::expm1)
+    );
  }
  Vectorized<float> fmod(const Vectorized<float>& q) const {
-    __at_align__ float tmp[size()];
-    __at_align__ float tmp_q[size()];
-    store(tmp);
-    q.store(tmp_q);
-    for (const auto i : c10::irange(size())) {
-      tmp[i] = std::fmod(tmp[i], tmp_q[i]);
-    }
-    return loadu(tmp);
+    USE_SLEEF(
+      {
+        return Vectorized<float>(Sleef_fmodf4(values.val[0], q.values.val[0]),
+                                 Sleef_fmodf4(values.val[1], q.values.val[1]));
+      },
+      {
+        __at_align__ float tmp[size()];
+        __at_align__ float tmp_q[size()];
+        store(tmp);
+        q.store(tmp_q);
+        for (const auto i : c10::irange(size())) {
+          tmp[i] = std::fmod(tmp[i], tmp_q[i]);
+        }
+        return loadu(tmp);
+      }
+    )
  }
  Vectorized<float> hypot(const Vectorized<float> &b) const {
-    __at_align__ float tmp[size()];
-    __at_align__ float tmp_b[size()];
-    store(tmp);
-    b.store(tmp_b);
-    for (const auto i : c10::irange(size())) {
-      tmp[i] = std::hypot(tmp[i], tmp_b[i]);
-    }
-    return loadu(tmp);
+    USE_SLEEF(
+      {
+        return Vectorized<float>(Sleef_hypotf4_u05(values.val[0], b.values.val[0]),
+                                 Sleef_hypotf4_u05(values.val[1], b.values.val[1]));
+      },
+      {
+        __at_align__ float tmp[size()];
+        __at_align__ float tmp_b[size()];
+        store(tmp);
+        b.store(tmp_b);
+        for (const auto i : c10::irange(size())) {
+          tmp[i] = std::hypot(tmp[i], tmp_b[i]);
+        }
+        return loadu(tmp);
+      }
+    )
  }
  Vectorized<float> i0() const {
    return map(calc_i0);
@ -414,39 +478,71 @@ public:
    return loadu(tmp);
  }
  Vectorized<float> log() const {
-    return map(std::log);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_logf4_u10(values.val[0]), Sleef_logf4_u10(values.val[1])),
+      map(std::log)
+    );
  }
  Vectorized<float> log10() const {
-    return map(std::log10);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_log10f4_u10(values.val[0]), Sleef_log10f4_u10(values.val[1])),
+      map(std::log10)
+    );
  }
  Vectorized<float> log1p() const {
-    return map(std::log1p);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_log1pf4_u10(values.val[0]), Sleef_log1pf4_u10(values.val[1])),
+      map(std::log1p)
+    );
  }
  Vectorized<float> log2() const {
-    return map(std::log2);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_log2f4_u10(values.val[0]), Sleef_log2f4_u10(values.val[1])),
+      map(std::log2)
+    );
  }
  Vectorized<float> nextafter(const Vectorized<float> &b) const {
-    __at_align__ float tmp[size()];
-    __at_align__ float tmp_b[size()];
-    store(tmp);
-    b.store(tmp_b);
-    for (const auto i : c10::irange(size())) {
-      tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
-    }
-    return loadu(tmp);
+    USE_SLEEF(
+      {
+        return Vectorized<float>(Sleef_nextafterf4(values.val[0], b.values.val[0]),
+                                 Sleef_nextafterf4(values.val[1], b.values.val[1]));
+      },
+      {
+        __at_align__ float tmp[size()];
+        __at_align__ float tmp_b[size()];
+        store(tmp);
+        b.store(tmp_b);
+        for (const auto i : c10::irange(size())) {
+          tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
+        }
+        return loadu(tmp);
+      }
+    )
  }
  Vectorized<float> frac() const;
  Vectorized<float> sin() const {
-    return map(std::sin);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_sinf4_u10(values.val[0]), Sleef_sinf4_u10(values.val[1])),
+      map(std::sin)
+    );
  }
  Vectorized<float> sinh() const {
-    return map(std::sinh);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_sinhf4_u10(values.val[0]), Sleef_sinhf4_u10(values.val[1])),
+      map(std::sinh)
+    );
  }
  Vectorized<float> cos() const {
-    return map(std::cos);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_cosf4_u10(values.val[0]), Sleef_cosf4_u10(values.val[1])),
+      map(std::cos)
+    );
  }
  Vectorized<float> cosh() const {
-    return map(std::cosh);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_coshf4_u10(values.val[0]), Sleef_coshf4_u10(values.val[1])),
+      map(std::cosh)
+    );
  }
  Vectorized<float> ceil() const {
    return map(at::native::ceil_impl);
@ -464,10 +560,16 @@ public:
    return map(at::native::round_impl);
  }
  Vectorized<float> tan() const {
-    return map(std::tan);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_tanf4_u10(values.val[0]), Sleef_tanf4_u10(values.val[1])),
+      map(std::tan)
+    );
  }
  Vectorized<float> tanh() const {
-    return map(std::tanh);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_tanhf4_u10(values.val[0]), Sleef_tanhf4_u10(values.val[1])),
+      map(std::tanh)
+    );
  }
  Vectorized<float> trunc() const {
    float32x4_t r0 = vrndq_f32(values.val[0]);
@ -475,7 +577,10 @@ public:
    return Vectorized<float>(r0, r1);
  }
  Vectorized<float> lgamma() const {
-    return map(std::lgamma);
+    return USE_SLEEF(
+      Vectorized<float>(Sleef_lgammaf4_u10(values.val[0]), Sleef_lgammaf4_u10(values.val[1])),
+      map(std::lgamma)
+    );
  }
  Vectorized<float> sqrt() const {
    return Vectorized<float>(
@ -491,14 +596,22 @@ public:
    return this->sqrt().reciprocal();
  }
  Vectorized<float> pow(const Vectorized<float> &exp) const {
-    __at_align__ float tmp[size()];
-    __at_align__ float tmp_exp[size()];
-    store(tmp);
-    exp.store(tmp_exp);
-    for (const auto i : c10::irange(size())) {
-      tmp[i] = std::pow(tmp[i], tmp_exp[i]);
-    }
-    return loadu(tmp);
+    USE_SLEEF(
+      {
+        return Vectorized<float>(Sleef_powf4_u10(values.val[0], exp.values.val[0]),
+                                 Sleef_powf4_u10(values.val[1], exp.values.val[1]));
+      },
+      {
+        __at_align__ float tmp[size()];
+        __at_align__ float tmp_exp[size()];
+        store(tmp);
+        exp.store(tmp_exp);
+        for (const auto i : c10::irange(size())) {
+          tmp[i] = std::pow(tmp[i], tmp_exp[i]);
+        }
+        return loadu(tmp);
+      }
+    )
  }
  Vectorized<float> operator==(const Vectorized<float>& other) const {
    float32x4_t r0 =