Revert "[PyTorch] Improve aarch64 performance of bfloat16 ops (#166028)"

This reverts commit 3e77a2b478. Otherwise it fails ARM build with older compilers with errors that looks as follows: ``` vec128_bfloat16_neon.h:666:12: error: operation not permitted on type ‘bfloat16_t’ 666 | return (-x) * y - z; ``` For more self-contained example see https://godbolt.org/z/bbY4xWh45 (that compiles the same code using clang-15 and clang-19)
2025-12-06 12:20:52 +01:00 · 2025-10-29 13:35:59 -07:00 · 2025-10-29 13:35:59 -07:00 · 2c9f877fa7
commit 2c9f877fa7
parent fc540cefd4
1 changed files with 1 additions and 92 deletions
--- a/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h
@ -354,47 +354,9 @@ class Vectorized<c10::BFloat16> : public Vectorized16<

  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
  Vectorized frac() const;
+  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc)
  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt)
-
-#ifdef __ARM_FEATURE_BF16
-  Vectorized<c10::BFloat16> neg() const {
-    return -values;
-  }
-  Vectorized<c10::BFloat16> reciprocal() const {
-    return 1.0f / values;
-  }
-  Vectorized<c10::BFloat16> operator==(
-      const Vectorized<c10::BFloat16>& other) const {
-    return values == other.values;
-  }
-
-  Vectorized<c10::BFloat16> operator!=(
-      const Vectorized<c10::BFloat16>& other) const {
-    return values != other.values;
-  }
-
-  Vectorized<c10::BFloat16> operator<(
-      const Vectorized<c10::BFloat16>& other) const {
-    return values < other.values;
-  }
-
-  Vectorized<c10::BFloat16> operator<=(
-      const Vectorized<c10::BFloat16>& other) const {
-    return values <= other.values;
-  }
-
-  Vectorized<c10::BFloat16> operator>(
-      const Vectorized<c10::BFloat16>& other) const {
-    return values > other.values;
-  }
-
-  Vectorized<c10::BFloat16> operator>=(
-      const Vectorized<c10::BFloat16>& other) const {
-    return values >= other.values;
-  }
-#else
-  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=)
@ -402,7 +364,6 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<=)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>=)
-#endif

 #undef DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD
 #undef DEFINE_BINARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD
@ -451,52 +412,28 @@ template <>
 Vectorized<c10::BFloat16> inline operator+(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b) {
-#ifdef __ARM_FEATURE_BF16
-  bfloat16x8_t x = a;
-  bfloat16x8_t y = b;
-  return x + y;
-#else
  return binary_operator_via_float(std::plus<Vectorized<float>>(), a, b);
-#endif
 }

 template <>
 Vectorized<c10::BFloat16> inline operator-(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b) {
-#ifdef __ARM_FEATURE_BF16
-  bfloat16x8_t x = a;
-  bfloat16x8_t y = b;
-  return x - y;
-#else
  return binary_operator_via_float(std::minus<Vectorized<float>>(), a, b);
-#endif
 }

 template <>
 Vectorized<c10::BFloat16> inline operator*(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b) {
-#ifdef __ARM_FEATURE_BF16
-  bfloat16x8_t x = a;
-  bfloat16x8_t y = b;
-  return x * y;
-#else
  return binary_operator_via_float(std::multiplies<Vectorized<float>>(), a, b);
-#endif
 }

 template <>
 Vectorized<c10::BFloat16> inline operator/(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b) {
-#ifdef __ARM_FEATURE_BF16
-  bfloat16x8_t x = a;
-  bfloat16x8_t y = b;
-  return x / y;
-#else
  return binary_operator_via_float(std::divides<Vectorized<float>>(), a, b);
-#endif
 }

 // frac. Implement this here so we can use subtraction
@ -607,19 +544,12 @@ Vectorized<c10::BFloat16> inline fmadd(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b,
    const Vectorized<c10::BFloat16>& c) {
-#ifdef __ARM_FEATURE_BF16
-  bfloat16x8_t x = a;
-  bfloat16x8_t y = b;
-  bfloat16x8_t z = c;
-  return x * y + z;
-#else
  // NOTE [BF16 FMA]: There isn't an FMA that accumulates into BF16!  Also,
  // vbfmlalbq_f32 and vbfmlaltq_f32 take the even and odd-numbered
  // elements, not the bottom and top half, so they don't seem
  // particularly useful here. Ideally we would include dot product in
  // the Vectorized interface...
  return a * b + c;
-#endif
 }

 template <>
@ -627,15 +557,8 @@ Vectorized<c10::BFloat16> inline fnmadd(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b,
    const Vectorized<c10::BFloat16>& c) {
-#ifdef __ARM_FEATURE_BF16
-  bfloat16x8_t x = a;
-  bfloat16x8_t y = b;
-  bfloat16x8_t z = c;
-  return (-x) * y + z;
-#else
  // See NOTE [BF16 FMA] above.
  return -a * b + c;
-#endif
 }

 template <>
@ -643,15 +566,8 @@ Vectorized<c10::BFloat16> inline fmsub(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b,
    const Vectorized<c10::BFloat16>& c) {
-#ifdef __ARM_FEATURE_BF16
-  bfloat16x8_t x = a;
-  bfloat16x8_t y = b;
-  bfloat16x8_t z = c;
-  return x * y - z;
-#else
  // See NOTE [BF16 FMA] above.
  return a * b - c;
-#endif
 }

 template <>
@ -659,15 +575,8 @@ Vectorized<c10::BFloat16> inline fnmsub(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b,
    const Vectorized<c10::BFloat16>& c) {
-#ifdef __ARM_FEATURE_BF16
-  bfloat16x8_t x = a;
-  bfloat16x8_t y = b;
-  bfloat16x8_t z = c;
-  return (-x) * y - z;
-#else
  // See NOTE [BF16 FMA] above.
  return -a * b - c;
-#endif
 }

 #endif // !defined(C10_MOBILE) && defined(__aarch64__)