[Build] Make PyTorch compilable with gcc-14 on ARM (#157867)
Fixes numerous ICEs during vreg allocation when compiling for SVE+BF16, for example:
```
/pytorch/aten/src/ATen/ParallelOpenMP.h:25:9: error: unrecognizable insn:
25 | #pragma omp parallel
| ^~~
(insn 257 256 258 30 (set (reg:VNx8BF 449 [ bf16_vec1_217 ])
(unspec:VNx8BF [
(reg:VNx8BF 455)
(reg:VNx8BF 456)
] UNSPEC_IORF)) "/pytorch/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h":228:31 discrim 1 -1
(nil))
during RTL pass: vregs
/pytorch/aten/src/ATen/ParallelOpenMP.h:25:9: internal compiler error: in extract_insn, at recog.cc:2812
0xd73c33 internal_error(char const*, ...)
???:0
0xd73d1f fancy_abort(char const*, int, char const*)
???:0
0x890053 _fatal_insn(char const*, rtx_def const*, char const*, int, char const*)
???:0
0x890087 _fatal_insn_not_found(rtx_def const*, char const*, int, char const*)
???:0
0x1379093 extract_insn(rtx_insn*)
???:0
```
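The fix applied here, shown in the `vec_bfloat16.h` hunk at the bottom of this commit, is to keep gcc-14 from tree-vectorizing `convert_bfloat16_float`.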
And another ICE occurs during the RTL expand pass while compiling `Activation.cpp`:
```
during RTL pass: expand
In file included from /pytorch/aten/src/ATen/native/cpu/Activation.cpp:12,
from /pytorch/build/aten/src/ATen/native/cpu/Activation.cpp.DEFAULT.cpp:1:
/pytorch/aten/src/ATen/native/cpu/Activation.cpp: In lambda function:
/pytorch/aten/src/ATen/native/cpu/Activation.cpp:94:7: internal compiler error: Segmentation fault
94 | });
| ^
/pytorch/aten/src/ATen/Dispatch.h:201:7: note: in definition of macro 'AT_DISPATCH_SWITCH'
201 | __VA_ARGS__ \
| ^~~~~~~~~~~
/pytorch/aten/src/ATen/Dispatch.h:72:3: note: in expansion of macro 'AT_PRIVATE_CASE_TYPE_USING_HINT'
72 | AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/pytorch/aten/src/ATen/Dispatch.h:214:3: note: in expansion of macro 'AT_DISPATCH_CASE'
214 | AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
| ^~~~~~~~~~~~~~~~
/pytorch/aten/src/ATen/Dispatch.h:218:34: note: in expansion of macro 'AT_DISPATCH_CASE_FLOATING_TYPES'
218 | AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/pytorch/aten/src/ATen/native/cpu/Activation.cpp:70:5: note: in expansion of macro 'AT_DISPATCH_FLOATING_TYPES'
70 | AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&] {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~
0xd73c33 internal_error(char const*, ...)
???:0
0x134f987 rebuild_jump_labels(rtx_insn*)
???:0
```
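That ICE is worked around the same way: the `Activation.cpp` hunk below annotates `log_sigmoid_cpu_kernel` with the same attribute for gcc-14 NEON (non-SVE aarch64) builds.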
Interestingly enough, an attempt to compile `Unfold2d.cpp` with `-march=armv8-a+sve` (i.e. without SVE+BF16 support) also causes an ICE:
```
/pytorch/aten/src/ATen/native/cpu/Unfold2d.cpp:221:1: error: unrecognizable insn:
221 | }
| ^
(insn 2918 2917 2919 296 (set (reg:VNx8BI 5917)
(unspec:VNx16BI [
(reg:VNx8BI 5920)
(reg:VNx8BI 5922)
(const_vector:VNx4BI [
(const_int 0 [0]) repeated x8
])
] UNSPEC_TRN1_CONV)) "/usr/include/aarch64-linux-gnu/bits/string_fortified.h":29:33 discrim 1 -1
(expr_list:REG_EQUAL (const_vector:VNx8BI [
(const_int 1 [0x1]) repeated x9
(const_int 0 [0])
(const_int 1 [0x1]) repeated x2
(const_int 0 [0]) repeated x4
])
(nil)))
during RTL pass: vregs
```
This can be worked around by adding:
```patch
diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp
index 8ef0741e77af0a..59c76505dd6246 100644
--- a/aten/src/ATen/native/cpu/Unfold2d.cpp
+++ b/aten/src/ATen/native/cpu/Unfold2d.cpp
@@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last(
/* note: due to write issues, this one cannot be parallelized as well as
* unfolded2d_copy */
+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE)
+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE
+__attribute__((optimize("no-tree-vectorize")))
+#endif
void unfolded2d_acc_kernel(
ScalarType dtype,
void *finput_data,
```
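The committed change applies this same per-function guard in all three files (see the hunks below), adjusting the preprocessor condition to the architecture variant that actually ICEs. As a hypothetical refactoring, not part of this PR, the guard could be centralized in a helper macro along these lines:

```cpp
// Hypothetical helper (not in this PR): centralize the gcc-14 ICE workaround
// so each affected kernel only needs a single annotation. The macro expands to
// nothing on every other compiler, so codegen elsewhere is unchanged.
#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ == 14 && defined(__aarch64__)
#define TORCH_GCC14_ARM_ICE_WORKAROUND __attribute__((optimize("no-tree-vectorize")))
#else
#define TORCH_GCC14_ARM_ICE_WORKAROUND
#endif

// Example usage on a function whose auto-vectorized loop would otherwise ICE:
TORCH_GCC14_ARM_ICE_WORKAROUND
void accumulate(float* dst, const float* src, long n) {
  for (long i = 0; i < n; ++i) {
    dst[i] += src[i];  // stays scalar under gcc-14 on ARM, vectorized elsewhere
  }
}
```

The per-function attribute keeps the rest of each translation unit fully auto-vectorized, which is why the commit prefers it over lowering the optimization level for the whole file.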
Fixes https://github.com/pytorch/pytorch/issues/157842
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157867
Approved by: https://github.com/atalman, https://github.com/Skylion007
parent ab8874bd26
commit d6237721c0
```diff
--- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
+++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
@@ -220,8 +220,12 @@ class Vectorized<BFloat16> {
   Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
 };
 
-inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
-    const Vectorized<c10::BFloat16>& a) {
+#if defined(__GNUC__) && __GNUC__ == 14
+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+inline std::tuple<Vectorized<float>, Vectorized<float>>
+convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
   static_assert(
       Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
   auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
--- a/aten/src/ATen/native/cpu/Activation.cpp
+++ b/aten/src/ATen/native/cpu/Activation.cpp
@@ -26,6 +26,10 @@ namespace at::native {
 
 namespace {
 
+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
+// Workaround for gcc-14.2.0 ICE during RTL pass: expand when compiling for NEON
+__attribute__((optimize("no-tree-vectorize")))
+#endif
 static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) {
   if (at::isReducedFloatingType(input.scalar_type())) {
     AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() {
--- a/aten/src/ATen/native/cpu/Unfold2d.cpp
+++ b/aten/src/ATen/native/cpu/Unfold2d.cpp
@@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last(
 
 /* note: due to write issues, this one cannot be parallelized as well as
  * unfolded2d_copy */
+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16)
+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16
+__attribute__((optimize("no-tree-vectorize")))
+#endif
 void unfolded2d_acc_kernel(
     ScalarType dtype,
     void *finput_data,
```