[Build] Make PyTorch compilable with gcc-14 on ARM (#157867)

Fixes numerous ICEs in vreg allocations for SVE+BF16
```
/pytorch/aten/src/ATen/ParallelOpenMP.h:25:9: error: unrecognizable insn:
   25 | #pragma omp parallel
      |         ^~~
(insn 257 256 258 30 (set (reg:VNx8BF 449 [ bf16_vec1_217 ])
        (unspec:VNx8BF [
                (reg:VNx8BF 455)
                (reg:VNx8BF 456)
            ] UNSPEC_IORF)) "/pytorch/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h":228:31 discrim 1 -1
     (nil))
during RTL pass: vregs
/pytorch/aten/src/ATen/ParallelOpenMP.h:25:9: internal compiler error: in extract_insn, at recog.cc:2812
0xd73c33 internal_error(char const*, ...)
	???:0
0xd73d1f fancy_abort(char const*, int, char const*)
	???:0
0x890053 _fatal_insn(char const*, rtx_def const*, char const*, int, char const*)
	???:0
0x890087 _fatal_insn_not_found(rtx_def const*, char const*, int, char const*)
	???:0
0x1379093 extract_insn(rtx_insn*)
	???:0

```
And one in RTL-expand pass while compiling Activation.cpp
```
during RTL pass: expand
In file included from /pytorch/aten/src/ATen/native/cpu/Activation.cpp:12,
                 from /pytorch/build/aten/src/ATen/native/cpu/Activation.cpp.DEFAULT.cpp:1:
/pytorch/aten/src/ATen/native/cpu/Activation.cpp: In lambda function:
/pytorch/aten/src/ATen/native/cpu/Activation.cpp:94:7: internal compiler error: Segmentation fault
   94 |       });
      |       ^
/pytorch/aten/src/ATen/Dispatch.h:201:7: note: in definition of macro 'AT_DISPATCH_SWITCH'
  201 |       __VA_ARGS__                                                           \
      |       ^~~~~~~~~~~
/pytorch/aten/src/ATen/Dispatch.h:72:3: note: in expansion of macro 'AT_PRIVATE_CASE_TYPE_USING_HINT'
   72 |   AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
      |   ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/pytorch/aten/src/ATen/Dispatch.h:214:3: note: in expansion of macro 'AT_DISPATCH_CASE'
  214 |   AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
      |   ^~~~~~~~~~~~~~~~
/pytorch/aten/src/ATen/Dispatch.h:218:34: note: in expansion of macro 'AT_DISPATCH_CASE_FLOATING_TYPES'
  218 |   AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
      |                                  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/pytorch/aten/src/ATen/native/cpu/Activation.cpp:70:5: note: in expansion of macro 'AT_DISPATCH_FLOATING_TYPES'
   70 |     AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&] {
      |     ^~~~~~~~~~~~~~~~~~~~~~~~~~
0xd73c33 internal_error(char const*, ...)
	???:0
0x134f987 rebuild_jump_labels(rtx_insn*)
	???:0
```

Interestingly enough, attempt to compile `Unfold2d.cpp` for `-march=armv8-a+sve` (i.e. without sve+bf16) support also causes ICE
```
/pytorch/aten/src/ATen/native/cpu/Unfold2d.cpp:221:1: error: unrecognizable insn:
  221 | }
      | ^
(insn 2918 2917 2919 296 (set (reg:VNx8BI 5917)
        (unspec:VNx16BI [
                (reg:VNx8BI 5920)
                (reg:VNx8BI 5922)
                (const_vector:VNx4BI [
                        (const_int 0 [0]) repeated x8
                    ])
            ] UNSPEC_TRN1_CONV)) "/usr/include/aarch64-linux-gnu/bits/string_fortified.h":29:33 discrim 1 -1
     (expr_list:REG_EQUAL (const_vector:VNx8BI [
                (const_int 1 [0x1]) repeated x9
                (const_int 0 [0])
                (const_int 1 [0x1]) repeated x2
                (const_int 0 [0]) repeated x4
            ])
        (nil)))
during RTL pass: vregs
```

Which could be worked around by adding
```patch
diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp
index 8ef0741e77af0a..59c76505dd6246 100644
--- a/aten/src/ATen/native/cpu/Unfold2d.cpp
+++ b/aten/src/ATen/native/cpu/Unfold2d.cpp
@@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last(

 /* note: due to write issues, this one cannot be parallelized as well as
  * unfolded2d_copy */
+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE)
+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE
+__attribute__((optimize("no-tree-vectorize")))
+#endif
 void unfolded2d_acc_kernel(
     ScalarType dtype,
     void *finput_data,
```

Fixes https://github.com/pytorch/pytorch/issues/157842

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157867
Approved by: https://github.com/atalman, https://github.com/Skylion007
This commit is contained in:
Nikita Shulga 2025-07-08 18:47:20 -07:00 committed by PyTorch MergeBot
parent ab8874bd26
commit d6237721c0
3 changed files with 14 additions and 2 deletions

View File

@ -220,8 +220,12 @@ class Vectorized<BFloat16> {
Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const; Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
}; };
inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float( #if defined(__GNUC__) && __GNUC__ == 14
const Vectorized<c10::BFloat16>& a) { // Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE
__attribute__((optimize("no-tree-vectorize")))
#endif
inline std::tuple<Vectorized<float>, Vectorized<float>>
convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
static_assert( static_assert(
Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size()); Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f)); auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));

View File

@ -26,6 +26,10 @@ namespace at::native {
namespace { namespace {
#if defined(__GNUC__) && __GNUC__ == 14 && defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
// Workaround for gcc-14.2.0 ICE during RTL pass: expand when compiling for NEON
__attribute__((optimize("no-tree-vectorize")))
#endif
static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) { static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) {
if (at::isReducedFloatingType(input.scalar_type())) { if (at::isReducedFloatingType(input.scalar_type())) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() { AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() {

View File

@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last(
/* note: due to write issues, this one cannot be parallelized as well as /* note: due to write issues, this one cannot be parallelized as well as
* unfolded2d_copy */ * unfolded2d_copy */
#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16)
// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16
__attribute__((optimize("no-tree-vectorize")))
#endif
void unfolded2d_acc_kernel( void unfolded2d_acc_kernel(
ScalarType dtype, ScalarType dtype,
void *finput_data, void *finput_data,