[Build] Make PyTorch compilable with gcc-14 on ARM (#157867)
Fixes numerous ICEs during vreg allocation when compiling for SVE+BF16, for example:
```
/pytorch/aten/src/ATen/ParallelOpenMP.h:25:9: error: unrecognizable insn:
25 | #pragma omp parallel
| ^~~
(insn 257 256 258 30 (set (reg:VNx8BF 449 [ bf16_vec1_217 ])
(unspec:VNx8BF [
(reg:VNx8BF 455)
(reg:VNx8BF 456)
] UNSPEC_IORF)) "/pytorch/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h":228:31 discrim 1 -1
(nil))
during RTL pass: vregs
/pytorch/aten/src/ATen/ParallelOpenMP.h:25:9: internal compiler error: in extract_insn, at recog.cc:2812
0xd73c33 internal_error(char const*, ...)
???:0
0xd73d1f fancy_abort(char const*, int, char const*)
???:0
0x890053 _fatal_insn(char const*, rtx_def const*, char const*, int, char const*)
???:0
0x890087 _fatal_insn_not_found(rtx_def const*, char const*, int, char const*)
???:0
0x1379093 extract_insn(rtx_insn*)
???:0
```
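The fix applied here, shown in the `vec_bfloat16.h` hunk at the bottom of this commit, is to keep gcc-14 from tree-vectorizing `convert_bfloat16_float`.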
And another ICE occurs during the RTL expand pass while compiling `Activation.cpp`:
```
during RTL pass: expand
In file included from /pytorch/aten/src/ATen/native/cpu/Activation.cpp:12,
from /pytorch/build/aten/src/ATen/native/cpu/Activation.cpp.DEFAULT.cpp:1:
/pytorch/aten/src/ATen/native/cpu/Activation.cpp: In lambda function:
/pytorch/aten/src/ATen/native/cpu/Activation.cpp:94:7: internal compiler error: Segmentation fault
94 | });
| ^
/pytorch/aten/src/ATen/Dispatch.h:201:7: note: in definition of macro 'AT_DISPATCH_SWITCH'
201 | __VA_ARGS__ \
| ^~~~~~~~~~~
/pytorch/aten/src/ATen/Dispatch.h:72:3: note: in expansion of macro 'AT_PRIVATE_CASE_TYPE_USING_HINT'
72 | AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/pytorch/aten/src/ATen/Dispatch.h:214:3: note: in expansion of macro 'AT_DISPATCH_CASE'
214 | AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
| ^~~~~~~~~~~~~~~~
/pytorch/aten/src/ATen/Dispatch.h:218:34: note: in expansion of macro 'AT_DISPATCH_CASE_FLOATING_TYPES'
218 | AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/pytorch/aten/src/ATen/native/cpu/Activation.cpp:70:5: note: in expansion of macro 'AT_DISPATCH_FLOATING_TYPES'
70 | AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&] {
| ^~~~~~~~~~~~~~~~~~~~~~~~~~
0xd73c33 internal_error(char const*, ...)
???:0
0x134f987 rebuild_jump_labels(rtx_insn*)
???:0
```
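That ICE is worked around the same way: the `Activation.cpp` hunk below annotates `log_sigmoid_cpu_kernel` with the same attribute for gcc-14 NEON (non-SVE aarch64) builds.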
Interestingly enough, an attempt to compile `Unfold2d.cpp` with `-march=armv8-a+sve` (i.e. without SVE+BF16 support) also causes an ICE:
```
/pytorch/aten/src/ATen/native/cpu/Unfold2d.cpp:221:1: error: unrecognizable insn:
221 | }
| ^
(insn 2918 2917 2919 296 (set (reg:VNx8BI 5917)
(unspec:VNx16BI [
(reg:VNx8BI 5920)
(reg:VNx8BI 5922)
(const_vector:VNx4BI [
(const_int 0 [0]) repeated x8
])
] UNSPEC_TRN1_CONV)) "/usr/include/aarch64-linux-gnu/bits/string_fortified.h":29:33 discrim 1 -1
(expr_list:REG_EQUAL (const_vector:VNx8BI [
(const_int 1 [0x1]) repeated x9
(const_int 0 [0])
(const_int 1 [0x1]) repeated x2
(const_int 0 [0]) repeated x4
])
(nil)))
during RTL pass: vregs
```
This can be worked around by adding:
```patch
diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp
index 8ef0741e77af0a..59c76505dd6246 100644
--- a/aten/src/ATen/native/cpu/Unfold2d.cpp
+++ b/aten/src/ATen/native/cpu/Unfold2d.cpp
@@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last(
/* note: due to write issues, this one cannot be parallelized as well as
* unfolded2d_copy */
+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE)
+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE
+__attribute__((optimize("no-tree-vectorize")))
+#endif
void unfolded2d_acc_kernel(
ScalarType dtype,
void *finput_data,
```
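The committed change applies this same per-function guard in all three files (see the hunks below), adjusting the preprocessor condition to the architecture variant that actually ICEs. As a hypothetical refactoring, not part of this PR, the guard could be centralized in a helper macro along these lines:

```cpp
// Hypothetical helper (not in this PR): centralize the gcc-14 ICE workaround
// so each affected kernel only needs a single annotation. The macro expands to
// nothing on every other compiler, so codegen elsewhere is unchanged.
#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ == 14 && defined(__aarch64__)
#define TORCH_GCC14_ARM_ICE_WORKAROUND __attribute__((optimize("no-tree-vectorize")))
#else
#define TORCH_GCC14_ARM_ICE_WORKAROUND
#endif

// Example usage on a function whose auto-vectorized loop would otherwise ICE:
TORCH_GCC14_ARM_ICE_WORKAROUND
void accumulate(float* dst, const float* src, long n) {
  for (long i = 0; i < n; ++i) {
    dst[i] += src[i];  // stays scalar under gcc-14 on ARM, vectorized elsewhere
  }
}
```

The per-function attribute keeps the rest of each translation unit fully auto-vectorized, which is why the commit prefers it over lowering the optimization level for the whole file.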
Fixes https://github.com/pytorch/pytorch/issues/157842
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157867
Approved by: https://github.com/atalman, https://github.com/Skylion007
parent ab8874bd26
commit d6237721c0
```diff
--- a/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
+++ b/aten/src/ATen/cpu/vec/sve/vec_bfloat16.h
@@ -220,8 +220,12 @@ class Vectorized<BFloat16> {
   Vectorized<BFloat16> le(const Vectorized<BFloat16>& other) const;
 };
 
-inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
-    const Vectorized<c10::BFloat16>& a) {
+#if defined(__GNUC__) && __GNUC__ == 14
+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+inline std::tuple<Vectorized<float>, Vectorized<float>>
+convert_bfloat16_float(const Vectorized<c10::BFloat16>& a) {
   static_assert(
       Vectorized<c10::BFloat16>::size() == 2 * Vectorized<float>::size());
   auto zero = svreinterpret_bf16_f32(svdup_n_f32(0.0f));
--- a/aten/src/ATen/native/cpu/Activation.cpp
+++ b/aten/src/ATen/native/cpu/Activation.cpp
@@ -26,6 +26,10 @@ namespace at::native {
 
 namespace {
 
+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
+// Workaround for gcc-14.2.0 ICE during RTL pass: expand when compiling for NEON
+__attribute__((optimize("no-tree-vectorize")))
+#endif
 static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const TensorBase &input) {
   if (at::isReducedFloatingType(input.scalar_type())) {
     AT_DISPATCH_REDUCED_FLOATING_TYPES(input.scalar_type(), "log_sigmoid_cpu", [&]() {
--- a/aten/src/ATen/native/cpu/Unfold2d.cpp
+++ b/aten/src/ATen/native/cpu/Unfold2d.cpp
@@ -169,6 +169,10 @@ static void unfolded2d_acc_channels_last(
 
 /* note: due to write issues, this one cannot be parallelized as well as
  * unfolded2d_copy */
+#if defined(__GNUC__) && __GNUC__ == 14 && defined(__ARM_FEATURE_SVE) && !defined(__ARM_FEATURE_BF16)
+// Workaround for gcc-14.2.0 ICE during RTL pass: vregs when compiling for SVE without BF16
+__attribute__((optimize("no-tree-vectorize")))
+#endif
 void unfolded2d_acc_kernel(
     ScalarType dtype,
     void *finput_data,
```