ARM-software / ComputeLibrary

The Compute Library is a set of computer vision and machine learning functions optimised for both Arm CPUs and GPUs using SIMD technologies.
MIT License
2.83k stars 775 forks source link

Compute Library v24.04 is no longer usable on Cortex-A72 if compiled with multi_isa support #1133

Closed malfet closed 2 months ago

malfet commented 2 months ago

arm_compute_version=v24.04 Build options: {'Werror': '1', 'debug': '0', 'neon': '1', 'opencl': '0', 'os': 'linux', 'openmp': '1', 'cppthreads': '0', 'arch': 'armv8a', 'multi_isa': '1', 'fixed_format_kernels': '1', 'build': 'native'} Git hash=b'4fda7a803eaadf00ba36bd532481a33c18952089'

Platform: AWS A1

Operating System: Linux

Problem description: Compute Library v24.04 compiled with multi_isa support will crash with SIGILL on Raspberry Pi and AWS A1 instances (reported against PyTorch as https://github.com/pytorch/pytorch/issues/132032 ):

# LD_LIBRARY_PATH=build gdb build/examples/graph_lenet
GNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-120.el7
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "aarch64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /root/ComputeLibrary/build/examples/graph_lenet...(no debugging symbols found)...done.
(gdb) r
Starting program: /root/ComputeLibrary/build/examples/graph_lenet 
warning: Error disabling address space randomization: Operation not permitted
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".

Program received signal SIGILL, Illegal instruction.
0x0000ffffbd866880 in std::string::_Rep::_M_dispose () from build/libarm_compute.so
Missing separate debuginfos, use: debuginfo-install glibc-2.17-326.el7_9.3.aarch64 libgcc-4.8.5-44.el7.aarch64 libgomp-4.8.5-44.el7.aarch64 libstdc++-4.8.5-44.el7.aarch64
(gdb) bt
#0  0x0000ffffbd866880 in std::string::_Rep::_M_dispose () from build/libarm_compute.so
#1  0x0000ffffbd2fc69c in _GLOBAL__sub_I_input_transforms_fp16.cpp () from build/libarm_compute.so
#2  0x0000ffffbdbc3470 in _dl_init_internal () from /lib/ld-linux-aarch64.so.1
#3  0x0000ffffbdbb6164 in _dl_start_user () from /lib/ld-linux-aarch64.so.1
Backtrace stopped: previous frame identical to this frame (corrupt stack?)
malfet commented 2 months ago

This can be fixed by moving the static initialization of the Winograd NEON transforms into the implementation_list method using the following patch:

diff --git a/src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp b/src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp
index 35d61fa94d..fdee43672e 100644
--- a/src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp
@@ -38,14 +38,14 @@ void a64_fp16_6x6(unsigned int, const __fp16 *, size_t, size_t, __fp16 *, size_t

 #define IMPL(HEIGHT, WIDTH, FUNC, DRIVER) new Transform ## DRIVER <__fp16, __fp16>(#FUNC, HEIGHT, WIDTH, FUNC)

-static const TransformImplementation<__fp16> transforms_fp16[] = {
-  { IMPL(6, 6, a64_fp16_6x6, Unpadded) },
-  { nullptr },
-};

 template <>
 const TransformImplementation<__fp16> *implementation_list(void)
 {
+static const TransformImplementation<__fp16> transforms_fp16[] = {
+  { IMPL(6, 6, a64_fp16_6x6, Unpadded) },
+  { nullptr },
+};
   return transforms_fp16;
 }

diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp
index c39b1dc083..b312952340 100644
--- a/src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp
@@ -37,14 +37,14 @@ void a64_fp16_4x4_3x3(unsigned int, const __fp16 *, size_t, const __fp16 *, __fp
   new Transform ## DRIVER <__fp16, __fp16>(#FUNC, OUT_HEIGHT, OUT_WIDTH, KERN_HEIGHT, KERN_WIDTH, FUNC)

+template <>
+const TransformImplementation<__fp16> *implementation_list(void)
+{
 static const TransformImplementation<__fp16> transforms_fp16[] = {
   { IMPL(4, 4, 3, 3, a64_fp16_4x4_3x3, Unpadded) },
   { nullptr }
 };

-template <>
-const TransformImplementation<__fp16> *implementation_list(void)
-{
   return transforms_fp16;
 }

@@ -52,4 +52,4 @@ const TransformImplementation<__fp16> *implementation_list(void)
 }  // namespace winograd
 }  // namespace arm_conv

-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
\ No newline at end of file
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp
index 6c8bbe07cf..91f11a4a7b 100644
--- a/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp
@@ -36,14 +36,14 @@ void *a64_fp16_4x4_3x3(unsigned int, const __fp16 *, size_t, size_t, __fp16 *, s
 #define IMPL(KERN_ROWS, KERN_COLS, TRANS_ROWS, TRANS_COLS, KERN) \
   new Transform<__fp16>(#KERN, KERN_ROWS, KERN_COLS, TRANS_ROWS, TRANS_COLS, KERN)

-static const TransformImplementation<__fp16> transforms_fp16[] = {
-  { IMPL(3, 3, 6, 6, a64_fp16_4x4_3x3) },
-  { nullptr }
-};

 template <>
 const TransformImplementation<__fp16> *implementation_list(void)
 {
+static const TransformImplementation<__fp16> transforms_fp16[] = {
+  { IMPL(3, 3, 6, 6, a64_fp16_4x4_3x3) },
+  { nullptr }
+};
   return transforms_fp16;
 }

After applying the patch library can be imported and tests are passing:

$ LD_LIBRARY_PATH=build build/examples/graph_lenet

build/examples/graph_lenet

Threads : 1
Target : Neon
Data type : F32
Data layout : NHWC
Tuner enabled? : false
Cache enabled? : false
Tuner mode : Normal
Tuner file : 
MLGO file : 
Fast math enabled? : false

Test passed
morgolock commented 2 months ago

Hi @malfet

I can't reproduce the problem on either v24.04 or main. Could you please provide more details on how to reproduce the problem? Are you using the prebuilt binaries from GitHub or building from source?

See the details below please.

user@raspberrypi:~/ $ LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./graph_lenet

./graph_lenet

Threads : 1
Target : Neon
Data type : F32
Data layout : NHWC
Tuner enabled? : false
Cache enabled? : false
Tuner mode : Normal
Tuner file : 
MLGO file : 
Fast math enabled? : false

Test passed

user@raspberrypi:~/pabtel01 $ strings libarm_compute.so | grep version
arm_compute_version=v0.0-unreleased Build options: {'os': 'linux', 'opencl': '0', 'asserts': '1', 'neon': '1', 'arch': 'armv8a', 
'benchmark_examples': '0', 'examples': '1', 'multi_isa': '1', 'debug': '0', 'validation_tests': '0', 'cppthreads': '0', 'openmp': '1', 'logging': '0', 'fixed_format_kernels': '1', 'toolchain_prefix': 'aarch64-none-linux-gnu-'} Git hash=b'02f7616dc4d58b68848a85b66e494bd259cf1c38'

pi@raspberrypi-9:~/ $ lscpu
Architecture:            aarch64
  CPU op-mode(s):        32-bit, 64-bit
  Byte Order:            Little Endian
CPU(s):                  4
  On-line CPU(s) list:   0-3
Vendor ID:               ARM
  Model name:            Cortex-A72
    Model:               3
    Thread(s) per core:  1
    Core(s) per cluster: 4
    Socket(s):           -
    Cluster(s):          1
    Stepping:            r0p3
    CPU(s) scaling MHz:  36%
    CPU max MHz:         1800.0000
    CPU min MHz:         600.0000
    BogoMIPS:            108.00
    Flags:               fp asimd evtstrm crc32 cpuid
Caches (sum of all):     
  L1d:                   128 KiB (4 instances)
  L1i:                   192 KiB (4 instances)
  L2:                    1 MiB (1 instance)
Vulnerabilities:         
  Gather data sampling:  Not affected
  Itlb multihit:         Not affected
  L1tf:                  Not affected
  Mds:                   Not affected
  Meltdown:              Not affected
  Mmio stale data:       Not affected
  Retbleed:              Not affected
  Spec rstack overflow:  Not affected
  Spec store bypass:     Vulnerable
  Spectre v1:            Mitigation; __user pointer sanitization
  Spectre v2:            Vulnerable
  Srbds:                 Not affected
  Tsx async abort:       Not affected
morgolock commented 2 months ago

Hi @malfet

The patch fixing the problem has been merged to main and it will be included in v24.08.

Hope this helps.

malfet commented 2 months ago

Thank you for the quick review, @morgolock. I've backported the patch to v24.04, which is used to build the PyTorch v2.4 binaries, and I expect it'll be sufficient to fix the upcoming PyTorch 2.4.1 release for Cortex-A72 users.