Closed malfet closed 2 months ago
This can be fixed by moving the static initialization of the Winograd NEON transforms into the implementation_list
method, using the following patch:
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp b/src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp
index 35d61fa94d..fdee43672e 100644
--- a/src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/input_transforms_fp16.cpp
@@ -38,14 +38,14 @@ void a64_fp16_6x6(unsigned int, const __fp16 *, size_t, size_t, __fp16 *, size_t
#define IMPL(HEIGHT, WIDTH, FUNC, DRIVER) new Transform ## DRIVER <__fp16, __fp16>(#FUNC, HEIGHT, WIDTH, FUNC)
-static const TransformImplementation<__fp16> transforms_fp16[] = {
- { IMPL(6, 6, a64_fp16_6x6, Unpadded) },
- { nullptr },
-};
template <>
const TransformImplementation<__fp16> *implementation_list(void)
{
+static const TransformImplementation<__fp16> transforms_fp16[] = {
+ { IMPL(6, 6, a64_fp16_6x6, Unpadded) },
+ { nullptr },
+};
return transforms_fp16;
}
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp
index c39b1dc083..b312952340 100644
--- a/src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms_fp16.cpp
@@ -37,14 +37,14 @@ void a64_fp16_4x4_3x3(unsigned int, const __fp16 *, size_t, const __fp16 *, __fp
new Transform ## DRIVER <__fp16, __fp16>(#FUNC, OUT_HEIGHT, OUT_WIDTH, KERN_HEIGHT, KERN_WIDTH, FUNC)
+template <>
+const TransformImplementation<__fp16> *implementation_list(void)
+{
static const TransformImplementation<__fp16> transforms_fp16[] = {
{ IMPL(4, 4, 3, 3, a64_fp16_4x4_3x3, Unpadded) },
{ nullptr }
};
-template <>
-const TransformImplementation<__fp16> *implementation_list(void)
-{
return transforms_fp16;
}
@@ -52,4 +52,4 @@ const TransformImplementation<__fp16> *implementation_list(void)
} // namespace winograd
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
\ No newline at end of file
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp
index 6c8bbe07cf..91f11a4a7b 100644
--- a/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp
@@ -36,14 +36,14 @@ void *a64_fp16_4x4_3x3(unsigned int, const __fp16 *, size_t, size_t, __fp16 *, s
#define IMPL(KERN_ROWS, KERN_COLS, TRANS_ROWS, TRANS_COLS, KERN) \
new Transform<__fp16>(#KERN, KERN_ROWS, KERN_COLS, TRANS_ROWS, TRANS_COLS, KERN)
-static const TransformImplementation<__fp16> transforms_fp16[] = {
- { IMPL(3, 3, 6, 6, a64_fp16_4x4_3x3) },
- { nullptr }
-};
template <>
const TransformImplementation<__fp16> *implementation_list(void)
{
+static const TransformImplementation<__fp16> transforms_fp16[] = {
+ { IMPL(3, 3, 6, 6, a64_fp16_4x4_3x3) },
+ { nullptr }
+};
return transforms_fp16;
}
After applying the patch, the library can be imported and the tests pass:
$ LD_LIBRARY_PATH=build build/examples/graph_lenet
build/examples/graph_lenet
Threads : 1
Target : Neon
Data type : F32
Data layout : NHWC
Tuner enabled? : false
Cache enabled? : false
Tuner mode : Normal
Tuner file :
MLGO file :
Fast math enabled? : false
Test passed
Hi @malfet
I can't reproduce the problem either on v24.04 or main. Could you please provide more details on how to reproduce the problem? Are you using the prebuilt binaries from github or building from source?
See the details below please.
user@raspberrypi:~/ $ LD_LIBRARY_PATH=.:$LD_LIBRARY_PATH ./graph_lenet
./graph_lenet
Threads : 1
Target : Neon
Data type : F32
Data layout : NHWC
Tuner enabled? : false
Cache enabled? : false
Tuner mode : Normal
Tuner file :
MLGO file :
Fast math enabled? : false
Test passed
user@raspberrypi:~/pabtel01 $ strings libarm_compute.so | grep version
arm_compute_version=v0.0-unreleased Build options: {'os': 'linux', 'opencl': '0', 'asserts': '1', 'neon': '1', 'arch': 'armv8a',
'benchmark_examples': '0', 'examples': '1', 'multi_isa': '1', 'debug': '0', 'validation_tests': '0', 'cppthreads': '0', 'openmp': '1', 'logging': '0', 'fixed_format_kernels': '1', 'toolchain_prefix': 'aarch64-none-linux-gnu-'} Git hash=b'02f7616dc4d58b68848a85b66e494bd259cf1c38'
pi@raspberrypi-9:~/ $ lscpu
Architecture: aarch64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 4
On-line CPU(s) list: 0-3
Vendor ID: ARM
Model name: Cortex-A72
Model: 3
Thread(s) per core: 1
Core(s) per cluster: 4
Socket(s): -
Cluster(s): 1
Stepping: r0p3
CPU(s) scaling MHz: 36%
CPU max MHz: 1800.0000
CPU min MHz: 600.0000
BogoMIPS: 108.00
Flags: fp asimd evtstrm crc32 cpuid
Caches (sum of all):
L1d: 128 KiB (4 instances)
L1i: 192 KiB (4 instances)
L2: 1 MiB (1 instance)
Vulnerabilities:
Gather data sampling: Not affected
Itlb multihit: Not affected
L1tf: Not affected
Mds: Not affected
Meltdown: Not affected
Mmio stale data: Not affected
Retbleed: Not affected
Spec rstack overflow: Not affected
Spec store bypass: Vulnerable
Spectre v1: Mitigation; __user pointer sanitization
Spectre v2: Vulnerable
Srbds: Not affected
Tsx async abort: Not affected
Hi @malfet
The patch fixing the problem has been merged to main and it will be included in v24.08.
Hope this helps.
Thank you for the quick review, @morgolock. I've backported the patch to v24.04, which is used to build the PyTorch v2.4 binaries, and expect it will be sufficient to fix the upcoming PyTorch 2.4.1 release for Cortex-A72 users.
arm_compute_version=v24.04 Build options: {'Werror': '1', 'debug': '0', 'neon': '1', 'opencl': '0', 'os': 'linux', 'openmp': '1', 'cppthreads': '0', 'arch': 'armv8a', 'multi_isa': '1', 'fixed_format_kernels': '1', 'build': 'native'} Git hash=b'4fda7a803eaadf00ba36bd532481a33c18952089'
Platform: AWS A1
Operating System: Linux
Problem description: Compute Library v24.04 compiled with multi_isa support will crash with SIGILL on Raspberry Pi and AWS A1 instances (reported against PyTorch as https://github.com/pytorch/pytorch/issues/132032 ):