The generic `@llvm.ssub.sat.v2i64` intrinsic optimizes less well than the target-specific `@llvm.aarch64.neon.sqsub.v2i64` intrinsic.
This godbolt shows the issue: https://godbolt.org/z/4qEe3xM9v
We see two functions that use these two saturating subtractions but are otherwise identical. They generate very similar initial LLVM IR; the main difference is that the specific variant reaches its saturating subtraction through a call to `vqsubq_s64` (requiring some extra `alloca`s), while the generic variant calls `@llvm.ssub.sat.v2i64` directly.
```llvm
define void @specific(ptr dead_on_unwind noalias nocapture noundef writable sret([16 x i8]) align 16 dereferenceable(16) %_0, ptr noalias nocapture noundef align 16 dereferenceable(16) %a, ptr noalias nocapture noundef align 16 dereferenceable(16) %b, ptr noalias nocapture noundef align 16 dereferenceable(16) %c) unnamed_addr {
start:
%0 = alloca [16 x i8], align 16
%1 = alloca [16 x i8], align 16
%2 = alloca [16 x i8], align 16
%3 = alloca [16 x i8], align 16
%4 = alloca [16 x i8], align 16
call void @llvm.lifetime.start.p0(i64 16, ptr %4)
%5 = load <4 x i32>, ptr %b, align 16
store <4 x i32> %5, ptr %3, align 16
%6 = load <4 x i32>, ptr %c, align 16
store <4 x i32> %6, ptr %2, align 16
call void @core::core_arch::aarch64::neon::generated::vqdmull_high_laneq_s32::h66aa645ca0aefe90(ptr noalias nocapture noundef sret([16 x i8]) align 16 dereferenceable(16) %4, ptr noalias nocapture noundef align 16 dereferenceable(16) %3, ptr noalias nocapture noundef align 16 dereferenceable(16) %2)
%_4 = load <2 x i64>, ptr %4, align 16
call void @llvm.lifetime.end.p0(i64 16, ptr %4)
%7 = load <2 x i64>, ptr %a, align 16
store <2 x i64> %7, ptr %1, align 16
store <2 x i64> %_4, ptr %0, align 16
; after InlinerPass this call becomes a call to `@llvm.aarch64.neon.sqsub.v2i64`
call void @core::core_arch::arm_shared::neon::generated::vqsubq_s64::h1887dd6c0650937c(ptr noalias nocapture noundef sret([16 x i8]) align 16 dereferenceable(16) %_0, ptr noalias nocapture noundef align 16 dereferenceable(16) %1, ptr noalias nocapture noundef align 16 dereferenceable(16) %0)
ret void
}
define void @generic(ptr dead_on_unwind noalias nocapture noundef writable sret([16 x i8]) align 16 dereferenceable(16) %_0, ptr noalias nocapture noundef align 16 dereferenceable(16) %a, ptr noalias nocapture noundef align 16 dereferenceable(16) %b, ptr noalias nocapture noundef align 16 dereferenceable(16) %c) unnamed_addr {
start:
%0 = alloca [16 x i8], align 16
%1 = alloca [16 x i8], align 16
%2 = alloca [16 x i8], align 16
call void @llvm.lifetime.start.p0(i64 16, ptr %2)
%3 = load <4 x i32>, ptr %b, align 16
store <4 x i32> %3, ptr %1, align 16
%4 = load <4 x i32>, ptr %c, align 16
store <4 x i32> %4, ptr %0, align 16
call void @core::core_arch::aarch64::neon::generated::vqdmull_high_laneq_s32::h66aa645ca0aefe90(ptr noalias nocapture noundef sret([16 x i8]) align 16 dereferenceable(16) %2, ptr noalias nocapture noundef align 16 dereferenceable(16) %1, ptr noalias nocapture noundef align 16 dereferenceable(16) %0)
%_4 = load <2 x i64>, ptr %2, align 16
call void @llvm.lifetime.end.p0(i64 16, ptr %2)
%5 = load <2 x i64>, ptr %a, align 16
%6 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %5, <2 x i64> %_4)
store <2 x i64> %6, ptr %_0, align 16
ret void
}
```
As a user, I expect the generic variant to eventually be lowered to the specific one.
When the intrinsics are used on their own, this is in fact the case: https://godbolt.org/z/ErjETo3bh. Both functions emit the `sqsub` instruction. This logic appears to be [implemented here](https://github.com/llvm/llvm-project/blob/806ed2625e9569bdb55a13a2b1f9c3e71293fda6/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp#L20840-L20842).
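For illustration, here is a minimal standalone pair (a reduced sketch of my own, not the exact source behind that godbolt link); on AArch64 both functions should lower to a single `sqsub`:
```llvm
; Standalone saturating subtractions (sketch). Both lower to `sqsub v0.2d, v0.2d, v1.2d`.
declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
declare <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64>, <2 x i64>)

define <2 x i64> @sub_specific(<2 x i64> %a, <2 x i64> %b) {
  %r = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %b)
  ret <2 x i64> %r
}

define <2 x i64> @sub_generic(<2 x i64> %a, <2 x i64> %b) {
  %r = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> %b)
  ret <2 x i64> %r
}
```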
But in my example, there is an optimization that `@llvm.aarch64.neon.sqsub.v2i64` participates in but the generic `@llvm.ssub.sat.v2i64` does not: the AArch64 backend folds the `sqdmull2` and the target-specific saturating subtraction into a single `sqdmlsl2` (signed saturating doubling multiply-subtract long, upper halves), while the generic form keeps the separate `sqdmull2` + `sqsub` pair.
```asm
specific:
ldr q0, [x1]
ldr q1, [x2]
ldr q2, [x0]
sqdmlsl2 v2.2d, v0.4s, v1.s[1]
str q2, [x8]
ret
generic:
ldr q0, [x1]
ldr q1, [x2]
sqdmull2 v0.2d, v0.4s, v1.s[1]
ldr q1, [x0]
sqsub v0.2d, v1.2d, v0.2d
str q0, [x8]
ret
```
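To make the missing fold more concrete, here is a reduced sketch (my own approximation, not actual pass output) of what the two functions look like once the `vqdmull_high_laneq_s32` and `vqsubq_s64` calls are inlined and the `alloca`s are promoted. The combine linked above fires on the `@llvm.aarch64.neon.sqsub.v2i64` form, but does not appear to fire for `@llvm.ssub.sat.v2i64`:
```llvm
; Reduced sketch (not actual pass output) of the post-inlining IR.
declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
declare <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64>, <2 x i64>)

; Folded by the AArch64 backend into a single `sqdmlsl2 v.2d, v.4s, v.s[1]`.
define <2 x i64> @specific_inlined(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
  %b_hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %c_lane = shufflevector <4 x i32> %c, <4 x i32> poison, <2 x i32> <i32 1, i32 1>
  %mul = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b_hi, <2 x i32> %c_lane)
  %r = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %mul)
  ret <2 x i64> %r
}

; Stays as a separate `sqdmull2` followed by `sqsub`.
define <2 x i64> @generic_inlined(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) {
  %b_hi = shufflevector <4 x i32> %b, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %c_lane = shufflevector <4 x i32> %c, <4 x i32> poison, <2 x i32> <i32 1, i32 1>
  %mul = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b_hi, <2 x i32> %c_lane)
  %r = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> %mul)
  ret <2 x i64> %r
}
```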
This is unexpected, and it seems to imply that many optimizations are missed when using the generic SIMD intrinsics (at least for Neon). My intuition is that the lowering of the generic intrinsic to the target-specific instruction happens too late; it should happen earlier so that the result can still participate in backend-specific optimizations.
- [discussion in the Rust Zulip](https://rust-lang.zulipchat.com/#narrow/stream/187780-t-compiler.2Fwg-llvm/topic/simd.20fails.20to.20specialize.20for.20target)
- PR where this problem was spotted: https://github.com/rust-lang/stdarch/pull/1575