Open heiher opened 4 days ago
If the caller has the `lsx` target feature attribute, then the code generation is as expected.
#![feature(stdarch_loongarch)]
#![feature(loongarch_target_feature)]
use std::arch::loongarch64::*;
/// Reproducer for the LoongArch case: passes `s` to `lsx_vreplgr2vr_b` and
/// feeds the resulting vector to `lsx_vpickve2gr_b::<0>`, returning its result.
///
/// The `#[target_feature(enable = "lsx")]` attribute on this caller is the
/// variant for which the report says code generation is as expected.
///
/// # Safety
///
/// NOTE(review): presumably the caller must ensure the CPU supports LSX, as for
/// any `#[target_feature]` function — confirm against the intrinsics' docs.
#[target_feature(enable = "lsx")]
pub unsafe fn simd(s: i32) -> i32 {
lsx_vpickve2gr_b::<0>(lsx_vreplgr2vr_b(s))
}
On x86, the intrinsics are still inlined even without target feature attributes.
use std::arch::x86_64::*;
/// Reproducer for the x86 case; it carries no `#[target_feature]` attribute.
///
/// The locals `a` and `b` are the ones named by the `debug a` / `debug b`
/// entries in the MIR listings later in this report.
pub unsafe fn simd(s: i32) -> i32 {
// `s as i8` keeps only the low 8 bits of `s` before it is passed on.
let a = _mm_set1_epi8(s as i8);
// `a` is passed as both arguments (the value to shift and the shift count).
let b = _mm_srl_epi64(a, a);
_mm_movemask_epi8(b) as i32
}
I strongly suspect this is an additional optimization from the LLVM backend, as the two are clearly different when compared to the Rust MIR.
`sse2`:
fn simd(_1: i32) -> i32 {
debug s => _1;
let mut _0: i32;
let _2: std::arch::x86_64::__m128i;
let mut _3: i8;
scope 1 {
debug a => _2;
let _4: std::arch::x86_64::__m128i;
scope 2 {
debug b => _4;
}
}
bb0: {
StorageLive(_3);
_3 = copy _1 as i8 (IntToInt);
_2 = std::arch::x86_64::_mm_set1_epi8(move _3) -> [return: bb1, unwind continue];
}
bb1: {
StorageDead(_3);
_4 = std::arch::x86_64::_mm_srl_epi64(copy _2, move _2) -> [return: bb2, unwind continue];
}
bb2: {
_0 = std::arch::x86_64::_mm_movemask_epi8(move _4) -> [return: bb3, unwind continue];
}
bb3: {
return;
}
}
`sse2`:
fn simd(_1: i32) -> i32 {
debug s => _1;
let mut _0: i32;
let _2: std::arch::x86_64::__m128i;
let mut _3: i8;
scope 1 {
debug a => _2;
let _4: std::arch::x86_64::__m128i;
scope 2 {
debug b => _4;
scope 6 (inlined std::arch::x86_64::_mm_movemask_epi8) {
let _9: core::core_arch::simd::i8x16;
let mut _11: core::core_arch::simd::i8x16;
let mut _12: u32;
let mut _13: u16;
scope 7 {
let _10: core::core_arch::simd::i8x16;
scope 8 {
}
}
}
}
scope 5 (inlined std::arch::x86_64::_mm_srl_epi64) {
let mut _6: core::core_arch::simd::i64x2;
let mut _7: core::core_arch::simd::i64x2;
let mut _8: core::core_arch::simd::i64x2;
}
}
scope 3 (inlined std::arch::x86_64::_mm_set1_epi8) {
scope 4 (inlined _mm_set_epi8) {
let mut _5: core::core_arch::simd::i8x16;
}
}
bb0: {
StorageLive(_3);
_3 = copy _1 as i8 (IntToInt);
StorageLive(_5);
_5 = core::core_arch::simd::i8x16::new(copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, move _3) -> [return: bb1, unwind continue];
}
bb1: {
_2 = move _5 as std::arch::x86_64::__m128i (Transmute);
StorageDead(_5);
StorageDead(_3);
StorageLive(_6);
StorageLive(_7);
_7 = <__m128i as core::core_arch::x86::m128iExt>::as_i64x2(copy _2) -> [return: bb2, unwind continue];
}
bb2: {
StorageLive(_8);
_8 = <__m128i as core::core_arch::x86::m128iExt>::as_i64x2(move _2) -> [return: bb3, unwind continue];
}
bb3: {
_6 = core::core_arch::x86::sse2::psrlq(move _7, move _8) -> [return: bb4, unwind unreachable];
}
bb4: {
StorageDead(_8);
StorageDead(_7);
_4 = move _6 as std::arch::x86_64::__m128i (Transmute);
StorageDead(_6);
StorageLive(_9);
StorageLive(_10);
_9 = core::core_arch::simd::i8x16::splat(const 0_i8) -> [return: bb5, unwind continue];
}
bb5: {
StorageLive(_11);
_11 = <__m128i as core::core_arch::x86::m128iExt>::as_i8x16(move _4) -> [return: bb6, unwind continue];
}
bb6: {
_10 = std::intrinsics::simd::simd_lt::<core::core_arch::simd::i8x16, core::core_arch::simd::i8x16>(move _11, move _9) -> [return: bb7, unwind unreachable];
}
bb7: {
StorageDead(_11);
StorageLive(_12);
StorageLive(_13);
_13 = simd_bitmask::<core::core_arch::simd::i8x16, u16>(move _10) -> [return: bb8, unwind unreachable];
}
bb8: {
_12 = move _13 as u32 (IntToInt);
StorageDead(_13);
_0 = move _12 as i32 (IntToInt);
StorageDead(_12);
StorageDead(_10);
StorageDead(_9);
return;
}
}
@heiher Without having looked at the details here, this usually means you need to implement the areInlineCompatible() TTI hook.
@heiher Without having looked at the details here, this usually means you need to implement the areInlineCompatible() TTI hook.
I agree with you. I just caught it.
EDIT: LLVM PR: https://github.com/llvm/llvm-project/pull/117493
I tried this code:
I expected to see this happen:
The `lsx` intrinsics are inlined within `simd` functions when the `lsx` target feature is globally enabled.
Instead, this happened:
Meta
`rustc --version --verbose`:
`rustc -Z unstable-options --print target-spec-json`: