[LoongArch] SIMD intrinsics not fully inlined in caller with target feature globally enabled

heiher commented 4 days ago

I tried this code:

#![feature(stdarch_loongarch)]
use std::arch::loongarch64::*;

pub unsafe fn simd(s: i32) -> i32 {
    lsx_vpickve2gr_b::<0>(lsx_vreplgr2vr_b(s))
}

rustc --crate-type lib -C opt-level=3 --emit llvm-ir -o lsx.ll lsx.rs

I expected to see this happen:

The lsx intrinsics are inlined within simd functions when the lsx target feature is globally enabled.

; loong64::simd
; Function Attrs: nofree nosync nounwind memory(none) uwtable
define noundef i32 @_ZN7loong644simd17h54d99178ac0d0f82E(i32 noundef signext %s) unnamed_addr #0 {
start:
  %_2 = tail call <16 x i8> @llvm.loongarch.lsx.vreplgr2vr.b(i32 noundef %s) #2
  %_0 = tail call noundef i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8> %_2, i32 noundef 0) #2
  ret i32 %_0
}

; Function Attrs: nofree nosync nounwind memory(none)
declare <16 x i8> @llvm.loongarch.lsx.vreplgr2vr.b(i32) unnamed_addr #1

; Function Attrs: nofree nosync nounwind memory(none)
declare i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8>, i32 immarg) unnamed_addr #1

attributes #0 = { nofree nosync nounwind memory(none) uwtable "target-cpu"="generic" "target-features"="+f,+d,+lsx,+lsx,+d,+f" }

Instead, this happened:

; core::core_arch::loongarch64::lsx::generated::lsx_vpickve2gr_b
; Function Attrs: inlinehint nofree nosync nounwind memory(argmem: read) uwtable
define internal fastcc noundef i32 @_ZN4core9core_arch11loongarch643lsx9generated16lsx_vpickve2gr_b17hbf4a6d8f95630043E(ptr noalias nocapture noundef readonly align 16 dereferenceable(16) %a) unnamed_addr #0 {
start:
  %0 = load <16 x i8>, ptr %a, align 16
  %_0 = tail call noundef i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8> %0, i32 noundef 0) #4
  ret i32 %_0
}

; core::core_arch::loongarch64::lsx::generated::lsx_vreplgr2vr_b
; Function Attrs: inlinehint nofree nosync nounwind memory(argmem: write) uwtable
define internal fastcc void @_ZN4core9core_arch11loongarch643lsx9generated16lsx_vreplgr2vr_b17h0060558a0a7e8678E(ptr dead_on_unwind noalias nocapture noundef writable writeonly align 16 dereferenceable(16) %_0, i32 noundef signext %a) unnamed_addr #1 {
start:
  %0 = tail call <16 x i8> @llvm.loongarch.lsx.vreplgr2vr.b(i32 noundef %a) #4
  store <16 x i8> %0, ptr %_0, align 16
  ret void
}

; loong64::simd
; Function Attrs: nofree nosync nounwind memory(none) uwtable
define noundef i32 @_ZN7loong644simd17h54d99178ac0d0f82E(i32 noundef signext %s) unnamed_addr #2 {
start:
  %0 = alloca [16 x i8], align 16
; call core::core_arch::loongarch64::lsx::generated::lsx_vreplgr2vr_b
  call fastcc void @_ZN4core9core_arch11loongarch643lsx9generated16lsx_vreplgr2vr_b17h0060558a0a7e8678E(ptr noalias nocapture noundef nonnull align 16 dereferenceable(16) %0, i32 noundef signext %s)
; call core::core_arch::loongarch64::lsx::generated::lsx_vpickve2gr_b
  %_0 = call fastcc noundef i32 @_ZN4core9core_arch11loongarch643lsx9generated16lsx_vpickve2gr_b17hbf4a6d8f95630043E(ptr noalias nocapture noundef nonnull align 16 dereferenceable(16) %0)
  ret i32 %_0
}

; Function Attrs: nofree nosync nounwind memory(none)
declare i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8>, i32 immarg) unnamed_addr #3

; Function Attrs: nofree nosync nounwind memory(none)
declare <16 x i8> @llvm.loongarch.lsx.vreplgr2vr.b(i32) unnamed_addr #3

attributes #0 = { inlinehint nofree nosync nounwind memory(argmem: read) uwtable "target-cpu"="generic" "target-features"="+f,+d,+lsx,+lsx,+d,+f" }
attributes #1 = { inlinehint nofree nosync nounwind memory(argmem: write) uwtable "target-cpu"="generic" "target-features"="+f,+d,+lsx,+lsx,+d,+f" }
attributes #2 = { nofree nosync nounwind memory(none) uwtable "target-cpu"="generic" "target-features"="+f,+d,+lsx" }

Meta

rustc --version --verbose:

rustc 1.84.0-nightly (3fee0f12e 2024-11-20)
binary: rustc
commit-hash: 3fee0f12e4f595948f8f54f57c8b7a7a58127124
commit-date: 2024-11-20
host: loongarch64-unknown-linux-gnu
release: 1.84.0-nightly
LLVM version: 19.1.3

rustc -Z unstable-options --print target-spec-json:

{
  "arch": "loongarch64",
  "code-model": "medium",
  "crt-objects-fallback": "false",
  "crt-static-respected": true,
  "data-layout": "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128",
  "direct-access-external-data": false,
  "dynamic-linking": true,
  "env": "gnu",
  "features": "+f,+d,+lsx",
  "has-rpath": true,
  "has-thread-local": true,
  "linker-flavor": "gnu-cc",
  "llvm-abiname": "lp64d",
  "llvm-target": "loongarch64-unknown-linux-gnu",
  "max-atomic-width": 64,
  "metadata": {
    "description": "LoongArch64 Linux, LP64D ABI (kernel 5.19, glibc 2.36)",
    "host_tools": true,
    "std": true,
    "tier": 2
  },
  "os": "linux",
  "position-independent-executables": true,
  "relro-level": "full",
  "supported-sanitizers": [
    "address",
    "leak",
    "memory",
    "thread",
    "cfi"
  ],
  "supported-split-debuginfo": [
    "packed",
    "unpacked",
    "off"
  ],
  "supports-xray": true,
  "target-family": [
    "unix"
  ],
  "target-pointer-width": "64"
}

heiher commented 4 days ago

If the caller has a target feature attribute lsx, then the code generation is as expected.

#![feature(stdarch_loongarch)]
#![feature(loongarch_target_feature)]
use std::arch::loongarch64::*;

#[target_feature(enable = "lsx")]
pub unsafe fn simd(s: i32) -> i32 {
    lsx_vpickve2gr_b::<0>(lsx_vreplgr2vr_b(s))
}

heiher commented 4 days ago

Even without target feature attributes, x86 is still inlined.

use std::arch::x86_64::*;

pub unsafe fn simd(s: i32) -> i32 {
    let a = _mm_set1_epi8(s as i8);
    let b = _mm_srl_epi64(a, a);
    _mm_movemask_epi8(b) as i32
}

I strongly suspect this is an additional optimization from the LLVM backend, as the two are clearly different when compared to the Rust MIR.

without target feature attribute sse2:

fn simd(_1: i32) -> i32 {
    debug s => _1;
    let mut _0: i32;
    let _2: std::arch::x86_64::__m128i;
    let mut _3: i8;
    scope 1 {
        debug a => _2;
        let _4: std::arch::x86_64::__m128i;
        scope 2 {
            debug b => _4;
        }
    }

    bb0: {
        StorageLive(_3);
        _3 = copy _1 as i8 (IntToInt);
        _2 = std::arch::x86_64::_mm_set1_epi8(move _3) -> [return: bb1, unwind continue];
    }

    bb1: {
        StorageDead(_3);
        _4 = std::arch::x86_64::_mm_srl_epi64(copy _2, move _2) -> [return: bb2, unwind continue];
    }

    bb2: {
        _0 = std::arch::x86_64::_mm_movemask_epi8(move _4) -> [return: bb3, unwind continue];
    }

    bb3: {
        return;
    }
}

with target feature attribute sse2:

fn simd(_1: i32) -> i32 {
    debug s => _1;
    let mut _0: i32;
    let _2: std::arch::x86_64::__m128i;
    let mut _3: i8;
    scope 1 {
        debug a => _2;
        let _4: std::arch::x86_64::__m128i;
        scope 2 {
            debug b => _4;
            scope 6 (inlined std::arch::x86_64::_mm_movemask_epi8) {
                let _9: core::core_arch::simd::i8x16;
                let mut _11: core::core_arch::simd::i8x16;
                let mut _12: u32;
                let mut _13: u16;
                scope 7 {
                    let _10: core::core_arch::simd::i8x16;
                    scope 8 {
                    }
                }
            }
        }
        scope 5 (inlined std::arch::x86_64::_mm_srl_epi64) {
            let mut _6: core::core_arch::simd::i64x2;
            let mut _7: core::core_arch::simd::i64x2;
            let mut _8: core::core_arch::simd::i64x2;
        }
    }
    scope 3 (inlined std::arch::x86_64::_mm_set1_epi8) {
        scope 4 (inlined _mm_set_epi8) {
            let mut _5: core::core_arch::simd::i8x16;
        }
    }

    bb0: {
        StorageLive(_3);
        _3 = copy _1 as i8 (IntToInt);
        StorageLive(_5);
        _5 = core::core_arch::simd::i8x16::new(copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, copy _3, move _3) -> [return: bb1, unwind continue];
    }

    bb1: {
        _2 = move _5 as std::arch::x86_64::__m128i (Transmute);
        StorageDead(_5);
        StorageDead(_3);
        StorageLive(_6);
        StorageLive(_7);
        _7 = <__m128i as core::core_arch::x86::m128iExt>::as_i64x2(copy _2) -> [return: bb2, unwind continue];
    }

    bb2: {
        StorageLive(_8);
        _8 = <__m128i as core::core_arch::x86::m128iExt>::as_i64x2(move _2) -> [return: bb3, unwind continue];
    }

    bb3: {
        _6 = core::core_arch::x86::sse2::psrlq(move _7, move _8) -> [return: bb4, unwind unreachable];
    }

    bb4: {
        StorageDead(_8);
        StorageDead(_7);
        _4 = move _6 as std::arch::x86_64::__m128i (Transmute);
        StorageDead(_6);
        StorageLive(_9);
        StorageLive(_10);
        _9 = core::core_arch::simd::i8x16::splat(const 0_i8) -> [return: bb5, unwind continue];
    }

    bb5: {
        StorageLive(_11);
        _11 = <__m128i as core::core_arch::x86::m128iExt>::as_i8x16(move _4) -> [return: bb6, unwind continue];
    }

    bb6: {
        _10 = std::intrinsics::simd::simd_lt::<core::core_arch::simd::i8x16, core::core_arch::simd::i8x16>(move _11, move _9) -> [return: bb7, unwind unreachable];
    }

    bb7: {
        StorageDead(_11);
        StorageLive(_12);
        StorageLive(_13);
        _13 = simd_bitmask::<core::core_arch::simd::i8x16, u16>(move _10) -> [return: bb8, unwind unreachable];
    }

    bb8: {
        _12 = move _13 as u32 (IntToInt);
        StorageDead(_13);
        _0 = move _12 as i32 (IntToInt);
        StorageDead(_12);
        StorageDead(_10);
        StorageDead(_9);
        return;
    }
}

nikic commented 4 days ago

@heiher Without having looked at the details here, this usually means you need to implement the areInlineCompatible() TTI hook.

heiher commented 4 days ago

@heiher Without having looked at the details here, this usually means you need to implement the areInlineCompatible() TTI hook.

I agree with you. I just caught it.

heiher commented 4 days ago

EDIT: LLVM PR: https://github.com/llvm/llvm-project/pull/117493

rust-lang / rust

[LoongArch] SIMD intrinsics not fully inlined in caller with target feature globally enabled #133281

Meta