rust-lang / portable-simd

The testing ground for the future of portable SIMD in Rust
Apache License 2.0
889 stars 80 forks source link

Bounds check is not eliminated #356

Open SeeSpring opened 1 year ago

SeeSpring commented 1 year ago

I tried this code: Godbolt%0A++++++++++++%7C%7C+idx.simd_lt(Simd::splat(arr.len()+as+u32)).all()%0A++++)%3B%0A++++idx.as_array().iter().for_each(%7Ci%7C+arr%5B*i+as+usize%5D+%2B%3D+1)%3B%0A%7D'),l:'5',n:'0',o:'Rust+source+%231',t:'0')),k:50,l:'4',n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:r1700,deviceViewOpen:'1',filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'1',intel:'0',libraryCode:'0',trim:'1'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:1,lang:rust,libs:!(),options:'-C+opt-level%3D3+-C+target-cpu%3Dskylake',overrides:!((name:env,values:!((name:RUSTC_BOOTSTRAP,value:___1)))),selection:(endColumn:1,endLineNumber:1,positionColumn:1,positionLineNumber:1,selectionStartColumn:1,selectionStartLineNumber:1,startColumn:1,startLineNumber:1),source:1),l:'5',n:'0',o:'+rustc+1.70.0+(Editor+%231)',t:'0')),k:50,l:'4',m:100,n:'0',o:'',s:0,t:'0')),l:'2',m:100,n:'0',o:'',t:'0')),version:4)

#![feature(portable_simd)]
use std::simd::*;
pub fn increment(idx: Simd<u32, 8>, arr: &mut [u32]) {
    assert!(
        (arr.len() >= u32::MAX as usize)
            || idx.simd_lt(Simd::splat(arr.len() as u32)).all()
    );
    idx.as_array().iter().for_each(|i| arr[*i as usize] += 1);
}

I expected to see this happen: Bounds checks are eliminated and there is the vpcmpltd instruction

Instead, this happened: Bounds checks are not eliminated and a vpmaxud and vpcmpeqd pair is used

Meta

RUSTC_BOOTSTRAP=1 rustc --version --verbose:

rustc 1.70.0 (90c541806 2023-05-31)
binary: rustc
commit-hash: 90c541806f23a127002de5b4038be731ba1458ca
commit-date: 2023-05-31
host: x86_64-unknown-linux-gnu
release: 1.70.0
LLVM version: 16.0.2
SeeSpring commented 1 year ago

Also vmovmskps and test should probably just be vtestps

calebzulawski commented 8 months ago

It appears that on latest Rust, the codegen is as expected, using vcmp + test: godbolt%0A++++++++++++%7C%7C+idx.simd_lt(Simd::splat(arr.len()+as+u32)).all()%0A++++)%3B%0A++++idx.as_array().iter().for_each(%7Ci%7C+arr%5B*i+as+usize%5D+%2B%3D+1)%3B%0A%7D'),l:'5',n:'0',o:'Rust+source+%231',t:'0')),k:33.335850856219906,l:'4',m:100,n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:nightly,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'1',intel:'0',libraryCode:'0',trim:'1'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:2,lang:rust,libs:!(),options:'-C+opt-level%3D3+--edition%3D2021++-C+target-feature%3D%2Bsse4.1+',overrides:!((name:edition,value:'2021')),selection:(endColumn:1,endLineNumber:1,positionColumn:1,positionLineNumber:1,selectionStartColumn:1,selectionStartLineNumber:1,startColumn:1,startLineNumber:1),source:1),l:'5',n:'0',o:'+rustc+nightly+(Editor+%231)',t:'0'),(h:compiler,i:(compiler:nightly,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'1',intel:'0',libraryCode:'0',trim:'1'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:1,lang:rust,libs:!(),options:'-C+opt-level%3D3+-C+target-feature%3D%2Bavx512vl',overrides:!((name:edition,value:'2021')),selection:(endColumn:1,endLineNumber:1,positionColumn:1,positionLineNumber:1,selectionStartColumn:1,selectionStartLineNumber:1,startColumn:1,startLineNumber:1),source:1),l:'5',n:'0',o:'+rustc+nightly+(Editor+%231)',t:'0')),k:33.33081581044678,l:'4',m:100,n:'0',o:'',s:1,t:'0'),(g:!((h:output,i:(compilerName:'rustc+nightly',editorid:1,fontScale:14,fontUsePx:'0',j:1,wrap:'1'),l:'5',n:'0',o:'Output+of+rustc+nightly+(Compiler+%231)',t:'0')),k:33.33333333333333,l:'4',n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4)

SeeSpring commented 8 months ago

There are still the bounds checks in the disassembly

.LBB0_2:
        mov     ecx, dword ptr [rax]
        mov     edi, ecx
        cmp     rdi, rdx
        jae     .LBB0_12
        inc     dword ptr [rsi + 4*rdi]
        mov     edi, dword ptr [rax + 4]
        cmp     rdi, rdx
        jae     .LBB0_12
        inc     dword ptr [rsi + 4*rdi]
        mov     edi, dword ptr [rax + 8]
        cmp     rdi, rdx
        jae     .LBB0_12
        inc     dword ptr [rsi + 4*rdi]
        mov     edi, dword ptr [rax + 12]
        cmp     rdi, rdx
        jae     .LBB0_12
        inc     dword ptr [rsi + 4*rdi]
        mov     edi, dword ptr [rax + 16]
        cmp     rdi, rdx
        jae     .LBB0_12
        inc     dword ptr [rsi + 4*rdi]
        mov     edi, dword ptr [rax + 20]
        cmp     rdi, rdx
        jae     .LBB0_12
        inc     dword ptr [rsi + 4*rdi]
        mov     edi, dword ptr [rax + 24]
        cmp     rdi, rdx
        jae     .LBB0_12
        inc     dword ptr [rsi + 4*rdi]
        mov     edi, dword ptr [rax + 28]
        cmp     rdi, rdx
        jae     .LBB0_12
        inc     dword ptr [rsi + 4*rdi]
        pop     rax
        vzeroupper
        ret

When using unsafe { idx.as_array().iter().for_each(|i| *arr.get_unchecked_mut(*i as usize) += 1); }

.LBB0_3:
        vmovd   eax, xmm0
        inc     dword ptr [rsi + 4*rax]
        vpextrd eax, xmm0, 1
        inc     dword ptr [rsi + 4*rax]
        vpextrd eax, xmm0, 2
        inc     dword ptr [rsi + 4*rax]
        vpextrd eax, xmm0, 3
        inc     dword ptr [rsi + 4*rax]
        vextracti128    xmm0, ymm0, 1
        vmovd   eax, xmm0
        inc     dword ptr [rsi + 4*rax]
        vpextrd eax, xmm0, 1
        inc     dword ptr [rsi + 4*rax]
        vpextrd eax, xmm0, 2
        inc     dword ptr [rsi + 4*rax]
        vpextrd eax, xmm0, 3
        inc     dword ptr [rsi + 4*rax]
        pop     rax
        vzeroupper
        ret
calebzulawski commented 8 months ago

I don't think that's particularly SIMD-related, that's [u32]'s Index trait. I understand technically that assert proves removing the check is allowed, but that's a relatively complex relationship that I'm not sure LLVM can handle.

calebzulawski commented 8 months ago

As a comparison, the trivial scalar implementation doesn't remove the bound checks either: godbolt