Open SeeSpring opened 1 year ago
Also vmovmskps
and test
should probably just be vtestps
It appears that on latest Rust, the codegen is as expected, using vcmp + test: godbolt%0A++++++++++++%7C%7C+idx.simd_lt(Simd::splat(arr.len()+as+u32)).all()%0A++++)%3B%0A++++idx.as_array().iter().for_each(%7Ci%7C+arr%5B*i+as+usize%5D+%2B%3D+1)%3B%0A%7D'),l:'5',n:'0',o:'Rust+source+%231',t:'0')),k:33.335850856219906,l:'4',m:100,n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:nightly,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'1',intel:'0',libraryCode:'0',trim:'1'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:2,lang:rust,libs:!(),options:'-C+opt-level%3D3+--edition%3D2021++-C+target-feature%3D%2Bsse4.1+',overrides:!((name:edition,value:'2021')),selection:(endColumn:1,endLineNumber:1,positionColumn:1,positionLineNumber:1,selectionStartColumn:1,selectionStartLineNumber:1,startColumn:1,startLineNumber:1),source:1),l:'5',n:'0',o:'+rustc+nightly+(Editor+%231)',t:'0'),(h:compiler,i:(compiler:nightly,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'1',intel:'0',libraryCode:'0',trim:'1'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:1,lang:rust,libs:!(),options:'-C+opt-level%3D3+-C+target-feature%3D%2Bavx512vl',overrides:!((name:edition,value:'2021')),selection:(endColumn:1,endLineNumber:1,positionColumn:1,positionLineNumber:1,selectionStartColumn:1,selectionStartLineNumber:1,startColumn:1,startLineNumber:1),source:1),l:'5',n:'0',o:'+rustc+nightly+(Editor+%231)',t:'0')),k:33.33081581044678,l:'4',m:100,n:'0',o:'',s:1,t:'0'),(g:!((h:output,i:(compilerName:'rustc+nightly',editorid:1,fontScale:14,fontUsePx:'0',j:1,wrap:'1'),l:'5',n:'0',o:'Output+of+rustc+nightly+(Compiler+%231)',t:'0')),k:33.33333333333333,l:'4',n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4)
There are still the bounds checks in the disassembly
.LBB0_2:
mov ecx, dword ptr [rax]
mov edi, ecx
cmp rdi, rdx
jae .LBB0_12
inc dword ptr [rsi + 4*rdi]
mov edi, dword ptr [rax + 4]
cmp rdi, rdx
jae .LBB0_12
inc dword ptr [rsi + 4*rdi]
mov edi, dword ptr [rax + 8]
cmp rdi, rdx
jae .LBB0_12
inc dword ptr [rsi + 4*rdi]
mov edi, dword ptr [rax + 12]
cmp rdi, rdx
jae .LBB0_12
inc dword ptr [rsi + 4*rdi]
mov edi, dword ptr [rax + 16]
cmp rdi, rdx
jae .LBB0_12
inc dword ptr [rsi + 4*rdi]
mov edi, dword ptr [rax + 20]
cmp rdi, rdx
jae .LBB0_12
inc dword ptr [rsi + 4*rdi]
mov edi, dword ptr [rax + 24]
cmp rdi, rdx
jae .LBB0_12
inc dword ptr [rsi + 4*rdi]
mov edi, dword ptr [rax + 28]
cmp rdi, rdx
jae .LBB0_12
inc dword ptr [rsi + 4*rdi]
pop rax
vzeroupper
ret
When using unsafe { idx.as_array().iter().for_each(|i| *arr.get_unchecked_mut(*i as usize) += 1); }
.LBB0_3:
vmovd eax, xmm0
inc dword ptr [rsi + 4*rax]
vpextrd eax, xmm0, 1
inc dword ptr [rsi + 4*rax]
vpextrd eax, xmm0, 2
inc dword ptr [rsi + 4*rax]
vpextrd eax, xmm0, 3
inc dword ptr [rsi + 4*rax]
vextracti128 xmm0, ymm0, 1
vmovd eax, xmm0
inc dword ptr [rsi + 4*rax]
vpextrd eax, xmm0, 1
inc dword ptr [rsi + 4*rax]
vpextrd eax, xmm0, 2
inc dword ptr [rsi + 4*rax]
vpextrd eax, xmm0, 3
inc dword ptr [rsi + 4*rax]
pop rax
vzeroupper
ret
I don't think that's particularly SIMD-related, that's [u32]
's Index
trait. I understand technically that assert
proves removing the check is allowed, but that's a relatively complex relationship that I'm not sure LLVM can handle.
As a comparison, the trivial scalar implementation doesn't remove the bound checks either: godbolt
I tried this code: Godbolt%0A++++++++++++%7C%7C+idx.simd_lt(Simd::splat(arr.len()+as+u32)).all()%0A++++)%3B%0A++++idx.as_array().iter().for_each(%7Ci%7C+arr%5B*i+as+usize%5D+%2B%3D+1)%3B%0A%7D'),l:'5',n:'0',o:'Rust+source+%231',t:'0')),k:50,l:'4',n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:r1700,deviceViewOpen:'1',filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'1',intel:'0',libraryCode:'0',trim:'1'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:1,lang:rust,libs:!(),options:'-C+opt-level%3D3+-C+target-cpu%3Dskylake',overrides:!((name:env,values:!((name:RUSTC_BOOTSTRAP,value:___1)))),selection:(endColumn:1,endLineNumber:1,positionColumn:1,positionLineNumber:1,selectionStartColumn:1,selectionStartLineNumber:1,startColumn:1,startLineNumber:1),source:1),l:'5',n:'0',o:'+rustc+1.70.0+(Editor+%231)',t:'0')),k:50,l:'4',m:100,n:'0',o:'',s:0,t:'0')),l:'2',m:100,n:'0',o:'',t:'0')),version:4)
I expected to see this happen: Bounds checks are eliminated and there is the
vpcmpltd
instructionInstead, this happened: Bounds checks are not eliminated and a
vpmaxud
andvpcmpeqd
pair is usedMeta
RUSTC_BOOTSTRAP=1 rustc --version --verbose
: