# Constant-pool entry: 16 bytes, each filled with the value 66 (0x42).
# Used below as the right-hand operand of the byte-wise equality compare.
.LCPI3_0:
.zero 16,66
# test_stdlib_16: returns whether any of the 16 byte lanes in xmm0 equals 0x42.
# Instead of testing the compare mask directly, it calls the outlined
# SIMD16<Int8>.min() specialization and inspects the sign bit of the result.
output.test_stdlib_16(Swift.SIMD16<Swift.UInt8>) -> Swift.Bool:
# push/pop pair around the call; presumably only there to keep the stack
# aligned for the call (the pushed value is never read back into rax).
push rax
# Byte-wise equality: each lane of xmm0 becomes 0xFF where it equals 0x42,
# and 0x00 otherwise.
pcmpeqb xmm0, xmmword ptr [rip + .LCPI3_0]
# Signed minimum over the 16 mask bytes: 0xFF (-1) if any lane matched, else 0.
call (generic specialization <Swift.SIMD16<Swift.Int8>> of (extension in Swift):Swift.SIMD< where A.Scalar: Swift.Comparable>.min() -> A.Scalar)
# Move the sign bit of the minimum into bit 0: al = 1 if any lane matched, else 0.
shr al, 7
pop rcx
ret
# Horizontal signed-byte minimum of the 16 lanes in xmm0; the result ends up in
# the low byte of eax. The reduction halves the active width four times
# (16 -> 8 -> 4 -> 2 -> 1 lanes). Each step builds a per-byte signed min from
# pcmpgtb + pand/pandn/por (a compare-and-blend), since no packed signed-byte
# min instruction is used here.
generic specialization <Swift.SIMD16<Swift.Int8>> of (extension in Swift):Swift.SIMD< where A.Scalar: Swift.Comparable>.min() -> A.Scalar:
# Step 1: 238 = 0b11101110 selects dwords [2,3,2,3], bringing the upper
# 8 bytes of xmm0 down into the lower 8 bytes of xmm1.
pshufd xmm1, xmm0, 238
movdqa xmm2, xmm1
# xmm2 = per-byte mask, 0xFF where the shuffled byte is (signed) greater than
# the original byte, i.e. where the original byte is the smaller one.
pcmpgtb xmm2, xmm0
# Keep original bytes where they are smaller ...
pand xmm0, xmm2
# ... and shuffled bytes everywhere else ...
pandn xmm2, xmm1
# ... then merge: xmm2 = per-byte min(original, shuffled).
por xmm2, xmm0
# Step 2: 85 = 0b01010101 broadcasts dword 1 (bytes 4-7) to every dword, so
# bytes 0-3 are compared against bytes 4-7.
pshufd xmm0, xmm2, 85
movdqa xmm1, xmm0
pcmpgtb xmm1, xmm2
pand xmm2, xmm1
pandn xmm1, xmm0
# xmm1 = per-byte min; bytes 0-3 now cover all 16 input lanes between them.
por xmm1, xmm2
# Step 3: shift each dword right by 16 bits so bytes 2-3 line up with bytes 0-1.
movdqa xmm0, xmm1
psrld xmm0, 16
movdqa xmm2, xmm0
pcmpgtb xmm2, xmm1
pand xmm1, xmm2
pandn xmm2, xmm0
# xmm2 = per-byte min; bytes 0-1 now cover all 16 input lanes between them.
por xmm2, xmm1
# Step 4: shift each word right by 8 bits so byte 1 lines up with byte 0.
movdqa xmm0, xmm2
psrlw xmm0, 8
movdqa xmm1, xmm0
pcmpgtb xmm1, xmm2
pand xmm2, xmm1
pandn xmm1, xmm0
# xmm1 byte 0 = signed minimum of all 16 input lanes.
por xmm1, xmm2
# Copy the low 32 bits to eax; only the low byte (al) holds the reduced
# minimum, and the caller in this listing reads just al.
movd eax, xmm1
ret
The SIMD mask is 16 bytes, and the any function basically amounts to mask != 0, so... even though I'm not an expert at SIMD instruction sets, it feels like this is probably not optimal.
Even if I enable all the advanced modern instruction sets I can think of (-O -Xcc -msse -Xcc -msse2 -Xcc -mavx -Xcc -mavx2), the code generated for the any function still feels suboptimal:
Also Godbolt%0A%7D%0A%0Afunc+test_stdlib16(+input:+SIMD16%3CUInt8%3E)+-%3E+Bool+%7B%0A++++any(input+.%3D%3D+SIMD16(repeating:+0x42))%0A%7D%0A%0Afunc+test_stdlib32(+input:+SIMD32%3CUInt8%3E)+-%3E+Bool+%7B%0A++++any(input+.%3D%3D+SIMD32(repeating:+0x42))%0A%7D'),l:'5',n:'0',o:'Swift+source+%231',t:'0')),k:51.483279086591,l:'4',n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:swiftnightly,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'0',intel:'0',libraryCode:'0',trim:'1'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:1,lang:swift,libs:!(),options:'-O',overrides:!(),selection:(endColumn:1,endLineNumber:1,positionColumn:1,positionLineNumber:1,selectionStartColumn:1,selectionStartLineNumber:1,startColumn:1,startLineNumber:1),source:1),l:'5',n:'0',o:'+x86-64+swiftc+nightly+(Editor+%231)',t:'0')),k:48.516720913409,l:'4',n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4)
Expected behavior
Intuitively, I would expect any(SIMDMask<SIMD16<Int16>>) to compile down to far fewer instructions than it does. At the very least, it seems it could be implemented using two 64-bit comparisons to zero, which I have to believe is more efficient than the code we're generating today.
Environment
Swift version 6.0-dev (LLVM d1625da873daa4c, Swift bae6450bf96dceb)
Target: x86_64-unknown-linux-gnu
Description
Test code:
Building this with `-O` produces:
Which is nice 👍
Unfortunately, when I widen the vector to 16+ elements, the `any` function becomes a massive, outlined, glob of code:
The SIMD mask is 16 bytes, and the `any` function basically amounts to `mask != 0`, so... even though I'm not an expert at SIMD instruction sets, it feels like this is probably not optimal.
Even if I enable all the advanced modern instruction sets I can think of (`-O -Xcc -msse -Xcc -msse2 -Xcc -mavx -Xcc -mavx2`), the code generated for the `any` function still feels suboptimal:
Reproduction
See above.
Also Godbolt%0A%7D%0A%0Afunc+test_stdlib16(+input:+SIMD16%3CUInt8%3E)+-%3E+Bool+%7B%0A++++any(input+.%3D%3D+SIMD16(repeating:+0x42))%0A%7D%0A%0Afunc+test_stdlib32(+input:+SIMD32%3CUInt8%3E)+-%3E+Bool+%7B%0A++++any(input+.%3D%3D+SIMD32(repeating:+0x42))%0A%7D'),l:'5',n:'0',o:'Swift+source+%231',t:'0')),k:51.483279086591,l:'4',n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:swiftnightly,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'0',intel:'0',libraryCode:'0',trim:'1'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:1,lang:swift,libs:!(),options:'-O',overrides:!(),selection:(endColumn:1,endLineNumber:1,positionColumn:1,positionLineNumber:1,selectionStartColumn:1,selectionStartLineNumber:1,startColumn:1,startLineNumber:1),source:1),l:'5',n:'0',o:'+x86-64+swiftc+nightly+(Editor+%231)',t:'0')),k:48.516720913409,l:'4',n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4)
Expected behavior
Intuitively, I would expect `any(SIMDMask<SIMD16<Int16>>)` to compile down to far fewer instructions than it does. At the very least, it seems it could be implemented using two 64-bit comparisons to zero, which I have to believe is more efficient than the code we're generating today.
Environment
Swift version 6.0-dev (LLVM d1625da873daa4c, Swift bae6450bf96dceb) Target: x86_64-unknown-linux-gnu
Additional information
No response