Possible suboptimal code generation for SIMD `any` function

Description

Test code:

func test_stdlib_8(_ input: SIMD8<UInt8>) -> Bool {
    any(input .== SIMD8(repeating: 0x42))
}

Building this with -O produces:

.LCPI1_0:
        .byte   66
        .byte   66
        .byte   66
        .byte   66
        .byte   66
        .byte   66
        .byte   66
        .byte   66
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
output.test_stdlib_8(Swift.SIMD8<Swift.UInt8>) -> Swift.Bool:
        movq    xmm0, rdi
        pcmpeqb xmm0, xmmword ptr [rip + .LCPI1_0]
        pmovmskb        eax, xmm0
        test    al, al
        setne   al
        ret

Which is nice 👍

Unfortunately, when I widen the vector to 16+ elements, the any function becomes a massive, outlined, glob of code:

func test_stdlib_16(_ input: SIMD16<UInt8>) -> Bool {
    any(input .== SIMD16(repeating: 0x42))
}

.LCPI3_0:
        .zero   16,66
output.test_stdlib_16(Swift.SIMD16<Swift.UInt8>) -> Swift.Bool:
        push    rax
        pcmpeqb xmm0, xmmword ptr [rip + .LCPI3_0]
        call    (generic specialization <Swift.SIMD16<Swift.Int8>> of (extension in Swift):Swift.SIMD< where A.Scalar: Swift.Comparable>.min() -> A.Scalar)
        shr     al, 7
        pop     rcx
        ret

generic specialization <Swift.SIMD16<Swift.Int8>> of (extension in Swift):Swift.SIMD< where A.Scalar: Swift.Comparable>.min() -> A.Scalar:
        pshufd  xmm1, xmm0, 238
        movdqa  xmm2, xmm1
        pcmpgtb xmm2, xmm0
        pand    xmm0, xmm2
        pandn   xmm2, xmm1
        por     xmm2, xmm0
        pshufd  xmm0, xmm2, 85
        movdqa  xmm1, xmm0
        pcmpgtb xmm1, xmm2
        pand    xmm2, xmm1
        pandn   xmm1, xmm0
        por     xmm1, xmm2
        movdqa  xmm0, xmm1
        psrld   xmm0, 16
        movdqa  xmm2, xmm0
        pcmpgtb xmm2, xmm1
        pand    xmm1, xmm2
        pandn   xmm2, xmm0
        por     xmm2, xmm1
        movdqa  xmm0, xmm2
        psrlw   xmm0, 8
        movdqa  xmm1, xmm0
        pcmpgtb xmm1, xmm2
        pand    xmm2, xmm1
        pandn   xmm1, xmm0
        por     xmm1, xmm2
        movd    eax, xmm1
        ret

The SIMD mask is 16 bytes, and the any function basically amounts to mask != 0, so... even though I'm not an expert at SIMD instruction sets, it feels like this is probably not optimal.

Even if I enable all the advanced modern instruction sets I can think of (-O -Xcc -msse -Xcc -msse2 -Xcc -mavx -Xcc -mavx2), the code generated for the any function still feels suboptimal:

.LCPI5_0:
        .zero   16,128
generic specialization <Swift.SIMD16<Swift.Int8>> of (extension in Swift):Swift.SIMD< where A.Scalar: Swift.Comparable>.min() -> A.Scalar:
        vpxor   xmm0, xmm0, xmmword ptr [rip + .LCPI5_0]
        vpsrlw  xmm1, xmm0, 8
        vpminub xmm0, xmm0, xmm1
        vphminposuw     xmm0, xmm0
        vmovd   eax, xmm0
        add     al, -128
        ret

Reproduction

See above.

Also Godbolt%0A%7D%0A%0Afunc+test_stdlib16(+input:+SIMD16%3CUInt8%3E)+-%3E+Bool+%7B%0A++++any(input+.%3D%3D+SIMD16(repeating:+0x42))%0A%7D%0A%0Afunc+test_stdlib32(+input:+SIMD32%3CUInt8%3E)+-%3E+Bool+%7B%0A++++any(input+.%3D%3D+SIMD32(repeating:+0x42))%0A%7D'),l:'5',n:'0',o:'Swift+source+%231',t:'0')),k:51.483279086591,l:'4',n:'0',o:'',s:0,t:'0'),(g:!((h:compiler,i:(compiler:swiftnightly,filters:(b:'0',binary:'1',binaryObject:'1',commentOnly:'0',debugCalls:'1',demangle:'0',directives:'0',execute:'0',intel:'0',libraryCode:'0',trim:'1'),flagsViewOpen:'1',fontScale:14,fontUsePx:'0',j:1,lang:swift,libs:!(),options:'-O',overrides:!(),selection:(endColumn:1,endLineNumber:1,positionColumn:1,positionLineNumber:1,selectionStartColumn:1,selectionStartLineNumber:1,startColumn:1,startLineNumber:1),source:1),l:'5',n:'0',o:'+x86-64+swiftc+nightly+(Editor+%231)',t:'0')),k:48.516720913409,l:'4',n:'0',o:'',s:0,t:'0')),l:'2',n:'0',o:'',t:'0')),version:4)

Expected behavior

Intuitively, I would expect any(SIMDMask<SIMD16<Int16>>) to compile down to far fewer instructions than it does. At the very least, it seems it could be implemented using two 64-bit comparisons to zero, which I have to believe it more efficient than the code we're generating today.

Environment

Swift version 6.0-dev (LLVM d1625da873daa4c, Swift bae6450bf96dceb) Target: x86_64-unknown-linux-gnu

Additional information

No response

swiftlang / swift