p12tic / libsimdpp

Portable header-only C++ low level SIMD library
Boost Software License 1.0
1.24k stars 129 forks source link

insn: Work around GCC regression when generating comparisons on ARM #178

Closed p12tic closed 3 months ago

p12tic commented 3 months ago

Starting with GCC 11, and still present as of GCC 14.1, the following code:

uint32x4_t test(float32x4_t a, float32x4_t b)
{
    return vcgtq_f32(a, b);
}

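generates the following assembly on ARMv7, where the comparison is scalarized into four `vcmpe.f32` instructions with the result mask assembled on the stack, instead of a single vector `vcgt.f32` instruction: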

test(__simd128_float32_t, __simd128_float32_t):
        vmov.32 r3, d0[0]
        sub     sp, sp, #16
        vmov    s8, r3
        vmov.32 r3, d0[1]
        vmov    s10, r3
        vmov.32 r3, d1[0]
        vmov    s12, r3
        vmov.32 r3, d1[1]
        vmov    s14, r3
        vmov.32 r3, d2[0]
        vmov    s9, r3
        vmov.32 r3, d2[1]
        vcmpe.f32       s8, s9
        vmov    s11, r3
        vmov.32 r3, d3[0]
        vmrs    APSR_nzcv, FPSCR
        vcmpe.f32       s10, s11
        vmov    s13, r3
        vmov.32 r3, d3[1]
        vmov    s15, r3
        ite     gt
        movgt   r3, #-1
        movle   r3, #0
        vmrs    APSR_nzcv, FPSCR
        vcmpe.f32       s12, s13
        str     r3, [sp]
        ite     gt
        movgt   r3, #-1
        movle   r3, #0
        vmrs    APSR_nzcv, FPSCR
        vcmpe.f32       s14, s15
        str     r3, [sp, #4]
        ite     gt
        movgt   r3, #-1
        movle   r3, #0
        vmrs    APSR_nzcv, FPSCR
        str     r3, [sp, #8]
        ite     gt
        movgt   r3, #-1
        movle   r3, #0
        str     r3, [sp, #12]
        vld1.64 {d0-d1}, [sp:64]
        add     sp, sp, #16
        bx      lr