seanmonstar / httparse

A push parser for the HTTP 1.x protocol in Rust.
https://docs.rs/httparse
Apache License 2.0
567 stars 111 forks source link

perf: simd neon #133

Closed AaronO closed 1 year ago

AaronO commented 1 year ago

First pass at neon support, building off #132

I shared some promising benches here: https://github.com/seanmonstar/httparse/pull/123#issuecomment-1509870871

Overall it's a net-positive but not totally optimal.

Next steps we would want to land #123 and then refactor it into a SWAR SIMD backend and stack them appropriately to avoid duplicated work and cleaner logic.

Additionally, offsetz is very much central to total perf; I believe the vectorized implementation I added underperforms due to inefficiently reloading the offset lookup table instead of keeping it in a register at the top of the header parsing loop.

AaronO commented 1 year ago

ASM of vectorized offsetz

In context of match_uri_vectored

.section __TEXT,__text,regular,pure_instructions
        .globl  httparse::simd::neon::match_uri_vectored
        .p2align        2
httparse::simd::neon::match_uri_vectored:
Lfunc_begin14:
        .cfi_startproc
        stp x29, x30, [sp, #-16]!
        .cfi_def_cfa_offset 16
        mov x29, sp
        .cfi_def_cfa w29, 16
        .cfi_offset w30, -8
        .cfi_offset w29, -16
        .cfi_remember_state
        mov x8, x0
        ldr x0, [x0, #16]
        ldp x9, x1, [x8]
        movi.16b v0, #129
        movi.16b v1, #162
        movi.16b v2, #253
        movi.16b v3, #60
Lloh22:
        adrp x10, lCPI14_0@PAGE
Lloh23:
        ldr q4, [x10, lCPI14_0@PAGEOFF]
        mov w10, #16
LBB14_1:
        subs x11, x1, x0
        b.lo LBB14_5
        cmp x11, #15
        b.ls LBB14_4
        ldr q5, [x9, x0]
        add.16b v6, v5, v0
        cmhi.16b v6, v1, v6
        and.16b v5, v5, v2
        cmeq.16b v5, v5, v3
        orr.16b v5, v6, v5
        and.16b v5, v5, v4
        umaxv.16b b5, v5
        fmov w11, s5
        sub w11, w10, w11, uxtb
        add x0, x11, x0
        str x0, [x8, #16]
        cmp w11, #16
        b.eq LBB14_1
LBB14_4:
        .cfi_def_cfa wsp, 16
        ldp x29, x30, [sp], #16
        .cfi_def_cfa_offset 0
        .cfi_restore w30
        .cfi_restore w29
        ret
LBB14_5:
        .cfi_restore_state
Lloh24:
        adrp x2, l___unnamed_15@PAGE
Lloh25:
        add x2, x2, l___unnamed_15@PAGEOFF
        bl core::slice::index::slice_start_index_len_fail
        .loh AdrpLdr    Lloh22, Lloh23
        .loh AdrpAdd    Lloh24, Lloh25

ASM of unvectorized offsetz

        .globl  httparse::simd::neon::match_uri_vectored
        .p2align        2
httparse::simd::neon::match_uri_vectored:
Lfunc_begin14:
        .cfi_startproc
        stp x29, x30, [sp, #-16]!
        .cfi_def_cfa_offset 16
        mov x29, sp
        .cfi_def_cfa w29, 16
        .cfi_offset w30, -8
        .cfi_offset w29, -16
        .cfi_remember_state
        ldp x1, x8, [x0, #8]
        ldr x9, [x0]
        movi.16b v0, #129
        movi.16b v1, #162
        movi.16b v2, #253
        movi.16b v3, #60
        b LBB14_2

        mov w10, #16
        add x8, x10, x8
        str x8, [x0, #16]
        cmp x10, #16
        b.ne LBB14_42
LBB14_2:
        subs x10, x1, x8
        b.lo LBB14_43
        cmp x10, #15
        b.ls LBB14_42
        ldr q4, [x9, x8]
        add.16b v5, v4, v0
        cmhi.16b v5, v1, v5
        and.16b v4, v4, v2
        cmeq.16b v4, v4, v3
        orr.16b v4, v5, v4
        fmov x10, d4
        cbnz x10, LBB14_24
        mov.d x10, v4[1]
        cbz x10, LBB14_1
        tst x10, #0xff
        b.eq LBB14_8
        mov x10, #0
        b LBB14_23
LBB14_8:
        tst x10, #0xff00
        b.eq LBB14_10
        mov w10, #1
        b LBB14_23
LBB14_10:
        tst x10, #0xff0000
        b.eq LBB14_12
        mov w10, #2
        b LBB14_23
LBB14_12:
        tst x10, #0xff000000
        b.eq LBB14_14
        mov w10, #3
        b LBB14_23
LBB14_14:
        tst x10, #0xff00000000
        b.eq LBB14_16
        mov w10, #4
        b LBB14_23
LBB14_16:
        tst x10, #0xff0000000000
        b.eq LBB14_18
        mov w10, #5
        b LBB14_23
LBB14_18:
        tst x10, #0xff000000000000
        b.eq LBB14_20
        mov w10, #6
        b LBB14_23
LBB14_20:
        lsr x10, x10, #56
        cbnz x10, LBB14_22
        mov w10, #8
        b LBB14_23

        mov w10, #7
LBB14_23:
        add x10, x10, #8
        add x8, x10, x8
        str x8, [x0, #16]
        cmp x10, #16
        b.eq LBB14_2
        b LBB14_42

        tst x10, #0xff
        b.eq LBB14_26
        mov x9, #0
        b LBB14_41
LBB14_26:
        tst x10, #0xff00
        b.eq LBB14_28
        mov w9, #1
        b LBB14_41
LBB14_28:
        tst x10, #0xff0000
        b.eq LBB14_30
        mov w9, #2
        b LBB14_41
LBB14_30:
        tst x10, #0xff000000
        b.eq LBB14_32
        mov w9, #3
        b LBB14_41
LBB14_32:
        tst x10, #0xff00000000
        b.eq LBB14_34
        mov w9, #4
        b LBB14_41
LBB14_34:
        tst x10, #0xff0000000000
        b.eq LBB14_36
        mov w9, #5
        b LBB14_41
LBB14_36:
        tst x10, #0xff000000000000
        b.eq LBB14_38
        mov w9, #6
        b LBB14_41
LBB14_38:
        lsr x9, x10, #56
        cbnz x9, LBB14_40
        mov w9, #8
        b LBB14_41

        mov w9, #7
LBB14_41:
        add x8, x9, x8
        str x8, [x0, #16]
LBB14_42:
        .cfi_def_cfa wsp, 16
        ldp x29, x30, [sp], #16
        .cfi_def_cfa_offset 0
        .cfi_restore w30
        .cfi_restore w29
        ret
LBB14_43:
        .cfi_restore_state
Lloh22:
        adrp x2, l___unnamed_15@PAGE
Lloh23:
        add x2, x2, l___unnamed_15@PAGEOFF
        mov x0, x8
        bl core::slice::index::slice_start_index_len_fail
        .loh AdrpAdd    Lloh22, Lloh23
AaronO commented 1 year ago

@seanmonstar I've resolved the conflicts; this should be good to land. It's not perfect, but it's a solid first pass — a few specifics to revisit later.

AaronO commented 1 year ago

Adding a new arch/target, it'd be wise of us to add a CI job to exercise it too. I think some of the other modules include some tests that are checking that the matching behavior is the same as the scalar version.

I've added a CI job to run aarch64/neon tests with qemu. I added tests in neon.rs that ensure the vectorized versions match their scalar counterparts. (Only some of the SWAR validators have false negatives, with a fallback to a scalar check, so they are correct in aggregate — testing for the common cases can be cheaper than exhaustive checks.)