Closed AaronO closed 1 year ago
In context of match_uri_vectored
.section __TEXT,__text,regular,pure_instructions
.globl httparse::simd::neon::match_uri_vectored
.p2align 2
httparse::simd::neon::match_uri_vectored:
Lfunc_begin14:
.cfi_startproc
stp x29, x30, [sp, #-16]!
.cfi_def_cfa_offset 16
mov x29, sp
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
.cfi_remember_state
mov x8, x0
ldr x0, [x0, #16]
ldp x9, x1, [x8]
movi.16b v0, #129
movi.16b v1, #162
movi.16b v2, #253
movi.16b v3, #60
Lloh22:
adrp x10, lCPI14_0@PAGE
Lloh23:
ldr q4, [x10, lCPI14_0@PAGEOFF]
mov w10, #16
LBB14_1:
subs x11, x1, x0
b.lo LBB14_5
cmp x11, #15
b.ls LBB14_4
ldr q5, [x9, x0]
add.16b v6, v5, v0
cmhi.16b v6, v1, v6
and.16b v5, v5, v2
cmeq.16b v5, v5, v3
orr.16b v5, v6, v5
and.16b v5, v5, v4
umaxv.16b b5, v5
fmov w11, s5
sub w11, w10, w11, uxtb
add x0, x11, x0
str x0, [x8, #16]
cmp w11, #16
b.eq LBB14_1
LBB14_4:
.cfi_def_cfa wsp, 16
ldp x29, x30, [sp], #16
.cfi_def_cfa_offset 0
.cfi_restore w30
.cfi_restore w29
ret
LBB14_5:
.cfi_restore_state
Lloh24:
adrp x2, l___unnamed_15@PAGE
Lloh25:
add x2, x2, l___unnamed_15@PAGEOFF
bl core::slice::index::slice_start_index_len_fail
.loh AdrpLdr Lloh22, Lloh23
.loh AdrpAdd Lloh24, Lloh25
.globl httparse::simd::neon::match_uri_vectored
.p2align 2
httparse::simd::neon::match_uri_vectored:
Lfunc_begin14:
.cfi_startproc
stp x29, x30, [sp, #-16]!
.cfi_def_cfa_offset 16
mov x29, sp
.cfi_def_cfa w29, 16
.cfi_offset w30, -8
.cfi_offset w29, -16
.cfi_remember_state
ldp x1, x8, [x0, #8]
ldr x9, [x0]
movi.16b v0, #129
movi.16b v1, #162
movi.16b v2, #253
movi.16b v3, #60
b LBB14_2
mov w10, #16
add x8, x10, x8
str x8, [x0, #16]
cmp x10, #16
b.ne LBB14_42
LBB14_2:
subs x10, x1, x8
b.lo LBB14_43
cmp x10, #15
b.ls LBB14_42
ldr q4, [x9, x8]
add.16b v5, v4, v0
cmhi.16b v5, v1, v5
and.16b v4, v4, v2
cmeq.16b v4, v4, v3
orr.16b v4, v5, v4
fmov x10, d4
cbnz x10, LBB14_24
mov.d x10, v4[1]
cbz x10, LBB14_1
tst x10, #0xff
b.eq LBB14_8
mov x10, #0
b LBB14_23
LBB14_8:
tst x10, #0xff00
b.eq LBB14_10
mov w10, #1
b LBB14_23
LBB14_10:
tst x10, #0xff0000
b.eq LBB14_12
mov w10, #2
b LBB14_23
LBB14_12:
tst x10, #0xff000000
b.eq LBB14_14
mov w10, #3
b LBB14_23
LBB14_14:
tst x10, #0xff00000000
b.eq LBB14_16
mov w10, #4
b LBB14_23
LBB14_16:
tst x10, #0xff0000000000
b.eq LBB14_18
mov w10, #5
b LBB14_23
LBB14_18:
tst x10, #0xff000000000000
b.eq LBB14_20
mov w10, #6
b LBB14_23
LBB14_20:
lsr x10, x10, #56
cbnz x10, LBB14_22
mov w10, #8
b LBB14_23
mov w10, #7
LBB14_23:
add x10, x10, #8
add x8, x10, x8
str x8, [x0, #16]
cmp x10, #16
b.eq LBB14_2
b LBB14_42
tst x10, #0xff
b.eq LBB14_26
mov x9, #0
b LBB14_41
LBB14_26:
tst x10, #0xff00
b.eq LBB14_28
mov w9, #1
b LBB14_41
LBB14_28:
tst x10, #0xff0000
b.eq LBB14_30
mov w9, #2
b LBB14_41
LBB14_30:
tst x10, #0xff000000
b.eq LBB14_32
mov w9, #3
b LBB14_41
LBB14_32:
tst x10, #0xff00000000
b.eq LBB14_34
mov w9, #4
b LBB14_41
LBB14_34:
tst x10, #0xff0000000000
b.eq LBB14_36
mov w9, #5
b LBB14_41
LBB14_36:
tst x10, #0xff000000000000
b.eq LBB14_38
mov w9, #6
b LBB14_41
LBB14_38:
lsr x9, x10, #56
cbnz x9, LBB14_40
mov w9, #8
b LBB14_41
mov w9, #7
LBB14_41:
add x8, x9, x8
str x8, [x0, #16]
LBB14_42:
.cfi_def_cfa wsp, 16
ldp x29, x30, [sp], #16
.cfi_def_cfa_offset 0
.cfi_restore w30
.cfi_restore w29
ret
LBB14_43:
.cfi_restore_state
Lloh22:
adrp x2, l___unnamed_15@PAGE
Lloh23:
add x2, x2, l___unnamed_15@PAGEOFF
mov x0, x8
bl core::slice::index::slice_start_index_len_fail
.loh AdrpAdd Lloh22, Lloh23
@seanmonstar I've resolved the conflicts, this should be good to land it's not perfect but it's a solid first pass, a few specifics to revisit later.
Adding a new arch/target, it'd be wise of us to add a CI job to exercise it too. I think some of the other modules include some tests that are checking that the matching behavior is the same as the scalar version.
I've added a CI job to run aarch64
/neon
tests with qemu
. I added tests in neon.rs
that ensure the vectorized versions math their scalar counterparts. (Only some of the SWAR validators have false-negatives with a fallback to scalar check so correct in aggregate, since testing for the common-cases can be cheaper than exhaustive checks)
First pass at neon support, building off #132
I shared some promising benches here: https://github.com/seanmonstar/httparse/pull/123#issuecomment-1509870871
Overall it's a net-positive but not totally optimal.
Next steps we would want to land #123 and then refactor it into a SWAR SIMD backend and stack them appropriately to avoid duplicated work and cleaner logic.
Additionally, offsetz is very much central to total perf, I believe the vectorized implementation I added underperforms due to inefficiently reloading the offset lookup table instead of keeping it a register at the top of the header parsing loop.