dotnet / runtime

.NET is a cross-platform runtime for cloud, mobile, desktop, and IoT apps.
https://docs.microsoft.com/dotnet/core/
MIT License
14.23k stars 4.45k forks source link

Refactor AdvSimd version of DecodeFromUTF8 #101620

Closed SwapnilGaikwad closed 4 days ago

SwapnilGaikwad commented 2 weeks ago
SwapnilGaikwad commented 2 weeks ago

@a74nh @kunalspathak @dotnet/arm64-contrib

SwapnilGaikwad commented 2 weeks ago

There is no notable performance difference on V1 and N1 systems with this patch. The assembly sequence is reordered, and the newer version has one fewer instruction.

Assembly sequence for DecodeFromUtf8 ``` b.cc 0xffffa8338394 // b.lo, b.ul, b.last ldr q8, 0xffffa83388c0 str q8, [x29, #96] ldr q9, 0xffffa83388d0 str q9, [x29, #80] ldr q10, 0xffffa83388e0 str q10, [x29, #64] ldr q11, 0xffffa83388f0 str q11, [x29, #48] ldr q12, 0xffffa8338900 str q12, [x29, #32] ldr q13, 0xffffa8338910 str q13, [x29, #16] mov x6, x27 str x28, [x29, #272] ldr q14, 0xffffa8338920 str q14, [x29, #256] str w4, [x29, #344] mov w2, w4 str x6, [x29, #280] mov x0, x6 mov x1, x27 adrp x11, 0xffffa8e3a000 add x11, x11, #0x918 mov v14.d[0], v8.d[1] mov v10.d[0], v9.d[1] ldr x13, [x11] blr x13 ldr x6, [x29, #280] ld4 {v16.16b-v19.16b}, [x6] stp q16, q17, [x29, #192] stp q18, q19, [x29, #224] ldp q16, q17, [x29, #192] ldp q18, q19, [x29, #224] mvni v20.4s, #0x0 mvni v21.4s, #0x0 mov v8.d[1], v14.d[0] mov v9.d[1], v10.d[0] mov v22.16b, v8.16b mov v23.16b, v9.16b tbl v20.16b, {v20.16b-v23.16b}, v16.16b mvni v21.4s, #0x0 mvni v22.4s, #0x0 mov v23.16b, v21.16b mov v24.16b, v22.16b mov v25.16b, v8.16b mov v26.16b, v9.16b tbl v21.16b, {v23.16b-v26.16b}, v17.16b mvni v22.4s, #0x0 mvni v23.4s, #0x0 mov v24.16b, v22.16b mov v25.16b, v23.16b mov v26.16b, v8.16b mov v27.16b, v9.16b tbl v22.16b, {v24.16b-v27.16b}, v18.16b mvni v23.4s, #0x0 mvni v24.4s, #0x0 mov v25.16b, v23.16b mov v26.16b, v24.16b mov v27.16b, v8.16b mov v28.16b, v9.16b tbl v23.16b, {v25.16b-v28.16b}, v19.16b ldr q24, [x29, #256] uqsub v16.16b, v16.16b, v24.16b uqsub v17.16b, v17.16b, v24.16b uqsub v18.16b, v18.16b, v24.16b uqsub v19.16b, v19.16b, v24.16b ldp q26, q25, [x29, #48] ldp q28, q27, [x29, #16] tbx v16.16b, {v25.16b-v28.16b}, v16.16b tbx v17.16b, {v25.16b-v28.16b}, v17.16b tbx v18.16b, {v25.16b-v28.16b}, v18.16b tbx v19.16b, {v25.16b-v28.16b}, v19.16b orr v16.16b, v20.16b, v16.16b orr v17.16b, v21.16b, v17.16b orr v18.16b, v22.16b, v18.16b orr v19.16b, v23.16b, v19.16b cmhi v20.16b, v16.16b, v24.16b cmhi v21.16b, v17.16b, v24.16b orr v20.16b, v20.16b, v21.16b cmhi v21.16b, v18.16b, v24.16b orr 
v20.16b, v20.16b, v21.16b cmhi v21.16b, v19.16b, v24.16b orr v20.16b, v20.16b, v21.16b umaxp v20.4s, v20.4s, v20.4s mov x2, v20.d[0] cmp x2, #0x0 b.ne 0xffffa833836c // b.any shl v16.16b, v16.16b, #2 ushr v20.16b, v17.16b, #4 orr v10.16b, v16.16b, v20.16b shl v16.16b, v17.16b, #4 ushr v17.16b, v18.16b, #2 orr v11.16b, v16.16b, v17.16b shl v16.16b, v18.16b, #6 orr v12.16b, v16.16b, v19.16b mov w2, w19 ldr x0, [x29, #272] mov x1, x28 adrp x11, 0xffffa8e3a000 add x11, x11, #0x920 mov v13.d[0], v10.d[1] mov v8.d[0], v11.d[1] mov v9.d[0], v12.d[1] ldr x3, [x11] blr x3 mov v10.d[1], v13.d[0] mov v11.d[1], v8.d[0] mov v12.d[1], v9.d[0] ldr x7, [x29, #272] st3 {v10.16b-v12.16b}, [x7] ldr x6, [x29, #280] add x6, x6, #0x40 add x7, x7, #0x30 ldr x3, [x29, #288] cmp x6, x3 str x7, [x29, #272] str x3, [x29, #288] ldp q9, q8, [x29, #80] b.ls 0xffffa8338568 // b.plast str x6, [x29, #280] ldr x6, [x29, #280] mov x4, x6 ldr x7, [x29, #272] mov x5, x7 ldr x6, [x29, #312] cmp x4, x6 b.eq 0xffffa833875c ```
dotnet-policy-service[bot] commented 6 days ago

Tagging subscribers to this area: @dotnet/area-system-buffers
See info in area-owners.md if you want to be subscribed.