MihuBot / runtime-utils

0 stars 0 forks source link

[JitDiff X64] xtqqczze/dotnet-runtime/BitCastFill #485

Open MihuBot opened 3 months ago

MihuBot commented 3 months ago

Job completed in 14 minutes.

Diffs

Found 261 files with textual diffs.

Summary of Code Size diffs:
(Lower is better)

Total bytes of base: 41568628
Total bytes of diff: 41568463
Total bytes of delta: -165 (-0.00 % of base)
Total relative delta: -0.62
    diff is an improvement.
    relative diff is an improvement.

Top file improvements (bytes):
        -165 : System.Private.CoreLib.dasm (-0.00 % of base)

1 total files with Code Size differences (1 improved, 0 regressed), 258 unchanged.

Top method improvements (bytes):
        -106 (-37.46 % of base) : System.Private.CoreLib.dasm - System.SpanHelpers:Fill[System.Nullable`1[int]](byref,ulong,System.Nullable`1[int]) (FullOpts)
         -25 (-9.09 % of base) : System.Private.CoreLib.dasm - System.SpanHelpers:Fill[short](byref,ulong,short) (FullOpts)
         -15 (-6.10 % of base) : System.Private.CoreLib.dasm - System.SpanHelpers:Fill[ubyte](byref,ulong,ubyte) (FullOpts)
         -11 (-5.91 % of base) : System.Private.CoreLib.dasm - System.SpanHelpers:Fill[int](byref,ulong,int) (FullOpts)
          -4 (-1.53 % of base) : System.Private.CoreLib.dasm - System.SpanHelpers:Fill[double](byref,ulong,double) (FullOpts)
          -4 (-1.57 % of base) : System.Private.CoreLib.dasm - System.SpanHelpers:Fill[ushort](byref,ulong,ushort) (FullOpts)

Top method improvements (percentages):
        -106 (-37.46 % of base) : System.Private.CoreLib.dasm - System.SpanHelpers:Fill[System.Nullable`1[int]](byref,ulong,System.Nullable`1[int]) (FullOpts)
         -25 (-9.09 % of base) : System.Private.CoreLib.dasm - System.SpanHelpers:Fill[short](byref,ulong,short) (FullOpts)
         -15 (-6.10 % of base) : System.Private.CoreLib.dasm - System.SpanHelpers:Fill[ubyte](byref,ulong,ubyte) (FullOpts)
         -11 (-5.91 % of base) : System.Private.CoreLib.dasm - System.SpanHelpers:Fill[int](byref,ulong,int) (FullOpts)
          -4 (-1.57 % of base) : System.Private.CoreLib.dasm - System.SpanHelpers:Fill[ushort](byref,ulong,ushort) (FullOpts)
          -4 (-1.53 % of base) : System.Private.CoreLib.dasm - System.SpanHelpers:Fill[double](byref,ulong,double) (FullOpts)

6 total methods with Code Size differences (6 improved, 0 regressed), 252735 unchanged.

--------------------------------------------------------------------------------

Artifacts:

MihuBot commented 3 months ago

Top method improvements

-106 (-37.46 % of base) - System.SpanHelpers:Fill[System.Nullable`1[int]](byref,ulong,System.Nullable`1[int])

```diff
 ; Assembly listing for method System.SpanHelpers:Fill[System.Nullable`1[int]](byref,ulong,System.Nullable`1[int]) (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
-; 0 inlinees with PGO data; 1 single block inlinees; 2 inlinees without PGO data
+; 0 inlinees with PGO data; 1 single block inlinees; 3 inlinees without PGO data
 ; Final local variable assignments
 ;
-; V00 arg0 [V00,T00] ( 18, 38 ) byref -> rdi single-def
-; V01 arg1 [V01,T07] ( 10, 6.50) long -> rsi single-def
-; V02 arg2 [V02,T01] ( 18, 38 ) struct ( 8) rdx single-def
-; V03 loc0 [V03,T04] ( 12, 20 ) long -> rax
-; V04 loc1 [V04 ] ( 2, 1 ) struct ( 8) [rbp-0x08] do-not-enreg[SF] ld-addr-op
-; V05 loc2 [V05,T17] ( 5, 9.50) simd32 -> mm0 ld-addr-op
-; V06 loc3 [V06,T06] ( 5, 9.50) byref -> rdi single-def
-; V07 loc4 [V07,T12] ( 4, 2 ) long -> rax
-; V08 loc5 [V08,T09] ( 2, 4.50) long -> rcx
-; V09 loc6 [V09,T03] ( 7, 21 ) long -> rdx
-;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref
-; V11 loc8 [V11,T10] ( 2, 4.50) long -> rcx
-;# V12 OutArgs [V12 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V13 tmp1 [V13,T05] ( 2, 16 ) long -> rax "dup spill"
-; V14 tmp2 [V14 ] ( 6, 6 ) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "NewObj constructor temp"
-;* V15 tmp3 [V15 ] ( 0, 0 ) simd32 -> zero-ref
-;* V16 tmp4 [V16 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V17 tmp5 [V17,T16] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
-; V18 tmp6 [V18,T08] ( 5, 5 ) long -> rdx "Inlining Arg"
-; V19 tmp7 [V19,T14] ( 2, 1 ) ubyte -> [rbp-0x08] do-not-enreg[] "field V04.hasValue (fldOffset=0x0)" P-DEP
-; V20 tmp8 [V20,T15] ( 2, 1 ) int -> [rbp-0x04] do-not-enreg[] "field V04.value (fldOffset=0x4)" P-DEP
-; V21 cse0 [V21,T02] ( 9, 36 ) long -> r8 "CSE #01: aggressive"
-; V22 cse1 [V22,T11] ( 5, 2.50) long -> rcx "CSE #02: moderate"
-; V23 cse2 [V23,T13] ( 3, 1.50) long -> rcx "CSE #03: moderate"
+; V00 arg0 [V00,T00] ( 17, 37.50) byref -> rdi single-def
+; V01 arg1 [V01,T05] ( 8, 7.50) long -> rsi single-def
+; V02 arg2 [V02,T01] ( 17, 37.50) struct ( 8) rdx single-def
+; V03 loc0 [V03,T03] ( 12, 20.50) long -> rax
+;* V04 loc1 [V04 ] ( 0, 0 ) simd32 -> zero-ref
+;* V05 loc2 [V05 ] ( 0, 0 ) byref -> zero-ref
+;* V06 loc3 [V06 ] ( 0, 0 ) long -> zero-ref
+;* V07 loc4 [V07 ] ( 0, 0 ) long -> zero-ref
+;* V08 loc5 [V08 ] ( 0, 0 ) long -> zero-ref
+; V09 loc6 [V09,T06] ( 2, 4.50) long -> rcx
+;# V10 OutArgs [V10 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V11 tmp1 [V11,T04] ( 2, 16 ) long -> rax "dup spill"
+;* V12 tmp2 [V12 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V13 tmp3 [V13 ] ( 0, 0 ) simd32 -> zero-ref
+;* V14 tmp4 [V14 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+;* V15 tmp5 [V15 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V16 tmp6 [V16 ] ( 0, 0 ) struct ( 8) zero-ref do-not-enreg[SF] ld-addr-op "Inlining Arg"
+;* V17 tmp7 [V17 ] ( 0, 0 ) long -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V18 tmp8 [V18 ] ( 0, 0 ) simd32 -> zero-ref do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp"
+;* V19 tmp9 [V19 ] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
+;* V20 tmp10 [V20 ] ( 0, 0 ) long -> zero-ref "Inlining Arg"
+;* V21 tmp11 [V21 ] ( 0, 0 ) ubyte -> zero-ref "field V15.hasValue (fldOffset=0x0)" P-INDEP
+;* V22 tmp12 [V22 ] ( 0, 0 ) int -> zero-ref "field V15.value (fldOffset=0x4)" P-INDEP
+;* V23 tmp13 [V23 ] ( 0, 0 ) ubyte -> zero-ref do-not-enreg[] "field V16.hasValue (fldOffset=0x0)" P-DEP
+;* V24 tmp14 [V24 ] ( 0, 0 ) int -> zero-ref do-not-enreg[] "field V16.value (fldOffset=0x4)" P-DEP
+; V25 cse0 [V25,T02] ( 9, 36 ) long -> r8 "CSE #01: aggressive"
+; V26 cse1 [V26,T07] ( 5, 2.50) long -> rcx "CSE #02: aggressive"
+; V27 cse2 [V27,T08] ( 3, 1.50) long -> rcx "CSE #03: moderate"
 ;
-; Lcl frame size = 48
+; Lcl frame size = 0

 G_M56207_IG01:
        push rbp
-       sub rsp, 48
-       lea rbp, [rsp+0x30]
-       ;; size=10 bbWeight=1 PerfScore 1.75
+       mov rbp, rsp
+       ;; size=4 bbWeight=1 PerfScore 1.25
 G_M56207_IG02:
        cmp rsi, 4
-       jb SHORT G_M56207_IG08
-       ;; size=6 bbWeight=1 PerfScore 1.25
-G_M56207_IG03:
-       mov qword ptr [rbp-0x08], rdx
-       vxorps ymm0, ymm0, ymm0
-       vmovups ymmword ptr [rbp-0x30], ymm0
-       mov rdx, qword ptr [rbp-0x08]
-       mov qword ptr [rbp-0x30], rdx
-       mov qword ptr [rbp-0x28], rdx
-       mov qword ptr [rbp-0x20], rdx
-       mov qword ptr [rbp-0x18], rdx
-       vmovups ymm0, ymmword ptr [rbp-0x30]
-       lea rax, [8*rsi]
-       mov rcx, rax
-       and rcx, -64
-       xor edx, edx
-       cmp rsi, 8
-       jb SHORT G_M56207_IG05
-       align [3 bytes for IG04]
-       ;; size=64 bbWeight=0.50 PerfScore 7.04
-G_M56207_IG04:
-       vmovups ymmword ptr [rdi+rdx], ymm0
-       vmovups ymmword ptr [rdi+rdx+0x20], ymm0
-       add rdx, 64
-       cmp rdx, rcx
-       jb SHORT G_M56207_IG04
-       ;; size=20 bbWeight=4 PerfScore 22.00
-G_M56207_IG05:
-       test al, 32
-       je SHORT G_M56207_IG06
-       vmovups ymmword ptr [rdi+rdx], ymm0
-       ;; size=9 bbWeight=0.50 PerfScore 1.62
-G_M56207_IG06:
-       vmovups ymmword ptr [rdi+rax-0x20], ymm0
-       ;; size=6 bbWeight=0.50 PerfScore 1.00
-G_M56207_IG07:
-       vzeroupper
-       add rsp, 48
-       pop rbp
-       ret
-       ;; size=9 bbWeight=0.50 PerfScore 1.38
-G_M56207_IG08:
+       jae G_M56207_IG12
        xor eax, eax
        cmp rsi, 8
-       jb SHORT G_M56207_IG10
+       jb SHORT G_M56207_IG05
+       ;; size=18 bbWeight=1 PerfScore 2.75
+G_M56207_IG03:
        mov rcx, rsi
        and rcx, -8
-       align [5 bytes for IG09]
-       ;; size=20 bbWeight=0.50 PerfScore 1.12
-G_M56207_IG09:
+       align [3 bytes for IG04]
+       ;; size=10 bbWeight=0.50 PerfScore 0.38
+G_M56207_IG04:
        lea r8, [8*rax]
        mov qword ptr [rdi+r8], rdx
        mov qword ptr [rdi+r8+0x08], rdx
        mov qword ptr [rdi+r8+0x10], rdx
        mov qword ptr [rdi+r8+0x18], rdx
        mov qword ptr [rdi+r8+0x20], rdx
        mov qword ptr [rdi+r8+0x28], rdx
        mov qword ptr [rdi+r8+0x30], rdx
        mov qword ptr [rdi+r8+0x38], rdx
        add rax, 8
        cmp rax, rcx
-       jb SHORT G_M56207_IG09
+       jb SHORT G_M56207_IG04
        ;; size=56 bbWeight=4 PerfScore 40.00
-G_M56207_IG10:
+G_M56207_IG05:
        test sil, 4
-       je SHORT G_M56207_IG11
+       je SHORT G_M56207_IG07
+       ;; size=6 bbWeight=1 PerfScore 1.25
+G_M56207_IG06:
        lea rcx, [8*rax]
        mov qword ptr [rdi+rcx], rdx
        mov qword ptr [rdi+rcx+0x08], rdx
        mov qword ptr [rdi+rcx+0x10], rdx
        mov qword ptr [rdi+rcx+0x18], rdx
        add rax, 4
-       ;; size=37 bbWeight=0.50 PerfScore 3.00
-G_M56207_IG11:
+       ;; size=31 bbWeight=0.50 PerfScore 2.38
+G_M56207_IG07:
        test sil, 2
-       je SHORT G_M56207_IG12
+       je SHORT G_M56207_IG09
+       ;; size=6 bbWeight=1 PerfScore 1.25
+G_M56207_IG08:
        lea rcx, [8*rax]
        mov qword ptr [rdi+rcx], rdx
        mov qword ptr [rdi+rcx+0x08], rdx
        add rax, 2
-       ;; size=27 bbWeight=0.50 PerfScore 2.00
-G_M56207_IG12:
+       ;; size=21 bbWeight=0.50 PerfScore 1.38
+G_M56207_IG09:
        test sil, 1
-       je SHORT G_M56207_IG13
+       je SHORT G_M56207_IG11
+       ;; size=6 bbWeight=1 PerfScore 1.25
+G_M56207_IG10:
        mov qword ptr [rdi+8*rax], rdx
-       ;; size=10 bbWeight=0.50 PerfScore 1.12
-G_M56207_IG13:
-       vzeroupper
-       add rsp, 48
+       ;; size=4 bbWeight=0.50 PerfScore 0.50
+G_M56207_IG11:
        pop rbp
        ret
-       ;; size=9 bbWeight=0.50 PerfScore 1.38
+       ;; size=2 bbWeight=1 PerfScore 1.50
+G_M56207_IG12:
+       mov rax, 0xD1FFAB1E ; code for System.ThrowHelper:ThrowNotSupportedException()
+       call [rax]System.ThrowHelper:ThrowNotSupportedException()
+       int3
+       ;; size=13 bbWeight=0 PerfScore 0.00

-; Total bytes of code 283, prolog size 10, PerfScore 84.67, instruction count 73, allocated bytes for code 283 (MethodHash=b4fe2470) for method System.SpanHelpers:Fill[System.Nullable`1[int]](byref,ulong,System.Nullable`1[int]) (FullOpts)
+; Total bytes of code 177, prolog size 4, PerfScore 53.88, instruction count 44, allocated bytes for code 177 (MethodHash=b4fe2470) for method System.SpanHelpers:Fill[System.Nullable`1[int]](byref,ulong,System.Nullable`1[int]) (FullOpts)
```
-25 (-9.09 % of base) - System.SpanHelpers:Fill[short](byref,ulong,short)

```diff
 ; Assembly listing for method System.SpanHelpers:Fill[short](byref,ulong,short) (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 1 single block inlinees; 2 inlinees without PGO data
 ; Final local variable assignments
 ;
-; V00 arg0 [V00,T01] ( 18, 38 ) byref -> rdi single-def
+; V00 arg0 [V00,T02] ( 18, 38 ) byref -> rdi single-def
 ; V01 arg1 [V01,T07] ( 10, 6.50) long -> rsi single-def
-; V02 arg2 [V02,T02] ( 18, 37.75) short -> rdx single-def
+; V02 arg2 [V02,T01] ( 18, 41.50) short -> rdx single-def
 ; V03 loc0 [V03,T00] ( 23, 50 ) long -> rax
-; V04 loc1 [V04,T12] ( 2, 0.50) short -> rdx ld-addr-op
-; V05 loc2 [V05,T13] ( 5, 9.50) simd32 -> mm0 ld-addr-op
-; V06 loc3 [V06,T06] ( 5, 9.50) byref -> rdi single-def
-; V07 loc4 [V07,T11] ( 4, 2 ) long -> rax
-; V08 loc5 [V08,T09] ( 2, 4.50) long -> rcx
-; V09 loc6 [V09,T03] ( 7, 21 ) long -> rdx
-;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref
-; V11 loc8 [V11,T10] ( 2, 4.50) long -> rcx
-;# V12 OutArgs [V12 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V13 tmp1 [V13,T05] ( 2, 16 ) long -> rax "dup spill"
-; V14 tmp2 [V14 ] ( 3, 9.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "NewObj constructor temp"
-;* V15 tmp3 [V15 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V16 tmp4 [V16,T04] ( 5, 16.25) int -> rcx "Inline stloc first use temp"
-; V17 tmp5 [V17,T08] ( 2, 8.50) ushort -> rax "Inlining Arg"
+; V04 loc1 [V04,T11] ( 5, 9.50) simd32 -> mm0
+; V05 loc2 [V05,T06] ( 5, 9.50) byref -> rdi single-def
+; V06 loc3 [V06,T10] ( 4, 2 ) long -> rdx
+; V07 loc4 [V07,T08] ( 2, 4.50) long -> rax
+; V08 loc5 [V08,T03] ( 7, 21 ) long -> rcx
+; V09 loc6 [V09,T09] ( 2, 4.50) long -> rcx
+;# V10 OutArgs [V10 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V11 tmp1 [V11,T05] ( 2, 16 ) long -> rax "dup spill"
+;* V12 tmp2 [V12 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V13 tmp3 [V13 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V14 tmp4 [V14 ] ( 2, 4.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp"
+; V15 tmp5 [V15,T04] ( 5, 16.25) int -> rax "Inline stloc first use temp"
+;* V16 tmp6 [V16 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
+;* V17 tmp7 [V17 ] ( 0, 0 ) ushort -> zero-ref "Inlining Arg"
 ;
 ; Lcl frame size = 48

 G_M24463_IG01:
        push rbp
        sub rsp, 48
        lea rbp, [rsp+0x30]
        ;; size=10 bbWeight=1 PerfScore 1.75
 G_M24463_IG02:
        cmp rsi, 16
        jb SHORT G_M24463_IG10
-       ;; NOP compensation instructions of 4 bytes.
-       ;; size=10 bbWeight=1 PerfScore 1.25
+       ;; size=6 bbWeight=1 PerfScore 1.25
 G_M24463_IG03:
-       movsx rdx, dx
-       vxorps ymm0, ymm0, ymm0
-       vmovups ymmword ptr [rbp-0x30], ymm0
-       movzx rax, dx
-       xor ecx, ecx
-       align [0 bytes for IG04]
-       ;; size=18 bbWeight=0.25 PerfScore 0.52
+       xor eax, eax
+       align [14 bytes for IG04]
+       ;; size=16 bbWeight=0.25 PerfScore 0.12
 G_M24463_IG04:
-       lea rdx, [rbp-0x30]
-       movsxd r8, ecx
-       mov word ptr [rdx+2*r8], ax
-       inc ecx
-       cmp ecx, 16
+       lea rcx, [rbp-0x30]
+       movsxd r8, eax
+       mov word ptr [rcx+2*r8], dx
+       inc eax
+       cmp eax, 16
        jl SHORT G_M24463_IG04
        ;; size=19 bbWeight=4 PerfScore 13.00
 G_M24463_IG05:
        vmovups ymm0, ymmword ptr [rbp-0x30]
-       lea rax, [rsi+rsi]
-       mov rcx, rax
-       and rcx, -64
-       xor edx, edx
+       lea rdx, [rsi+rsi]
+       mov rax, rdx
+       and rax, -64
+       xor ecx, ecx
        cmp rsi, 32
        jb SHORT G_M24463_IG07
-       align [15 bytes for IG06]
-       ;; size=39 bbWeight=0.50 PerfScore 3.38
+       align [0 bytes for IG06]
+       ;; size=24 bbWeight=0.50 PerfScore 3.25
 G_M24463_IG06:
-       vmovups ymmword ptr [rdi+rdx], ymm0
-       vmovups ymmword ptr [rdi+rdx+0x20], ymm0
-       add rdx, 64
-       cmp rdx, rcx
+       vmovups ymmword ptr [rdi+rcx], ymm0
+       vmovups ymmword ptr [rdi+rcx+0x20], ymm0
+       add rcx, 64
+       cmp rcx, rax
        jb SHORT G_M24463_IG06
        ;; size=20 bbWeight=4 PerfScore 22.00
 G_M24463_IG07:
-       test al, 32
+       test dl, 32
        je SHORT G_M24463_IG08
-       vmovups ymmword ptr [rdi+rdx], ymm0
-       ;; size=9 bbWeight=0.50 PerfScore 1.62
+       vmovups ymmword ptr [rdi+rcx], ymm0
+       ;; size=10 bbWeight=0.50 PerfScore 1.62
 G_M24463_IG08:
-       vmovups ymmword ptr [rdi+rax-0x20], ymm0
+       vmovups ymmword ptr [rdi+rdx-0x20], ymm0
        ;; size=6 bbWeight=0.50 PerfScore 1.00
 G_M24463_IG09:
        vzeroupper
        add rsp, 48
        pop rbp
        ret
        ;; size=9 bbWeight=0.50 PerfScore 1.38
 G_M24463_IG10:
        xor eax, eax
        cmp rsi, 8
        jb SHORT G_M24463_IG12
        mov rcx, rsi
        and rcx, -8
-       align [5 bytes for IG11]
-       ;; size=20 bbWeight=0.50 PerfScore 1.12
+       align [0 bytes for IG11]
+       ;; size=15 bbWeight=0.50 PerfScore 1.00
 G_M24463_IG11:
        mov word ptr [rdi+2*rax], dx
        mov word ptr [rdi+2*rax+0x02], dx
        mov word ptr [rdi+2*rax+0x04], dx
        mov word ptr [rdi+2*rax+0x06], dx
        mov word ptr [rdi+2*rax+0x08], dx
        mov word ptr [rdi+2*rax+0x0A], dx
        mov word ptr [rdi+2*rax+0x0C], dx
        mov word ptr [rdi+2*rax+0x0E], dx
        add rax, 8
        cmp rax, rcx
        jb SHORT G_M24463_IG11
        ;; size=48 bbWeight=4 PerfScore 38.00
 G_M24463_IG12:
        test sil, 4
        je SHORT G_M24463_IG13
        mov word ptr [rdi+2*rax], dx
        mov word ptr [rdi+2*rax+0x02], dx
        mov word ptr [rdi+2*rax+0x04], dx
        mov word ptr [rdi+2*rax+0x06], dx
        add rax, 4
        ;; size=29 bbWeight=0.50 PerfScore 2.75
 G_M24463_IG13:
        test sil, 2
        je SHORT G_M24463_IG14
        mov word ptr [rdi+2*rax], dx
        mov word ptr [rdi+2*rax+0x02], dx
        add rax, 2
        ;; size=19 bbWeight=0.50 PerfScore 1.75
 G_M24463_IG14:
        test sil, 1
        je SHORT G_M24463_IG15
        mov word ptr [rdi+2*rax], dx
        ;; size=10 bbWeight=0.50 PerfScore 1.12
 G_M24463_IG15:
        vzeroupper
        add rsp, 48
        pop rbp
        ret
        ;; size=9 bbWeight=0.50 PerfScore 1.38

-; Total bytes of code 275, prolog size 10, PerfScore 92.02, instruction count 74, allocated bytes for code 275 (MethodHash=7507a070) for method System.SpanHelpers:Fill[short](byref,ulong,short) (FullOpts)
+; Total bytes of code 250, prolog size 10, PerfScore 91.38, instruction count 70, allocated bytes for code 250 (MethodHash=7507a070) for method System.SpanHelpers:Fill[short](byref,ulong,short) (FullOpts)
```
-15 (-6.10 % of base) - System.SpanHelpers:Fill[ubyte](byref,ulong,ubyte)

```diff
 ; Assembly listing for method System.SpanHelpers:Fill[ubyte](byref,ulong,ubyte) (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 1 single block inlinees; 2 inlinees without PGO data
 ; Final local variable assignments
 ;
-; V00 arg0 [V00,T01] ( 18, 38 ) byref -> rdi single-def
-; V01 arg1 [V01,T08] ( 9, 6 ) long -> rsi single-def
-; V02 arg2 [V02,T02] ( 18, 37.75) ubyte -> rdx single-def
+; V00 arg0 [V00,T02] ( 18, 38 ) byref -> rdi single-def
+; V01 arg1 [V01,T07] ( 9, 6 ) long -> rsi single-def
+; V02 arg2 [V02,T01] ( 18, 41.50) ubyte -> rdx single-def
 ; V03 loc0 [V03,T00] ( 23, 50 ) long -> rax
-;* V04 loc1 [V04 ] ( 0, 0 ) ubyte -> zero-ref ld-addr-op
-; V05 loc2 [V05 ] ( 5, 13 ) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op
-; V06 loc3 [V06,T06] ( 5, 9.50) byref -> rdi single-def
-; V07 loc4 [V07,T11] ( 5, 2.50) long -> rsi
-; V08 loc5 [V08,T09] ( 2, 4.50) long -> rax
-; V09 loc6 [V09,T03] ( 7, 21 ) long -> rcx
-;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref
-; V11 loc8 [V11,T10] ( 2, 4.50) long -> rcx
-;# V12 OutArgs [V12 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V13 tmp1 [V13,T05] ( 2, 16 ) long -> rax "dup spill"
-;* V14 tmp2 [V14 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V15 tmp3 [V15,T04] ( 5, 16.25) int -> rax "Inline stloc first use temp"
-; V16 tmp4 [V16,T07] ( 2, 8.50) ubyte -> rdx "Inlining Arg"
+; V04 loc1 [V04,T11] ( 5, 9.50) simd32 -> mm0
+; V05 loc2 [V05,T06] ( 5, 9.50) byref -> rdi single-def
+; V06 loc3 [V06,T10] ( 5, 2.50) long -> rsi
+; V07 loc4 [V07,T08] ( 2, 4.50) long -> rdx
+; V08 loc5 [V08,T03] ( 7, 21 ) long -> rax
+; V09 loc6 [V09,T09] ( 2, 4.50) long -> rcx
+;# V10 OutArgs [V10 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V11 tmp1 [V11,T05] ( 2, 16 ) long -> rax "dup spill"
+;* V12 tmp2 [V12 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V13 tmp3 [V13 ] ( 2, 4.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp"
+; V14 tmp4 [V14,T04] ( 5, 16.25) int -> rax "Inline stloc first use temp"
 ;
 ; Lcl frame size = 48

 G_M56047_IG01:
        push rbp
        sub rsp, 48
        lea rbp, [rsp+0x30]
        ;; size=10 bbWeight=1 PerfScore 1.75
 G_M56047_IG02:
        cmp rsi, 32
        jb SHORT G_M56047_IG10
-       ;; NOP compensation instructions of 4 bytes.
-       ;; size=10 bbWeight=1 PerfScore 1.25
+       ;; size=6 bbWeight=1 PerfScore 1.25
 G_M56047_IG03:
-       movzx rdx, dl
        xor eax, eax
-       align [7 bytes for IG04]
-       ;; size=12 bbWeight=0.25 PerfScore 0.19
+       align [14 bytes for IG04]
+       ;; size=16 bbWeight=0.25 PerfScore 0.12
 G_M56047_IG04:
        lea rcx, [rbp-0x30]
        movsxd r8, eax
        mov byte ptr [rcx+r8], dl
        inc eax
        cmp eax, 32
        jl SHORT G_M56047_IG04
        ;; size=18 bbWeight=4 PerfScore 13.00
 G_M56047_IG05:
-       mov rax, rsi
-       and rax, -64
-       xor ecx, ecx
+       vmovups ymm0, ymmword ptr [rbp-0x30]
+       mov rdx, rsi
+       and rdx, -64
+       xor eax, eax
        cmp rsi, 64
        jb SHORT G_M56047_IG07
        align [0 bytes for IG06]
-       ;; size=15 bbWeight=0.50 PerfScore 1.00
+       ;; size=20 bbWeight=0.50 PerfScore 3.00
 G_M56047_IG06:
-       vmovups ymm0, ymmword ptr [rbp-0x30]
-       vmovups ymmword ptr [rdi+rcx], ymm0
-       vmovups ymm0, ymmword ptr [rbp-0x30]
-       vmovups ymmword ptr [rdi+rcx+0x20], ymm0
-       add rcx, 64
-       cmp rcx, rax
+       vmovups ymmword ptr [rdi+rax], ymm0
+       vmovups ymmword ptr [rdi+rax+0x20], ymm0
+       add rax, 64
+       cmp rax, rdx
        jb SHORT G_M56047_IG06
-       ;; size=30 bbWeight=4 PerfScore 54.00
+       ;; size=20 bbWeight=4 PerfScore 22.00
 G_M56047_IG07:
        test sil, 32
        je SHORT G_M56047_IG08
-       vmovups ymm0, ymmword ptr [rbp-0x30]
-       vmovups ymmword ptr [rdi+rcx], ymm0
-       ;; size=16 bbWeight=0.50 PerfScore 3.62
+       vmovups ymmword ptr [rdi+rax], ymm0
+       ;; size=11 bbWeight=0.50 PerfScore 1.62
 G_M56047_IG08:
-       vmovups ymm0, ymmword ptr [rbp-0x30]
        vmovups ymmword ptr [rdi+rsi-0x20], ymm0
-       ;; size=11 bbWeight=0.50 PerfScore 3.00
+       ;; size=6 bbWeight=0.50 PerfScore 1.00
 G_M56047_IG09:
        vzeroupper
        add rsp, 48
        pop rbp
        ret
        ;; size=9 bbWeight=0.50 PerfScore 1.38
 G_M56047_IG10:
        xor eax, eax
        cmp rsi, 8
        jb SHORT G_M56047_IG12
        mov rcx, rsi
        and rcx, -8
        align [0 bytes for IG11]
        ;; size=15 bbWeight=0.50 PerfScore 1.00
 G_M56047_IG11:
        mov byte ptr [rdi+rax], dl
        mov byte ptr [rdi+rax+0x01], dl
        mov byte ptr [rdi+rax+0x02], dl
        mov byte ptr [rdi+rax+0x03], dl
        mov byte ptr [rdi+rax+0x04], dl
        mov byte ptr [rdi+rax+0x05], dl
        mov byte ptr [rdi+rax+0x06], dl
        mov byte ptr [rdi+rax+0x07], dl
        add rax, 8
        cmp rax, rcx
        jb SHORT G_M56047_IG11
        ;; size=40 bbWeight=4 PerfScore 38.00
 G_M56047_IG12:
        test sil, 4
        je SHORT G_M56047_IG13
        mov byte ptr [rdi+rax], dl
        mov byte ptr [rdi+rax+0x01], dl
        mov byte ptr [rdi+rax+0x02], dl
        mov byte ptr [rdi+rax+0x03], dl
        add rax, 4
        ;; size=25 bbWeight=0.50 PerfScore 2.75
 G_M56047_IG13:
        test sil, 2
        je SHORT G_M56047_IG14
        mov byte ptr [rdi+rax], dl
        mov byte ptr [rdi+rax+0x01], dl
        add rax, 2
        ;; size=17 bbWeight=0.50 PerfScore 1.75
 G_M56047_IG14:
        test sil, 1
        je SHORT G_M56047_IG15
        mov byte ptr [rdi+rax], dl
        ;; size=9 bbWeight=0.50 PerfScore 1.12
 G_M56047_IG15:
        vzeroupper
        add rsp, 48
        pop rbp
        ret
        ;; size=9 bbWeight=0.50 PerfScore 1.38

-; Total bytes of code 246, prolog size 10, PerfScore 125.19, instruction count 73, allocated bytes for code 246 (MethodHash=d8592510) for method System.SpanHelpers:Fill[ubyte](byref,ulong,ubyte) (FullOpts)
+; Total bytes of code 231, prolog size 10, PerfScore 91.12, instruction count 69, allocated bytes for code 231 (MethodHash=d8592510) for method System.SpanHelpers:Fill[ubyte](byref,ulong,ubyte) (FullOpts)
```
-11 (-5.91 % of base) - System.SpanHelpers:Fill[int](byref,ulong,int)

```diff
 ; Assembly listing for method System.SpanHelpers:Fill[int](byref,ulong,int) (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 1 single block inlinees; 2 inlinees without PGO data
 ; Final local variable assignments
 ;
 ; V00 arg0 [V00,T03] ( 10, 6 ) byref -> rdi single-def
-; V01 arg1 [V01,T05] ( 8, 5.50) long -> rsi single-def
-; V02 arg2 [V02,T04] ( 10, 6 ) int -> rdx single-def
-; V03 loc0 [V03,T06] ( 12, 6 ) long -> rax
-;* V04 loc1 [V04 ] ( 0, 0 ) int -> zero-ref ld-addr-op
-; V05 loc2 [V05,T10] ( 5, 9.50) simd32 -> mm0 ld-addr-op
-; V06 loc3 [V06,T01] ( 5, 9.50) byref -> rdi single-def
-; V07 loc4 [V07,T08] ( 4, 2 ) long -> rax
-; V08 loc5 [V08,T07] ( 2, 4.50) long -> rcx
-; V09 loc6 [V09,T00] ( 7, 21 ) long -> rdx
-;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref
-;* V11 loc8 [V11 ] ( 0, 0 ) long -> zero-ref
-;# V12 OutArgs [V12 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V13 tmp1 [V13 ] ( 0, 0 ) long -> zero-ref "dup spill"
-; V14 tmp2 [V14 ] ( 10, 10 ) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "NewObj constructor temp"
-;* V15 tmp3 [V15 ] ( 0, 0 ) simd32 -> zero-ref
-;* V16 tmp4 [V16 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V17 tmp5 [V17,T09] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
-; V18 tmp6 [V18,T02] ( 9, 9 ) int -> rdx "Inlining Arg"
+; V01 arg1 [V01,T04] ( 8, 5.50) long -> rsi single-def
+; V02 arg2 [V02,T01] ( 17, 9.50) int -> rdx single-def
+; V03 loc0 [V03,T05] ( 12, 6 ) long -> rax
+; V04 loc1 [V04,T09] ( 5, 9.50) simd32 -> mm0
+; V05 loc2 [V05,T02] ( 5, 9.50) byref -> rdi single-def
+; V06 loc3 [V06,T07] ( 4, 2 ) long -> rdx
+; V07 loc4 [V07,T06] ( 2, 4.50) long -> rax
+; V08 loc5 [V08,T00] ( 7, 21 ) long -> rcx
+;* V09 loc6 [V09 ] ( 0, 0 ) long -> zero-ref
+;# V10 OutArgs [V10 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+;* V11 tmp1 [V11 ] ( 0, 0 ) long -> zero-ref "dup spill"
+;* V12 tmp2 [V12 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V13 tmp3 [V13 ] ( 0, 0 ) simd32 -> zero-ref
+;* V14 tmp4 [V14 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V15 tmp5 [V15 ] ( 9, 4.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp"
+;* V16 tmp6 [V16,T08] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
 ;
 ; Lcl frame size = 48

 G_M11887_IG01:
        push rbp
        sub rsp, 48
        lea rbp, [rsp+0x30]
        ;; size=10 bbWeight=1 PerfScore 1.75
 G_M11887_IG02:
        cmp rsi, 8
        jb SHORT G_M11887_IG08
        ;; size=6 bbWeight=1 PerfScore 1.25
 G_M11887_IG03:
-       vxorps ymm0, ymm0, ymm0
-       vmovups ymmword ptr [rbp-0x30], ymm0
        mov dword ptr [rbp-0x30], edx
        mov dword ptr [rbp-0x2C], edx
        mov dword ptr [rbp-0x28], edx
        mov dword ptr [rbp-0x24], edx
        mov dword ptr [rbp-0x20], edx
        mov dword ptr [rbp-0x1C], edx
        mov dword ptr [rbp-0x18], edx
        mov dword ptr [rbp-0x14], edx
        vmovups ymm0, ymmword ptr [rbp-0x30]
-       lea rax, [4*rsi]
-       mov rcx, rax
-       and rcx, -64
-       xor edx, edx
+       lea rdx, [4*rsi]
+       mov rax, rdx
+       and rax, -64
+       xor ecx, ecx
        cmp rsi, 16
        jb SHORT G_M11887_IG05
-       align [3 bytes for IG04]
-       ;; size=64 bbWeight=0.50 PerfScore 8.04
+       align [0 bytes for IG04]
+       ;; size=52 bbWeight=0.50 PerfScore 7.25
 G_M11887_IG04:
-       vmovups ymmword ptr [rdi+rdx], ymm0
-       vmovups ymmword ptr [rdi+rdx+0x20], ymm0
-       add rdx, 64
-       cmp rdx, rcx
+       vmovups ymmword ptr [rdi+rcx], ymm0
+       vmovups ymmword ptr [rdi+rcx+0x20], ymm0
+       add rcx, 64
+       cmp rcx, rax
        jb SHORT G_M11887_IG04
        ;; size=20 bbWeight=4 PerfScore 22.00
 G_M11887_IG05:
-       test al, 32
+       test dl, 32
        je SHORT G_M11887_IG06
-       vmovups ymmword ptr [rdi+rdx], ymm0
-       ;; size=9 bbWeight=0.50 PerfScore 1.62
+       vmovups ymmword ptr [rdi+rcx], ymm0
+       ;; size=10 bbWeight=0.50 PerfScore 1.62
 G_M11887_IG06:
-       vmovups ymmword ptr [rdi+rax-0x20], ymm0
+       vmovups ymmword ptr [rdi+rdx-0x20], ymm0
        ;; size=6 bbWeight=0.50 PerfScore 1.00
 G_M11887_IG07:
        vzeroupper
        add rsp, 48
        pop rbp
        ret
        ;; size=9 bbWeight=0.50 PerfScore 1.38
 G_M11887_IG08:
        xor eax, eax
        test sil, 4
        je SHORT G_M11887_IG09
        mov dword ptr [rdi+4*rax], edx
        mov dword ptr [rdi+4*rax+0x04], edx
        mov dword ptr [rdi+4*rax+0x08], edx
        mov dword ptr [rdi+4*rax+0x0C], edx
        add rax, 4
        ;; size=27 bbWeight=0.50 PerfScore 2.88
 G_M11887_IG09:
        test sil, 2
        je SHORT G_M11887_IG10
        mov dword ptr [rdi+4*rax], edx
        mov dword ptr [rdi+4*rax+0x04], edx
        add rax, 2
        ;; size=17 bbWeight=0.50 PerfScore 1.75
 G_M11887_IG10:
        test sil, 1
        je SHORT G_M11887_IG11
        mov dword ptr [rdi+4*rax], edx
        ;; size=9 bbWeight=0.50 PerfScore 1.12
 G_M11887_IG11:
        vzeroupper
        add rsp, 48
        pop rbp
        ret
        ;; size=9 bbWeight=0.50 PerfScore 1.38

-; Total bytes of code 186, prolog size 10, PerfScore 44.17, instruction count 56, allocated bytes for code 186 (MethodHash=2dc3d190) for method System.SpanHelpers:Fill[int](byref,ulong,int) (FullOpts)
+; Total bytes of code 175, prolog size 10, PerfScore 43.38, instruction count 54, allocated bytes for code 175 (MethodHash=2dc3d190) for method System.SpanHelpers:Fill[int](byref,ulong,int) (FullOpts)
```
-4 (-1.53 % of base) - System.SpanHelpers:Fill[double](byref,ulong,double)

```diff
 ; Assembly listing for method System.SpanHelpers:Fill[double](byref,ulong,double) (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX512 - Unix
 ; FullOpts code
 ; optimized code
 ; rbp based frame
 ; fully interruptible
 ; No PGO data
 ; 0 inlinees with PGO data; 1 single block inlinees; 2 inlinees without PGO data
 ; Final local variable assignments
 ;
 ; V00 arg0 [V00,T01] ( 18, 38 ) byref -> rdi single-def
 ; V01 arg1 [V01,T05] ( 10, 6.50) long -> rsi single-def
-; V02 arg2 [V02,T10] ( 18, 38 ) double -> mm0 single-def
+; V02 arg2 [V02,T10] ( 21, 39.50) double -> mm0 single-def
 ; V03 loc0 [V03,T00] ( 23, 50 ) long -> rax
-;* V04 loc1 [V04 ] ( 0, 0 ) double -> zero-ref ld-addr-op
-; V05 loc2 [V05,T11] ( 5, 9.50) simd32 -> mm0 ld-addr-op
-; V06 loc3 [V06,T04] ( 5, 9.50) byref -> rdi single-def
-; V07 loc4 [V07,T08] ( 4, 2 ) long -> rax
-; V08 loc5 [V08,T06] ( 2, 4.50) long -> rcx
-; V09 loc6 [V09,T02] ( 7, 21 ) long -> rdx
-;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref
-; V11 loc8 [V11,T07] ( 2, 4.50) long -> rcx
-;# V12 OutArgs [V12 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V13 tmp1 [V13,T03] ( 2, 16 ) long -> rax "dup spill"
-; V14 tmp2 [V14 ] ( 6, 6 ) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "NewObj constructor temp"
-;* V15 tmp3 [V15 ] ( 0, 0 ) simd32 -> zero-ref
-;* V16 tmp4 [V16 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V17 tmp5 [V17,T09] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
-; V18 tmp6 [V18,T12] ( 5, 5 ) double -> mm0 "Inlining Arg"
+; V04 loc1 [V04,T11] ( 5, 9.50) simd32 -> mm0
+; V05 loc2 [V05,T04] ( 5, 9.50) byref -> rdi single-def
+; V06 loc3 [V06,T08] ( 4, 2 ) long -> rax
+; V07 loc4 [V07,T06] ( 2, 4.50) long -> rcx
+; V08 loc5 [V08,T02] ( 7, 21 ) long -> rdx
+; V09 loc6 [V09,T07] ( 2, 4.50) long -> rcx
+;# V10 OutArgs [V10 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V11 tmp1 [V11,T03] ( 2, 16 ) long -> rax "dup spill"
+;* V12 tmp2 [V12 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V13 tmp3 [V13 ] ( 0, 0 ) simd32 -> zero-ref
+;* V14 tmp4 [V14 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V15 tmp5 [V15 ] ( 5, 2.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp"
+;* V16 tmp6 [V16,T09] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
 ;
 ; Lcl frame size = 48

 G_M41871_IG01:
        push rbp
        sub rsp, 48
        lea rbp, [rsp+0x30]
        ;; size=10 bbWeight=1 PerfScore 1.75
 G_M41871_IG02:
        cmp rsi, 4
        jb SHORT G_M41871_IG08
        ;; size=6 bbWeight=1 PerfScore 1.25
 G_M41871_IG03:
-       vxorps ymm1, ymm1, ymm1
-       vmovups ymmword ptr [rbp-0x30], ymm1
        vmovsd qword ptr [rbp-0x30], xmm0
        vmovsd qword ptr [rbp-0x28], xmm0
        vmovsd qword ptr [rbp-0x20], xmm0
        vmovsd qword ptr [rbp-0x18], xmm0
        vmovups ymm0, ymmword ptr [rbp-0x30]
        lea rax, [8*rsi]
        mov rcx, rax
        and rcx, -64
        xor edx, edx
        cmp rsi, 8
        jb SHORT G_M41871_IG05
        align [0 bytes for IG04]
-       ;; size=57 bbWeight=0.50 PerfScore 5.92
+       ;; size=48 bbWeight=0.50 PerfScore 5.25
 G_M41871_IG04:
        vmovups ymmword ptr [rdi+rdx], ymm0
        vmovups ymmword ptr [rdi+rdx+0x20], ymm0
        add rdx, 64
        cmp rdx, rcx
        jb SHORT G_M41871_IG04
        ;; size=20 bbWeight=4 PerfScore 22.00
 G_M41871_IG05:
        test al, 32
        je SHORT G_M41871_IG06
        vmovups ymmword ptr [rdi+rdx], ymm0
        ;; size=9 bbWeight=0.50 PerfScore 1.62
 G_M41871_IG06:
        vmovups ymmword ptr [rdi+rax-0x20], ymm0
        ;; size=6 bbWeight=0.50 PerfScore 1.00
 G_M41871_IG07:
        vzeroupper
        add rsp, 48
        pop rbp
        ret
        ;; size=9 bbWeight=0.50 PerfScore 1.38
 G_M41871_IG08:
        xor eax, eax
        cmp rsi, 8
        jb SHORT G_M41871_IG10
        mov rcx, rsi
        and rcx, -8
-       align [0 bytes for IG09]
-       ;; size=15 bbWeight=0.50 PerfScore 1.00
+       align [5 bytes for IG09]
+       ;; size=20 bbWeight=0.50 PerfScore 1.12
 G_M41871_IG09:
        vmovsd qword ptr [rdi+8*rax], xmm0
        vmovsd qword ptr [rdi+8*rax+0x08], xmm0
        vmovsd qword ptr [rdi+8*rax+0x10], xmm0
        vmovsd qword ptr [rdi+8*rax+0x18], xmm0
        vmovsd qword ptr [rdi+8*rax+0x20], xmm0
        vmovsd qword ptr [rdi+8*rax+0x28], xmm0
        vmovsd qword ptr [rdi+8*rax+0x30], xmm0
        vmovsd qword ptr [rdi+8*rax+0x38], xmm0
        add rax, 8
        cmp rax, rcx
        jb SHORT G_M41871_IG09
        ;; size=56 bbWeight=4 PerfScore 70.00
 G_M41871_IG10:
        test sil, 4
        je SHORT G_M41871_IG11
        vmovsd qword ptr [rdi+8*rax], xmm0
        vmovsd qword ptr [rdi+8*rax+0x08], xmm0
        vmovsd qword ptr [rdi+8*rax+0x10], xmm0
        vmovsd qword ptr [rdi+8*rax+0x18], xmm0
        add rax, 4
        ;; size=33 bbWeight=0.50 PerfScore 4.75
 G_M41871_IG11:
        test sil, 2
        je SHORT G_M41871_IG12
        vmovsd qword ptr [rdi+8*rax], xmm0
        vmovsd qword ptr [rdi+8*rax+0x08], xmm0
        add rax, 2
        ;; size=21 bbWeight=0.50 PerfScore 2.75
 G_M41871_IG12:
        test sil, 1
        je SHORT G_M41871_IG13
        vmovsd qword ptr [rdi+8*rax], xmm0
        ;; size=11 bbWeight=0.50 PerfScore 1.62
 G_M41871_IG13:
        vzeroupper
        add rsp, 48
        pop rbp
        ret
        ;; size=9 bbWeight=0.50 PerfScore 1.38

-; Total bytes of code 262, prolog size 10, PerfScore 116.42, instruction count 68, allocated bytes for code 262 (MethodHash=eb775c70) for method System.SpanHelpers:Fill[double](byref,ulong,double) (FullOpts)
+; Total bytes of code 258, prolog size 10, PerfScore 115.88, instruction count 66, allocated bytes for code 258 (MethodHash=eb775c70) for method System.SpanHelpers:Fill[double](byref,ulong,double) (FullOpts)
```
-4 (-1.57 % of base) - System.SpanHelpers:Fill[ushort](byref,ulong,ushort) ```diff ; Assembly listing for method System.SpanHelpers:Fill[ushort](byref,ulong,ushort) (FullOpts) ; Emitting BLENDED_CODE for X64 with AVX512 - Unix ; FullOpts code ; optimized code ; rbp based frame ; fully interruptible ; No PGO data ; 0 inlinees with PGO data; 1 single block inlinees; 2 inlinees without PGO data ; Final local variable assignments ; -; V00 arg0 [V00,T01] ( 18, 38 ) byref -> rdi single-def +; V00 arg0 [V00,T02] ( 18, 38 ) byref -> rdi single-def ; V01 arg1 [V01,T07] ( 10, 6.50) long -> rsi single-def -; V02 arg2 [V02,T02] ( 18, 37.75) ushort -> rdx single-def +; V02 arg2 [V02,T01] ( 18, 41.50) ushort -> rdx single-def ; V03 loc0 [V03,T00] ( 23, 50 ) long -> rax -; V04 loc1 [V04,T12] ( 2, 0.50) ushort -> rdx ld-addr-op -; V05 loc2 [V05,T13] ( 5, 9.50) simd32 -> mm0 ld-addr-op -; V06 loc3 [V06,T06] ( 5, 9.50) byref -> rdi single-def -; V07 loc4 [V07,T11] ( 4, 2 ) long -> rax -; V08 loc5 [V08,T09] ( 2, 4.50) long -> rcx -; V09 loc6 [V09,T03] ( 7, 21 ) long -> rdx -;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref -; V11 loc8 [V11,T10] ( 2, 4.50) long -> rcx -;# V12 OutArgs [V12 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace" -; V13 tmp1 [V13,T05] ( 2, 16 ) long -> rax "dup spill" -; V14 tmp2 [V14 ] ( 3, 9.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "NewObj constructor temp" -;* V15 tmp3 [V15 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp" -; V16 tmp4 [V16,T04] ( 5, 16.25) int -> rax "Inline stloc first use temp" -; V17 tmp5 [V17,T08] ( 2, 8.50) ushort -> rdx "Inlining Arg" +; V04 loc1 [V04,T11] ( 5, 9.50) simd32 -> mm0 +; V05 loc2 [V05,T06] ( 5, 9.50) byref -> rdi single-def +; V06 loc3 [V06,T10] ( 4, 2 ) long -> rdx +; V07 loc4 [V07,T08] ( 2, 4.50) long -> rax +; V08 loc5 [V08,T03] ( 7, 21 ) long -> rcx +; V09 loc6 [V09,T09] ( 2, 4.50) long -> rcx +;# V10 OutArgs [V10 ] ( 1, 1 ) struct ( 0) [rsp+0x00] 
do-not-enreg[XS] addr-exposed "OutgoingArgSpace" +; V11 tmp1 [V11,T05] ( 2, 16 ) long -> rax "dup spill" +;* V12 tmp2 [V12 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument" +;* V13 tmp3 [V13 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp" +; V14 tmp4 [V14 ] ( 2, 4.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" +; V15 tmp5 [V15,T04] ( 5, 16.25) int -> rax "Inline stloc first use temp" ; ; Lcl frame size = 48 G_M51983_IG01: push rbp sub rsp, 48 lea rbp, [rsp+0x30] ;; size=10 bbWeight=1 PerfScore 1.75 G_M51983_IG02: cmp rsi, 16 jb SHORT G_M51983_IG10 - ;; NOP compensation instructions of 4 bytes. - ;; size=10 bbWeight=1 PerfScore 1.25 + ;; size=6 bbWeight=1 PerfScore 1.25 G_M51983_IG03: - movzx rdx, dx - vxorps ymm0, ymm0, ymm0 - vmovups ymmword ptr [rbp-0x30], ymm0 xor eax, eax - align [0 bytes for IG04] - ;; size=14 bbWeight=0.25 PerfScore 0.46 + align [14 bytes for IG04] + ;; size=16 bbWeight=0.25 PerfScore 0.12 G_M51983_IG04: lea rcx, [rbp-0x30] movsxd r8, eax mov word ptr [rcx+2*r8], dx inc eax cmp eax, 16 jl SHORT G_M51983_IG04 ;; size=19 bbWeight=4 PerfScore 13.00 G_M51983_IG05: vmovups ymm0, ymmword ptr [rbp-0x30] - lea rax, [rsi+rsi] - mov rcx, rax - and rcx, -64 - xor edx, edx + lea rdx, [rsi+rsi] + mov rax, rdx + and rax, -64 + xor ecx, ecx cmp rsi, 32 jb SHORT G_M51983_IG07 - align [3 bytes for IG06] - ;; size=27 bbWeight=0.50 PerfScore 3.38 + align [0 bytes for IG06] + ;; size=24 bbWeight=0.50 PerfScore 3.25 G_M51983_IG06: - vmovups ymmword ptr [rdi+rdx], ymm0 - vmovups ymmword ptr [rdi+rdx+0x20], ymm0 - add rdx, 64 - cmp rdx, rcx + vmovups ymmword ptr [rdi+rcx], ymm0 + vmovups ymmword ptr [rdi+rcx+0x20], ymm0 + add rcx, 64 + cmp rcx, rax jb SHORT G_M51983_IG06 ;; size=20 bbWeight=4 PerfScore 22.00 G_M51983_IG07: - test al, 32 + test dl, 32 je SHORT G_M51983_IG08 - vmovups ymmword ptr [rdi+rdx], ymm0 - ;; size=9 bbWeight=0.50 PerfScore 1.62 + vmovups ymmword ptr [rdi+rcx], 
ymm0 + ;; size=10 bbWeight=0.50 PerfScore 1.62 G_M51983_IG08: - vmovups ymmword ptr [rdi+rax-0x20], ymm0 + vmovups ymmword ptr [rdi+rdx-0x20], ymm0 ;; size=6 bbWeight=0.50 PerfScore 1.00 G_M51983_IG09: vzeroupper add rsp, 48 pop rbp ret ;; size=9 bbWeight=0.50 PerfScore 1.38 G_M51983_IG10: xor eax, eax cmp rsi, 8 jb SHORT G_M51983_IG12 mov rcx, rsi and rcx, -8 align [0 bytes for IG11] ;; size=15 bbWeight=0.50 PerfScore 1.00 G_M51983_IG11: mov word ptr [rdi+2*rax], dx mov word ptr [rdi+2*rax+0x02], dx mov word ptr [rdi+2*rax+0x04], dx mov word ptr [rdi+2*rax+0x06], dx mov word ptr [rdi+2*rax+0x08], dx mov word ptr [rdi+2*rax+0x0A], dx mov word ptr [rdi+2*rax+0x0C], dx mov word ptr [rdi+2*rax+0x0E], dx add rax, 8 cmp rax, rcx jb SHORT G_M51983_IG11 ;; size=48 bbWeight=4 PerfScore 38.00 G_M51983_IG12: test sil, 4 je SHORT G_M51983_IG13 mov word ptr [rdi+2*rax], dx mov word ptr [rdi+2*rax+0x02], dx mov word ptr [rdi+2*rax+0x04], dx mov word ptr [rdi+2*rax+0x06], dx add rax, 4 ;; size=29 bbWeight=0.50 PerfScore 2.75 G_M51983_IG13: test sil, 2 je SHORT G_M51983_IG14 mov word ptr [rdi+2*rax], dx mov word ptr [rdi+2*rax+0x02], dx add rax, 2 ;; size=19 bbWeight=0.50 PerfScore 1.75 G_M51983_IG14: test sil, 1 je SHORT G_M51983_IG15 mov word ptr [rdi+2*rax], dx ;; size=10 bbWeight=0.50 PerfScore 1.12 G_M51983_IG15: vzeroupper add rsp, 48 pop rbp ret ;; size=9 bbWeight=0.50 PerfScore 1.38 -; Total bytes of code 254, prolog size 10, PerfScore 91.83, instruction count 73, allocated bytes for code 254 (MethodHash=267434f0) for method System.SpanHelpers:Fill[ushort](byref,ulong,ushort) (FullOpts) +; Total bytes of code 250, prolog size 10, PerfScore 91.38, instruction count 70, allocated bytes for code 250 (MethodHash=267434f0) for method System.SpanHelpers:Fill[ushort](byref,ulong,ushort) (FullOpts) ```
MihuBot commented 3 months ago

@xtqqczze