dotnet / runtime

.NET is a cross-platform runtime for cloud, mobile, desktop, and IoT apps.
https://docs.microsoft.com/dotnet/core/
MIT License
15k stars 4.67k forks source link

JIT: More elaborate store forwarding patterns with physical promotion #86665

Open jakobbotsch opened 1 year ago

jakobbotsch commented 1 year ago

I noticed the following diff with physical promotion enabled:

+22 (+733.33%) : 48321.dasm - Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextStateMap:GetContextForFileStart():Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState

@@ -8,20 +8,29 @@
 ; Final local variable assignments
 ;
 ;# V00 OutArgs      [V00    ] (  1,  1   )  struct ( 0) [rsp+00H]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V01 tmp1         [V01    ] (  0,  0   )  struct ( 8) zero-ref    do-not-enreg[SF] ld-addr-op "NewObj constructor temp"
+;  V01 tmp1         [V01,T00] (  4,  8   )  struct ( 8) [rsp+00H]   do-not-enreg[SF] ld-addr-op "NewObj constructor temp"
+;* V02 tmp2         [V02    ] (  0,  0   )     int  ->  zero-ref    "V01.[000..004)"
+;* V03 tmp3         [V03,T01] (  0,  0   )   ubyte  ->  zero-ref    "V01.[004..005)"
+;* V04 tmp4         [V04,T02] (  0,  0   )   ubyte  ->  zero-ref    "V01.[005..006)"
 ;
-; Lcl frame size = 0
+; Lcl frame size = 8

 G_M10451_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG
-                       ;; size=0 bbWeight=1 PerfScore 0.00
+       push     rax
+                       ;; size=1 bbWeight=1 PerfScore 1.00
 G_M10451_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        xor      eax, eax
-                       ;; size=2 bbWeight=1 PerfScore 0.25
+       mov      dword ptr [rsp], eax
+       mov      byte  ptr [rsp+04H], 0
+       mov      byte  ptr [rsp+05H], 0
+       mov      rax, qword ptr [rsp]
+                       ;; size=19 bbWeight=1 PerfScore 4.25
 G_M10451_IG03:        ; bbWeight=1, epilog, nogc, extend
+       add      rsp, 8
        ret      
-                       ;; size=1 bbWeight=1 PerfScore 1.00
+                       ;; size=5 bbWeight=1 PerfScore 1.25

-; Total bytes of code 3, prolog size 0, PerfScore 1.55, instruction count 2, allocated bytes for code 3 (MethodHash=08d0d72c) for method Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextStateMap:GetContextForFileStart():Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState
+; Total bytes of code 25, prolog size 1, PerfScore 9.00, instruction count 8, allocated bytes for code 25 (MethodHash=08d0d72c) for method Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextStateMap:GetContextForFileStart():Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState
 ; ============================================================

This is the result of zero-initializing the struct, reading it back, and returning it, where the struct has two padding bytes at the end. To catch this, we would need to teach the store-forwarding optimization in lowering about padding (or alternatively allow RETURN(FIELD_LIST)).

Normal promotion just sees a block init with holes and marks the local as do-not-enregister (DNER).

ghost commented 1 year ago

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch
See info in area-owners.md if you want to be subscribed.

Issue Details
I noticed the following diff with physical promotion enabled:

`+22 (+733.33%) : 48321.dasm - Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextStateMap:GetContextForFileStart():Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState`

```diff
@@ -8,20 +8,29 @@
 ; Final local variable assignments
 ;
 ;# V00 OutArgs      [V00    ] (  1,  1   )  struct ( 0) [rsp+00H]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V01 tmp1         [V01    ] (  0,  0   )  struct ( 8) zero-ref    do-not-enreg[SF] ld-addr-op "NewObj constructor temp"
+;  V01 tmp1         [V01,T00] (  4,  8   )  struct ( 8) [rsp+00H]   do-not-enreg[SF] ld-addr-op "NewObj constructor temp"
+;* V02 tmp2         [V02    ] (  0,  0   )     int  ->  zero-ref    "V01.[000..004)"
+;* V03 tmp3         [V03,T01] (  0,  0   )   ubyte  ->  zero-ref    "V01.[004..005)"
+;* V04 tmp4         [V04,T02] (  0,  0   )   ubyte  ->  zero-ref    "V01.[005..006)"
 ;
-; Lcl frame size = 0
+; Lcl frame size = 8

 G_M10451_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG
-                       ;; size=0 bbWeight=1 PerfScore 0.00
+       push     rax
+                       ;; size=1 bbWeight=1 PerfScore 1.00
 G_M10451_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
        xor      eax, eax
-                       ;; size=2 bbWeight=1 PerfScore 0.25
+       mov      dword ptr [rsp], eax
+       mov      byte  ptr [rsp+04H], 0
+       mov      byte  ptr [rsp+05H], 0
+       mov      rax, qword ptr [rsp]
+                       ;; size=19 bbWeight=1 PerfScore 4.25
 G_M10451_IG03:        ; bbWeight=1, epilog, nogc, extend
+       add      rsp, 8
        ret
-                       ;; size=1 bbWeight=1 PerfScore 1.00
+                       ;; size=5 bbWeight=1 PerfScore 1.25

-; Total bytes of code 3, prolog size 0, PerfScore 1.55, instruction count 2, allocated bytes for code 3 (MethodHash=08d0d72c) for method Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextStateMap:GetContextForFileStart():Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState
+; Total bytes of code 25, prolog size 1, PerfScore 9.00, instruction count 8, allocated bytes for code 25 (MethodHash=08d0d72c) for method Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextStateMap:GetContextForFileStart():Microsoft.CodeAnalysis.CSharp.Syntax.NullableContextState
 ; ============================================================
```

This is the result of a zero init of the struct + read back + return, with the struct having two padding bytes at the end. We would need to teach the store forwarding optimization in lowering about padding to catch this (or alternatively allow `RETURN(FIELD_LIST)`).
Author: jakobbotsch
Assignees: -
Labels: `area-CodeGen-coreclr`
Milestone: -
jakobbotsch commented 1 month ago

My long-term plan here is to make FIELD_LIST a better supported node and start using it for GT_RETURN as well. That'll allow the JIT to transform to that representation for returns.

jakobbotsch commented 3 weeks ago

Once the work is done for more first-class support for FIELD_LIST, we should also be able to use it to fix the TODO mentioned in https://github.com/dotnet/runtime/pull/78131#issuecomment-2313303662.