Open EgorBo opened 1 year ago
Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak See info in area-owners.md if you want to be subscribed.
Author: | EgorBo |
---|---|
Assignees: | - |
Labels: | `area-CodeGen-coreclr` |
Milestone: | - |
Another code snippet for testing:
// Repro method: chains four calls through Accumulate and returns the accumulated total.
static int Test()
{
var a = new Accumulator<int>();
// Accumulate(1) returns an It delegate; each following (n) invokes the delegate returned by the previous call.
a.Accumulate(1)(2)(3)(4);
// Value started at T.Zero and has had 1, 2, 3 and 4 added to it by the chain above.
return a.Value;
}
/// <summary>
/// Keeps a running sum in <see cref="Value"/>. Each call to <see cref="Accumulate"/> returns a
/// delegate of type <see cref="It"/>, so calls can be chained as <c>a.Accumulate(1)(2)(3)</c>.
/// </summary>
public class Accumulator<T> where T : INumber<T>
{
// Running total; initialized to T.Zero.
public T Value { get; set; } = T.Zero;
// Adds x to Value, then returns this same method converted to an It delegate (method-group conversion).
public It Accumulate(T x)
{
Value += x;
return Accumulate;
}
// Delegate type whose invocation itself yields another It; this is what makes the call chain possible.
public delegate It It(T x);
}
Codegen for `Test` with Tier1 + Dynamic PGO enabled:
G_M58954_IG01:
push rdi
push rsi
push rbp
push rbx
sub rsp, 40
;; size=8 bbWeight=1 PerfScore 4.25
G_M58954_IG02:
mov rcx, 0xD1FFAB1E ; Accumulator`1[int]
call CORINFO_HELP_NEWSFAST
mov rsi, rax
xor ecx, ecx
mov dword ptr [rsi+08H], ecx
inc dword ptr [rsi+08H]
mov rdi, 0xD1FFAB1E ; Accumulator`1+It[int]
mov rcx, rdi
call CORINFO_HELP_NEWSFAST
mov rbx, rax
lea rcx, bword ptr [rbx+08H]
mov rdx, rsi
call CORINFO_HELP_ASSIGN_REF
mov rcx, 0xD1FFAB1E ; code for Accumulator`1[int]:Accumulate(int):Accumulator`1+It[int]:this
mov qword ptr [rbx+18H], rcx
cmp qword ptr [rbx+18H], rcx
jne G_M58954_IG06
;; size=83 bbWeight=1 PerfScore 14.50
G_M58954_IG03:
mov rbx, gword ptr [rbx+08H]
add dword ptr [rbx+08H], 2
mov rcx, rdi
call CORINFO_HELP_NEWSFAST
mov rbp, rax
lea rcx, bword ptr [rbp+08H]
mov rdx, rbx
call CORINFO_HELP_ASSIGN_REF
mov rcx, 0xD1FFAB1E ; code for Accumulator`1[int]:Accumulate(int):Accumulator`1+It[int]:this
mov qword ptr [rbp+18H], rcx
cmp qword ptr [rbp+18H], rcx
jne SHORT G_M58954_IG07
mov rbp, gword ptr [rbp+08H]
add dword ptr [rbp+08H], 3
mov rcx, rdi
call CORINFO_HELP_NEWSFAST
mov rbx, rax
lea rcx, bword ptr [rbx+08H]
mov rdx, rbp
call CORINFO_HELP_ASSIGN_REF
mov rcx, 0xD1FFAB1E ; code for Accumulator`1[int]:Accumulate(int):Accumulator`1+It[int]:this
mov qword ptr [rbx+18H], rcx
cmp qword ptr [rbx+18H], rcx
jne SHORT G_M58954_IG08
mov rbx, gword ptr [rbx+08H]
add dword ptr [rbx+08H], 4
mov rcx, rdi
call CORINFO_HELP_NEWSFAST
mov rdi, rax
lea rcx, bword ptr [rdi+08H]
mov rdx, rbx
call CORINFO_HELP_ASSIGN_REF
mov rax, 0xD1FFAB1E ; code for Accumulator`1[int]:Accumulate(int):Accumulator`1+It[int]:this
mov qword ptr [rdi+18H], rax
;; size=147 bbWeight=0.50 PerfScore 18.25
G_M58954_IG04:
mov eax, dword ptr [rsi+08H]
;; size=3 bbWeight=1 PerfScore 2.00
G_M58954_IG05:
add rsp, 40
pop rbx
pop rbp
pop rsi
pop rdi
ret
;; size=9 bbWeight=1 PerfScore 3.25
G_M58954_IG06:
mov rcx, gword ptr [rbx+08H]
mov edx, 2
call [rbx+18H]Accumulator`1+It[int]:Invoke(int):Accumulator`1+It[int]:this
mov rbp, rax
;; size=15 bbWeight=0 PerfScore 0.00
G_M58954_IG07:
mov rcx, gword ptr [rbp+08H]
mov edx, 3
call [rbp+18H]Accumulator`1+It[int]:Invoke(int):Accumulator`1+It[int]:this
mov rbx, rax
;; size=15 bbWeight=0 PerfScore 0.00
G_M58954_IG08:
mov rcx, gword ptr [rbx+08H]
mov edx, 4
call [rbx+18H]Accumulator`1+It[int]:Invoke(int):Accumulator`1+It[int]:this
jmp SHORT G_M58954_IG04
With the right field seqs we get the following codegen:
; Assembly listing for method Program:Test(int):int
; Emitting BLENDED_CODE for X64 with AVX - Windows
; Tier-1 compilation
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; 2 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T02] ( 3, 3 ) int -> rsi single-def
; V01 OutArgs [V01 ] ( 1, 1 ) struct (32) [rsp+00H] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V02 tmp1 [V02,T01] ( 3, 6 ) ref -> rdi class-hnd exact single-def "NewObj constructor temp"
; V03 tmp2 [V03,T00] ( 4, 8 ) ref -> rsi class-hnd exact single-def "NewObj constructor temp"
; V04 tmp3 [V04,T03] ( 2, 2 ) int -> rax "guarded devirt return temp"
;* V05 tmp4 [V05 ] ( 0, 0 ) ref -> zero-ref class-hnd "guarded devirt this exact temp"
;
; Lcl frame size = 40
G_M46009_IG01: ;; offset=0000H
push rdi
push rsi
sub rsp, 40
mov esi, ecx
;; size=8 bbWeight=1 PerfScore 2.50
G_M46009_IG02: ;; offset=0008H
mov rcx, 0x7FFF9620D088 ; Program+<>c__DisplayClass3_0
call CORINFO_HELP_NEWSFAST
mov rdi, rax
mov dword ptr [rdi+08H], esi
mov rcx, 0x7FFF9620D5E8 ; System.Func`2[int,int]
call CORINFO_HELP_NEWSFAST
mov rsi, rax
lea rcx, bword ptr [rsi+08H]
mov rdx, rdi
call CORINFO_HELP_ASSIGN_REF
mov rax, 0x7FFF96221DD0 ; code for Program+<>c__DisplayClass3_0:<Test>b__0(int):int:this
mov qword ptr [rsi+18H], rax
mov rax, gword ptr [rsi+08H]
mov eax, dword ptr [rax+08H]
add eax, 100
;; size=75 bbWeight=1 PerfScore 11.25
G_M46009_IG03: ;; offset=0053H
add rsp, 40
pop rsi
pop rdi
ret
;; size=7 bbWeight=1 PerfScore 2.25
; Total bytes of code 90, prolog size 6, PerfScore 25.00, instruction count 23, allocated bytes for code 90 (MethodHash=5b844c46) for method Program:Test(int):int
Sadly we need some more improvements to get rid of the remaining parts.
The load (`mov rax, gword ptr [rsi+08H]`) is not eliminated, even though VN is able to prove that this equals the `Program+<>c__DisplayClass3_0` instance allocated earlier, and that the field `a` is equal to the argument to the method. This would need some sort of support for CSE of locals.
FWIW, codegen for the example posted by @hez2010 becomes
; Assembly listing for method Program:Test():int
; Emitting BLENDED_CODE for X64 with AVX - Windows
; Tier-1 compilation
; optimized code
; rsp based frame
; partially interruptible
; 4 inlinees with PGO data; 20 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 OutArgs [V00 ] ( 1, 1 ) struct (32) [rsp+00H] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V01 tmp1 [V01,T00] ( 6, 12 ) ref -> rsi class-hnd exact single-def "NewObj constructor temp"
; V02 tmp2 [V02,T07] ( 2, 4 ) ref -> rbp class-hnd exact single-def "spilling ret_expr"
;* V03 tmp3 [V03,T14] ( 0, 0 ) ref -> zero-ref class-hnd "spilling ret_expr"
;* V04 tmp4 [V04,T15] ( 0, 0 ) ref -> zero-ref class-hnd "spilling ret_expr"
;* V05 tmp5 [V05 ] ( 0, 0 ) ref -> zero-ref "guarded devirt return temp"
; V06 tmp6 [V06,T08] ( 3, 3 ) ref -> r14 class-hnd single-def "guarded devirt this exact temp"
;* V07 tmp7 [V07 ] ( 0, 0 ) ref -> zero-ref "guarded devirt return temp"
; V08 tmp8 [V08,T09] ( 3, 3 ) ref -> r14 class-hnd single-def "guarded devirt this exact temp"
;* V09 tmp9 [V09 ] ( 0, 0 ) ref -> zero-ref "guarded devirt return temp"
; V10 tmp10 [V10,T10] ( 3, 3 ) ref -> rbp class-hnd single-def "guarded devirt this exact temp"
; V11 tmp11 [V11,T01] ( 4, 8 ) ref -> rbp class-hnd exact single-def "NewObj constructor temp"
;* V12 tmp12 [V12 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
;* V13 tmp13 [V13 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
; V14 tmp14 [V14,T02] ( 3, 6 ) ref -> r15 class-hnd exact single-def "NewObj constructor temp"
;* V15 tmp15 [V15 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
;* V16 tmp16 [V16 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
; V17 tmp17 [V17,T03] ( 3, 6 ) ref -> r15 class-hnd exact single-def "NewObj constructor temp"
;* V18 tmp18 [V18 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
;* V19 tmp19 [V19 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
; V20 tmp20 [V20,T04] ( 3, 6 ) ref -> rdi class-hnd exact single-def "NewObj constructor temp"
;* V21 tmp21 [V21 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
;* V22 tmp22 [V22 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
; V23 cse0 [V23,T11] ( 3, 3 ) int -> rdi "CSE - moderate"
; V24 cse1 [V24,T12] ( 3, 3 ) int -> rdi "CSE - moderate"
; V25 cse2 [V25,T13] ( 3, 3 ) int -> rdi "CSE - aggressive"
; V26 cse3 [V26,T06] ( 4, 4 ) ref -> rbp "CSE - moderate"
; V27 cse4 [V27,T05] ( 5, 5 ) long -> rbx "CSE - aggressive"
;
; Lcl frame size = 40
G_M58954_IG01: ;; offset=0000H
push r15
push r14
push rdi
push rsi
push rbp
push rbx
sub rsp, 40
;; size=12 bbWeight=1 PerfScore 6.25
G_M58954_IG02: ;; offset=000CH
mov rcx, 0x7FFF964A0120 ; Program+Accumulator`1[int]
call CORINFO_HELP_NEWSFAST
mov rsi, rax
xor ecx, ecx
mov dword ptr [rsi+08H], ecx
mov edi, dword ptr [rsi+08H]
inc edi
mov dword ptr [rsi+08H], edi
mov rbx, 0x7FFF964A3408 ; Program+Accumulator`1+It[int]
mov rcx, rbx
call CORINFO_HELP_NEWSFAST
mov rbp, rax
lea rcx, bword ptr [rbp+08H]
mov rdx, rsi
call CORINFO_HELP_ASSIGN_REF
mov rcx, 0x7FFF96482520 ; code for Program+Accumulator`1[int]:Accumulate(int):Program+Accumulator`1+It[int]:this
mov qword ptr [rbp+18H], rcx
mov rbp, gword ptr [rbp+08H]
mov r14, rbp
add edi, 2
mov dword ptr [r14+08H], edi
mov rcx, rbx
call CORINFO_HELP_NEWSFAST
mov r15, rax
lea rcx, bword ptr [r15+08H]
mov rdx, r14
call CORINFO_HELP_ASSIGN_REF
mov rcx, 0x7FFF96482520 ; code for Program+Accumulator`1[int]:Accumulate(int):Program+Accumulator`1+It[int]:this
mov qword ptr [r15+18H], rcx
mov r14, rbp
add edi, 3
mov dword ptr [r14+08H], edi
mov rcx, rbx
call CORINFO_HELP_NEWSFAST
mov r15, rax
lea rcx, bword ptr [r15+08H]
mov rdx, r14
call CORINFO_HELP_ASSIGN_REF
mov rcx, 0x7FFF96482520 ; code for Program+Accumulator`1[int]:Accumulate(int):Program+Accumulator`1+It[int]:this
mov qword ptr [r15+18H], rcx
add edi, 4
mov dword ptr [rbp+08H], edi
mov rcx, rbx
call CORINFO_HELP_NEWSFAST
mov rdi, rax
lea rcx, bword ptr [rdi+08H]
mov rdx, rbp
call CORINFO_HELP_ASSIGN_REF
mov rax, 0x7FFF96482520 ; code for Program+Accumulator`1[int]:Accumulate(int):Program+Accumulator`1+It[int]:this
mov qword ptr [rdi+18H], rax
mov eax, dword ptr [rsi+08H]
;; size=222 bbWeight=1 PerfScore 32.50
G_M58954_IG03: ;; offset=00EAH
add rsp, 40
pop rbx
pop rbp
pop rsi
pop rdi
pop r14
pop r15
ret
;; size=13 bbWeight=1 PerfScore 4.25
; Total bytes of code 247, prolog size 12, PerfScore 67.70, instruction count 66, allocated bytes for code 247 (MethodHash=347c19b5) for method Program:Test():int
with the proper field seqs.
This turns out to be a bit more tricky than first expected, due to the following factors:
I'll leave this as-is for .NET 8 and try to revisit it in .NET 9.
> With the right field seqs we get the following codegen:

@jakobbotsch I don't see this with main, are these field seqs from some provisional change you never PR'd? Escape analysis may be able to get rid of one of the newobjs (the other requires fieldwise analysis).
Yeah, I never ended up PR'ing anything here.
With https://github.com/hez2010/runtime/tree/field-stackalloc:
; Assembly listing for method StackAllocationTest.TestClass:Test(int):int (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Windows
; Tier1 code
; optimized code
; optimized using Synthesized PGO
; rsp based frame
; partially interruptible
; with Synthesized PGO: fgCalledCount is 100
; No PGO data
; 2 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 3, 3 ) int -> rcx single-def
; V01 OutArgs [V01 ] ( 1, 1 ) struct (32) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V02 tmp1 [V02 ] ( 0, 0 ) long -> zero-ref class-hnd exact "NewObj constructor temp" <StackAllocationTest.TestClass+<>c__DisplayClass7_0>
;* V03 tmp2 [V03 ] ( 0, 0 ) long -> zero-ref class-hnd exact "NewObj constructor temp" <System.Func`2[int,int]>
; V04 tmp3 [V04,T01] ( 3, 2 ) int -> rax "guarded devirt return temp"
; V05 tmp4 [V05,T02] ( 2, 2 ) byref -> rax class-hnd single-def "guarded devirt this exact temp" <StackAllocationTest.TestClass+<>c__DisplayClass7_0>
; V06 tmp5 [V06 ] ( 4, 4 ) struct (16) [rsp+0x58] do-not-enreg[XSF] addr-exposed "stack allocated ref class temp" <StackAllocationTest.TestClass+<>c__DisplayClass7_0>
; V07 tmp6 [V07 ] ( 6, 5 ) struct (56) [rsp+0x20] do-not-enreg[XSF] must-init addr-exposed "stack allocated ref class temp" <System.Func`2[int,int]>
; V08 rat0 [V08,T03] ( 3, 0 ) long -> rax "delegate invoke call"
;
; Lcl frame size = 104
G_M27987_IG01: ;; offset=0x0000
sub rsp, 104
vxorps xmm4, xmm4, xmm4
vmovdqu ymmword ptr [rsp+0x20], ymm4
vmovdqa xmmword ptr [rsp+0x40], xmm4
xor eax, eax
mov qword ptr [rsp+0x50], rax
;; size=27 bbWeight=1 PerfScore 5.83
G_M27987_IG02: ;; offset=0x001B
xor eax, eax
mov qword ptr [rsp+0x58], rax
mov dword ptr [rsp+0x60], eax
mov rax, 0x7FFEC317D7B8 ; StackAllocationTest.TestClass+<>c__DisplayClass7_0
mov qword ptr [rsp+0x58], rax
mov dword ptr [rsp+0x60], ecx
mov rax, 0x7FFEC317DA58 ; System.Func`2[int,int]
mov qword ptr [rsp+0x20], rax
lea rax, [rsp+0x58]
mov qword ptr [rsp+0x28], rax
mov rax, 0x7FFEC2EAA2C8 ; code for StackAllocationTest.TestClass+<>c__DisplayClass7_0:<Test>b__0(int):int:this
mov qword ptr [rsp+0x38], rax
cmp qword ptr [rsp+0x38], rax
jne SHORT G_M27987_IG04
mov rax, gword ptr [rsp+0x28]
mov eax, dword ptr [rax+0x08]
add eax, 100
;; size=88 bbWeight=1 PerfScore 14.75
G_M27987_IG03: ;; offset=0x0073
add rsp, 104
ret
;; size=5 bbWeight=1 PerfScore 1.25
G_M27987_IG04: ;; offset=0x0078
lea rax, [rsp+0x20]
mov edx, 100
mov rcx, gword ptr [rax+0x08]
call [rax+0x18]System.Func`2[int,int]:Invoke(int):int:this
jmp SHORT G_M27987_IG03
;; size=19 bbWeight=0 PerfScore 0.00
; Total bytes of code 139, prolog size 27, PerfScore 21.83, instruction count 30, allocated bytes for code 139 (MethodHash=333592ac) for method StackAllocationTest.TestClass:Test(int):int (Tier1)
And my snippet becomes:
; Assembly listing for method StackAllocationTest.TestClass:Test():int (Tier1)
; Emitting BLENDED_CODE for X64 with AVX - Windows
; Tier1 code
; optimized code
; rsp based frame
; partially interruptible
; 4 inlinees with PGO data; 20 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
; V00 OutArgs [V00 ] ( 1, 1 ) struct (32) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;* V01 tmp1 [V01 ] ( 0, 0 ) long -> zero-ref class-hnd exact "NewObj constructor temp" <StackAllocationTest.TestClass+Accumulator`1[int]>
;* V02 tmp2 [V02 ] ( 0, 0 ) long -> zero-ref class-hnd exact "spilling ret_expr" <StackAllocationTest.TestClass+Accumulator`1+It[int]>
; V03 tmp3 [V03,T04] ( 4, 1 ) byref -> rbx class-hnd exact "spilling ret_expr" <StackAllocationTest.TestClass+Accumulator`1+It[int]>
; V04 tmp4 [V04,T05] ( 4, 1 ) byref -> rsi class-hnd exact "spilling ret_expr" <StackAllocationTest.TestClass+Accumulator`1+It[int]>
; V05 tmp5 [V05,T07] ( 2, 0 ) byref -> rbx single-def "guarded devirt return temp"
; V06 tmp6 [V06,T01] ( 4, 2 ) byref -> rdx class-hnd single-def "guarded devirt this exact temp" <StackAllocationTest.TestClass+Accumulator`1[int]>
; V07 tmp7 [V07,T08] ( 2, 0 ) byref -> rsi single-def "guarded devirt return temp"
; V08 tmp8 [V08,T02] ( 4, 2 ) byref -> rdx class-hnd single-def "guarded devirt this exact temp" <StackAllocationTest.TestClass+Accumulator`1[int]>
;* V09 tmp9 [V09 ] ( 0, 0 ) byref -> zero-ref "guarded devirt return temp"
; V10 tmp10 [V10,T03] ( 4, 2 ) byref -> rdx class-hnd single-def "guarded devirt this exact temp" <StackAllocationTest.TestClass+Accumulator`1[int]>
;* V11 tmp11 [V11 ] ( 0, 0 ) long -> zero-ref class-hnd exact "NewObj constructor temp" <StackAllocationTest.TestClass+Accumulator`1+It[int]>
;* V12 tmp12 [V12 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
;* V13 tmp13 [V13 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
;* V14 tmp14 [V14 ] ( 0, 0 ) long -> zero-ref class-hnd exact "NewObj constructor temp" <StackAllocationTest.TestClass+Accumulator`1+It[int]>
;* V15 tmp15 [V15 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
;* V16 tmp16 [V16 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
;* V17 tmp17 [V17 ] ( 0, 0 ) long -> zero-ref class-hnd exact "NewObj constructor temp" <StackAllocationTest.TestClass+Accumulator`1+It[int]>
;* V18 tmp18 [V18 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
;* V19 tmp19 [V19 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
;* V20 tmp20 [V20 ] ( 0, 0 ) long -> zero-ref class-hnd exact "NewObj constructor temp" <StackAllocationTest.TestClass+Accumulator`1+It[int]>
;* V21 tmp21 [V21 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
;* V22 tmp22 [V22 ] ( 0, 0 ) int -> zero-ref "Inlining Arg"
; V23 tmp23 [V23 ] ( 7, 7 ) struct (16) [rsp+0x108] do-not-enreg[XSF] addr-exposed "stack allocated ref class temp" <StackAllocationTest.TestClass+Accumulator`1[int]>
; V24 tmp24 [V24 ] ( 6, 4.50) struct (56) [rsp+0xD0] do-not-enreg[XSF] must-init addr-exposed "stack allocated ref class temp" <StackAllocationTest.TestClass+Accumulator`1+It[int]>
; V25 tmp25 [V25 ] ( 6, 3 ) struct (56) [rsp+0x98] do-not-enreg[XSF] must-init addr-exposed "stack allocated ref class temp" <StackAllocationTest.TestClass+Accumulator`1+It[int]>
; V26 tmp26 [V26 ] ( 6, 3 ) struct (56) [rsp+0x60] do-not-enreg[XSF] must-init addr-exposed "stack allocated ref class temp" <StackAllocationTest.TestClass+Accumulator`1+It[int]>
; V27 tmp27 [V27 ] ( 3, 1.50) struct (56) [rsp+0x28] do-not-enreg[XSF] must-init addr-exposed "stack allocated ref class temp" <StackAllocationTest.TestClass+Accumulator`1+It[int]>
; V28 cse0 [V28,T00] ( 5, 3.50) long -> rax "CSE #01: aggressive"
; V29 rat0 [V29,T06] ( 3, 0 ) long -> rax "delegate invoke call"
;
; Lcl frame size = 280
G_M62816_IG01: ;; offset=0x0000
push rsi
push rbx
sub rsp, 280
xor eax, eax
mov qword ptr [rsp+0x28], rax
vxorps xmm4, xmm4, xmm4
vmovdqa xmmword ptr [rsp+0x30], xmm4
mov rax, -192
vmovdqa xmmword ptr [rsp+rax+0x100], xmm4
vmovdqa xmmword ptr [rsp+rax+0x110], xmm4
vmovdqa xmmword ptr [rsp+rax+0x120], xmm4
add rax, 48
jne SHORT -5 instr
mov qword ptr [rsp+0x100], rax
;; size=77 bbWeight=1 PerfScore 14.33
G_M62816_IG02: ;; offset=0x004D
xor eax, eax
mov qword ptr [rsp+0x108], rax
mov dword ptr [rsp+0x110], eax
mov rax, 0x7FFEC3192B40 ; StackAllocationTest.TestClass+Accumulator`1[int]
mov qword ptr [rsp+0x108], rax
xor eax, eax
mov dword ptr [rsp+0x110], eax
mov eax, dword ptr [rsp+0x110]
inc eax
mov dword ptr [rsp+0x110], eax
mov rax, 0x7FFEC31966A8 ; StackAllocationTest.TestClass+Accumulator`1+It[int]
mov qword ptr [rsp+0xD0], rax
lea rdx, [rsp+0x108]
mov qword ptr [rsp+0xD8], rdx
mov rdx, 0x7FFEC2EAA2C8 ; code for StackAllocationTest.TestClass+Accumulator`1[int]:Accumulate(int):StackAllocationTest.TestClass+Accumulator`1+It[int]:this
mov qword ptr [rsp+0xE8], rdx
cmp qword ptr [rsp+0xE8], rdx
jne G_M62816_IG06
;; size=126 bbWeight=1 PerfScore 14.00
G_M62816_IG03: ;; offset=0x00CB
mov rdx, gword ptr [rsp+0xD8]
add dword ptr [rdx+0x08], 2
mov qword ptr [rsp+0x98], rax
mov bword ptr [rsp+0xA0], rdx
mov rdx, 0x7FFEC2EAA2C8 ; code for StackAllocationTest.TestClass+Accumulator`1[int]:Accumulate(int):StackAllocationTest.TestClass+Accumulator`1+It[int]:this
mov qword ptr [rsp+0xB0], rdx
lea rbx, bword ptr [rsp+0x98]
cmp qword ptr [rsp+0xB0], rdx
jne SHORT G_M62816_IG07
mov rdx, gword ptr [rsp+0xA0]
add dword ptr [rdx+0x08], 3
mov qword ptr [rsp+0x60], rax
mov bword ptr [rsp+0x68], rdx
mov rdx, 0x7FFEC2EAA2C8 ; code for StackAllocationTest.TestClass+Accumulator`1[int]:Accumulate(int):StackAllocationTest.TestClass+Accumulator`1+It[int]:this
mov qword ptr [rsp+0x78], rdx
lea rsi, bword ptr [rsp+0x60]
cmp qword ptr [rsp+0x78], rdx
jne SHORT G_M62816_IG08
mov rdx, gword ptr [rsp+0x68]
add dword ptr [rdx+0x08], 4
mov qword ptr [rsp+0x28], rax
mov bword ptr [rsp+0x30], rdx
mov rax, 0x7FFEC2EAA2C8 ; code for StackAllocationTest.TestClass+Accumulator`1[int]:Accumulate(int):StackAllocationTest.TestClass+Accumulator`1+It[int]:this
mov qword ptr [rsp+0x40], rax
;; size=147 bbWeight=0.50 PerfScore 14.38
G_M62816_IG04: ;; offset=0x015E
mov eax, dword ptr [rsp+0x110]
;; size=7 bbWeight=1 PerfScore 1.00
G_M62816_IG05: ;; offset=0x0165
add rsp, 280
pop rbx
pop rsi
ret
;; size=10 bbWeight=1 PerfScore 2.25
G_M62816_IG06: ;; offset=0x016F
lea rax, [rsp+0xD0]
mov edx, 2
mov rcx, gword ptr [rax+0x08]
call [rax+0x18]StackAllocationTest.TestClass+Accumulator`1+It[int]:Invoke(int):StackAllocationTest.TestClass+Accumulator`1+It[int]:this
mov rbx, rax
;; size=23 bbWeight=0 PerfScore 0.00
G_M62816_IG07: ;; offset=0x0186
mov edx, 3
mov rcx, gword ptr [rbx+0x08]
call [rbx+0x18]StackAllocationTest.TestClass+Accumulator`1+It[int]:Invoke(int):StackAllocationTest.TestClass+Accumulator`1+It[int]:this
mov rsi, rax
;; size=15 bbWeight=0 PerfScore 0.00
G_M62816_IG08: ;; offset=0x0195
mov edx, 4
mov rcx, gword ptr [rsi+0x08]
call [rsi+0x18]StackAllocationTest.TestClass+Accumulator`1+It[int]:Invoke(int):StackAllocationTest.TestClass+Accumulator`1+It[int]:this
jmp SHORT G_M62816_IG04
;; size=14 bbWeight=0 PerfScore 0.00
; Total bytes of code 419, prolog size 77, PerfScore 45.96, instruction count 74, allocated bytes for code 419 (MethodHash=5c2a0a9f) for method StackAllocationTest.TestClass:Test():int (Tier1)
All allocations are successfully removed!
Seems like this one might be nice to target for .NET 10 as an improvement to object stack allocation.
Similar to what https://github.com/dotnet/runtime/pull/84661 does but for delegates now, e.g.:
Codegen for `Test(int a)` in Tier1 with Dynamic PGO enabled:
The delegate is correctly inlined here under a guard. Since we use a closure here, the delegate is always allocated at the callsite, so we always know its exact type and the exact method target, so we can optimize the GDV check and remove the software fallback:
we write a value to a memory location and then compare that location with the same value. @jakobbotsch said the reason CSE didn't fold it is a missing `FieldSeq*` that we can add when creating that GDV check (might require a new JIT-EE API to get a field handle of the `_methodPtr` field in `Delegate`).
Follow up questions
1) Once we remove the check - can we also remove the unnecessary delegate instance allocation?
2) For static lambdas Roslyn currently emits a non-JIT-friendly pattern with a static mutable field, e.g. sharplab.io, so there is no way the JIT can track the target - that can be improved on the Roslyn side - https://github.com/dotnet/csharplang/discussions/6746#discussioncomment-5587815