dotnet / runtime

.NET is a cross-platform runtime for cloud, mobile, desktop, and IoT apps.
https://docs.microsoft.com/dotnet/core/
MIT License
15.27k stars 4.73k forks source link

Inlined struct copies via params, returns and assignment not elided #12219

Closed benaadams closed 1 year ago

benaadams commented 5 years ago

As seen in ValueTaskAwaiter from https://github.com/dotnet/coreclr/pull/22735, https://github.com/dotnet/coreclr/pull/22738 wasn't a specific issue for it with simple repo so opening this.

using System.Runtime.CompilerServices;

class Program
{
    static void Main(string[] args)
    {
        InlinedAssignment();
        InlinedCtor();
    }

    [MethodImpl(MethodImplOptions.NoInlining)]
    static long InlinedAssignment()
    {
        var s = CreateLargeStruct();
        s = GetLargeStruct(s);

        return s.l3;
    }

    [MethodImpl(MethodImplOptions.NoInlining)]
    static long InlinedCtor()
    {
        var s = new LargeStruct2(CreateLargeStruct());

        return s._largeStruct.l3;
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    static LargeStruct GetLargeStruct(LargeStruct l) => l;

    [MethodImpl(MethodImplOptions.NoInlining)]
    static LargeStruct CreateLargeStruct() => new LargeStruct();
}

readonly struct LargeStruct
{
    public readonly long l0;
    public readonly long l1;
    public readonly long l2;
    public readonly long l3;
}

readonly struct LargeStruct2
{
    public readonly LargeStruct _largeStruct;

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public LargeStruct2(LargeStruct largeStruct)
    {
        _largeStruct = largeStruct;
    }
}

Produces

; Assembly listing for method Program:InlinedAssignment():long
;  V00 OutArgs      [V00    ] (  1,  1   )  lclBlk (32) [rsp+0x00]   "OutgoingArgSpace"
;  V01 tmp1         [V01    ] (  2,  4   )  struct (32) [rsp+0x48]   do-not-enreg[XSB] addr-exposed "struct address for call/obj"
;* V02 tmp2         [V02    ] (  0,  0   )  struct (32) zero-ref    "struct address for call/obj"
;  V03 tmp3         [V03    ] (  2,  4   )  struct (32) [rsp+0x28]   do-not-enreg[XSB] addr-exposed "Inlining Arg"
;* V04 tmp4         [V04    ] (  0,  0   )    long  ->  zero-ref    V02.l0(offs=0x00) P-INDEP "field V02.l0 (fldOffset=0x0)"
;* V05 tmp5         [V05    ] (  0,  0   )    long  ->  zero-ref    V02.l1(offs=0x08) P-INDEP "field V02.l1 (fldOffset=0x8)"
;* V06 tmp6         [V06    ] (  0,  0   )    long  ->  zero-ref    V02.l2(offs=0x10) P-INDEP "field V02.l2 (fldOffset=0x10)"
;  V07 tmp7         [V07,T01] (  2,  2   )    long  ->  rax         V02.l3(offs=0x18) P-INDEP "field V02.l3 (fldOffset=0x18)"
;  V08 tmp8         [V08,T00] (  3,  6   )   byref  ->  rax         "BlockOp address local"
; Lcl frame size = 104

G_M60975_IG01:
       4883EC68             sub      rsp, 104
       C5F877               vzeroupper 

G_M60975_IG02:
       488D4C2448           lea      rcx, bword ptr [rsp+48H]
       E8BF67FFFF           call     Program:CreateLargeStruct():struct
       C5FA6F442448         vmovdqu  xmm0, qword ptr [rsp+48H]
       C5FA7F442428         vmovdqu  qword ptr [rsp+28H], xmm0
       C5FA6F442458         vmovdqu  xmm0, qword ptr [rsp+58H]
       C5FA7F442438         vmovdqu  qword ptr [rsp+38H], xmm0
       488D442428           lea      rax, bword ptr [rsp+28H]
       488B10               mov      rdx, qword ptr [rax]
       488B4018             mov      rax, qword ptr [rax+24]

G_M60975_IG03:
       4883C468             add      rsp, 104
       C3                   ret      

; Total bytes of code 58, prolog size 7 for method Program:InlinedAssignment():long

and

; Assembly listing for method Program:InlinedCtor():long
;  V00 OutArgs      [V00    ] (  1,  1   )  lclBlk (32) [rsp+0x00]   "OutgoingArgSpace"
;  V01 tmp1         [V01,T00] (  2,  4   )  struct (32) [rsp+0x68]   do-not-enreg[SFB] "NewObj constructor temp"
;  V02 tmp2         [V02    ] (  2,  4   )  struct (32) [rsp+0x48]   do-not-enreg[XSB] addr-exposed "struct address for call/obj"
;  V03 tmp3         [V03,T01] (  2,  4   )  struct (32) [rsp+0x28]   do-not-enreg[SB] "Inlining Arg"
;
; Lcl frame size = 136

G_M31928_IG01:
       4881EC88000000       sub      rsp, 136
       C5F877               vzeroupper 

G_M31928_IG02:
       488D4C2448           lea      rcx, bword ptr [rsp+48H]
       E8BCFFFFFF           call     Program:CreateLargeStruct():struct
       C5FA6F442448         vmovdqu  xmm0, qword ptr [rsp+48H]
       C5FA7F442428         vmovdqu  qword ptr [rsp+28H], xmm0
       C5FA6F442458         vmovdqu  xmm0, qword ptr [rsp+58H]
       C5FA7F442438         vmovdqu  qword ptr [rsp+38H], xmm0
       C5FA6F442428         vmovdqu  xmm0, qword ptr [rsp+28H]
       C5FA7F442468         vmovdqu  qword ptr [rsp+68H], xmm0
       C5FA6F442438         vmovdqu  xmm0, qword ptr [rsp+38H]
       C5FA7F442478         vmovdqu  qword ptr [rsp+78H], xmm0
       488B842480000000     mov      rax, qword ptr [rsp+80H]

G_M31928_IG03:
       4881C488000000       add      rsp, 136
       C3                   ret      

; Total bytes of code 84, prolog size 10 for method Program:InlinedCtor():long

Some of these copies could be skipped?

/cc @AndyAyersMS @mikedn @stephentoub @jkotas

category:cq theme:structs skill-level:expert cost:large

benaadams commented 5 years ago

You can Unsafe.As and in around some of these but it isn't ideal https://github.com/dotnet/coreclr/pull/22735#issue-254861650

mikedn commented 5 years ago

For something like var s = CreateLargeStruct(); the struct is returned by passing a hidden reference argument to CreateLargeStruct. That leaves local variable s address exposed and that usually kills all optimizations around it.

benaadams commented 5 years ago

Not using the same variable and passing directly produces same output

[MethodImpl(MethodImplOptions.NoInlining)]
static long InlinedAssignment()
{
    var s = GetLargeStruct(CreateLargeStruct());

    return s.l3;
}
; Assembly listing for method Program:InlinedAssignment():long
;
;  V00 OutArgs      [V00    ] (  1,  1   )  lclBlk (32) [rsp+0x00]   "OutgoingArgSpace"
;  V01 tmp1         [V01    ] (  2,  4   )  struct (32) [rsp+0x48]   do-not-enreg[XSB] addr-exposed "struct address for call/obj"
;* V02 tmp2         [V02    ] (  0,  0   )  struct (32) zero-ref    "struct address for call/obj"
;  V03 tmp3         [V03    ] (  2,  4   )  struct (32) [rsp+0x28]   do-not-enreg[XSB] addr-exposed "Inlining Arg"
;* V04 tmp4         [V04    ] (  0,  0   )    long  ->  zero-ref    V02.l0(offs=0x00) P-INDEP "field V02.l0 (fldOffset=0x0)"
;* V05 tmp5         [V05    ] (  0,  0   )    long  ->  zero-ref    V02.l1(offs=0x08) P-INDEP "field V02.l1 (fldOffset=0x8)"
;* V06 tmp6         [V06    ] (  0,  0   )    long  ->  zero-ref    V02.l2(offs=0x10) P-INDEP "field V02.l2 (fldOffset=0x10)"
;  V07 tmp7         [V07,T01] (  2,  2   )    long  ->  rax         V02.l3(offs=0x18) P-INDEP "field V02.l3 (fldOffset=0x18)"
;  V08 tmp8         [V08,T00] (  3,  6   )   byref  ->  rax         "BlockOp address local"
;
; Lcl frame size = 104

G_M60975_IG01:
       4883EC68             sub      rsp, 104
       C5F877               vzeroupper 

G_M60975_IG02:
       488D4C2448           lea      rcx, bword ptr [rsp+48H]
       E8BF67FFFF           call     Program:CreateLargeStruct():struct
       C5FA6F442448         vmovdqu  xmm0, qword ptr [rsp+48H]
       C5FA7F442428         vmovdqu  qword ptr [rsp+28H], xmm0
       C5FA6F442458         vmovdqu  xmm0, qword ptr [rsp+58H]
       C5FA7F442438         vmovdqu  qword ptr [rsp+38H], xmm0
       488D442428           lea      rax, bword ptr [rsp+28H]
       488B10               mov      rdx, qword ptr [rax]
       488B4018             mov      rax, qword ptr [rax+24]

G_M60975_IG03:
       4883C468             add      rsp, 104
       C3                   ret      

; Total bytes of code 58, prolog size 7 for method Program:InlinedAssignment():long

Though the ValueTaskAwaiter matches the InlinedCtor more

AndyAyersMS commented 5 years ago

There are some correctness concerns related to exception semantics and self-modification that require staging some struct operations through temps. The copies (and temps) can sometimes be optimized away later after global (method-wide) analysis. The jit does not do this analysis yet, but hopefully will soon -- see the plans for first-class structs.

cc @CarolEidt

mikedn commented 5 years ago

Not using the same variable and passing directly produces same output

That code is practically the same as the original code. There is still a temporary variable, even if you don't declare one in the C# code.

The address of that temp variable is passed to CreateLargeStruct and that makes it address exposed (well, I actually assume that, I see that it is address exposed in the dump but I didn't check the reason).

Then the ABI semantics require a copy of that temp variable to be passed to GetLargeStruct. Inlining can't change semantics, the copy is still needed. Later the optimizer could see that perhaps the copy isn't really needed. But it can't because that temporary variable is address exposed.

And it looks like the copy of temp variable is too address exposed. Not sure why, looks like bad handling of promoted struct in fgMorphCopyBlock, I think I've seen that before.

I think the big question here is if the a local variable that's used as return buffer actually needs to be address exposed.

mikedn commented 5 years ago

Here's an IR dump:

LocalAddressVisitor visiting statement:
               [000006] ------------              *  STMT      void  (IL 0x000...0x00F)
               [000001] S-C-G-------              \--*  CALL      void   Program.CreateLargeStruct
               [000004] ------------ arg0            \--*  ADDR      byref 
               [000003] ------------                    \--*  LCL_VAR   struct V01 tmp1         

Local V01 should not be enregistered because: it is address exposed

LocalAddressVisitor visiting statement:
               [000030] ------------              *  STMT      void  (IL   ???...  ???)
               [000009] n-----------              |  /--*  OBJ(32)   struct
               [000008] ------------              |  |  \--*  ADDR      byref 
               [000007] ------------              |  |     \--*  LCL_VAR   struct(AX) V01 tmp1         
               [000029] -A----------              \--*  ASG       struct (copy)
               [000027] D-----------                 \--*  LCL_VAR   struct V03 tmp3         

LocalAddressVisitor visiting statement:
               [000016] ------------              *  STMT      void  (IL   ???...  ???)
               [000023] ------------              |  /--*  LCL_VAR   struct V03 tmp3         
               [000026] -A----------              \--*  ASG       struct (copy)
               [000025] D-----------                 \--*  LCL_VAR   struct(P) V02 tmp2         
                                                     \--*    long   V02.l0 (offs=0x00) -> V04 tmp4         
                                                     \--*    long   V02.l1 (offs=0x08) -> V05 tmp5         
                                                     \--*    long   V02.l2 (offs=0x10) -> V06 tmp6         
                                                     \--*    long   V02.l3 (offs=0x18) -> V07 tmp7         

LocalAddressVisitor visiting statement:
               [000021] ------------              *  STMT      void  (IL   ???...  ???)
               [000020] ------------              \--*  RETURN    long  
               [000019] ------------                 \--*  FIELD     long   l3
               [000018] ------------                    \--*  ADDR      byref 
               [000017] ------------                       \--*  LCL_VAR   struct(P) V02 tmp2         
                                                           \--*    long   V02.l0 (offs=0x00) -> V04 tmp4         
                                                           \--*    long   V02.l1 (offs=0x08) -> V05 tmp5         
                                                           \--*    long   V02.l2 (offs=0x10) -> V06 tmp6         
                                                           \--*    long   V02.l3 (offs=0x18) -> V07 tmp7         
Replacing the field in promoted struct with local var V07
LocalAddressVisitor modified statement:
               [000021] ------------              *  STMT      void  (IL   ???...  ???)
               [000020] ------------              \--*  RETURN    long  
               [000019] ------------                 \--*  LCL_VAR   long   V07 tmp7         

LocalAddressVisitor marks V01 address exposed because its address is passed to CreateLargeStruct. The copy you see in assembly code comes from the second statement. At this point V03 is not yet address exposed, that happens later when the following statement is morphed. From there that copy is doomed to remain there forever, nothing can't touch it.

mikedn commented 5 years ago

Yeah, I have a change that prevents V03 from becoming address exposed but that has little impact:

       488D4C2448           lea      rcx, bword ptr [rsp+48H]
       E83F68FFFF           call     Program:CreateLargeStruct():struct
       C5FA6F442448         vmovdqu  xmm0, qword ptr [rsp+48H]
       C5FA7F442428         vmovdqu  qword ptr [rsp+28H], xmm0
       C5FA6F442458         vmovdqu  xmm0, qword ptr [rsp+58H]
       C5FA7F442438         vmovdqu  qword ptr [rsp+38H], xmm0
       488B442440           mov      rax, qword ptr [rsp+40H]

Basically it eliminates the useless lea and mov rdx, at the end...

AndyAyersMS commented 5 years ago

I recall thinking local assertion prop in morph could eliminate some of these struct copy chains (which is what lead me to that recent fix for DefinesLocal) as it seems to be willing to do struct copy prop.

mikedn commented 5 years ago

AFAIK assertionprop does not touch address exposed lclvars. For example: https://github.com/dotnet/coreclr/blob/2a77dc515bccf78308232384522017e5a3bdc3f0/src/jit/assertionprop.cpp#L1001-L1005

AndyAyersMS commented 5 years ago

I was looking at a similar example when I ran across dotnet/runtime#12086.

using System;

struct S
{
    public string s;
    public int a;
    public int b;
    public int c;
    public int d;

    public S(string x)
    {
        s = x;
        a = 0;
        b = 0;
        c = 0;
        d = 0;
    }
}

class C
{
    static S F(S s) => s;

    static void P(S s) => Console.WriteLine(s.s);

    public static void Main()
    {
        P(F(new S("hello, world!")));
    }
}

We end up pre-morph with a chain of 3 struct copies:

***** BB01, stmt 7
[000070] ------------              *  STMT      void  (IL 0x00A...  ???)
[000014] n-----------              |  /--*  OBJ(24)   struct
[000013] ------------              |  |  \--*  ADDR      byref 
[000011] ------------              |  |     \--*  LCL_VAR   struct V01 tmp1         
[000069] -A----------              \--*  ASG       struct (copy)
[000067] D-----------                 \--*  LCL_VAR   struct V03 tmp3         

***** BB01, stmt 8
[000022] ------------              *  STMT      void  (IL   ???...  ???)
[000063] ------------              |  /--*  LCL_VAR   struct V03 tmp3         
[000066] -A----------              \--*  ASG       struct (copy)
[000065] D-----------                 \--*  LCL_VAR   struct V02 tmp2         

***** BB01, stmt 9
[000082] ------------              *  STMT      void  (IL   ???...  ???)
[000025] n-----------              |  /--*  OBJ(24)   struct
[000024] ------------              |  |  \--*  ADDR      byref 
[000023] ------------              |  |     \--*  LCL_VAR   struct V02 tmp2         
[000081] -A----------              \--*  ASG       struct (copy)
[000079] D-----------                 \--*  LCL_VAR   struct V04 tmp4         

and local prop knows they are all equivalent:

In BB01 New Local Copy     Assertion: V03 == V01 index=#01, mask=0000000000000001
In BB01 New Local Copy     Assertion: V02 == V01 index=#02, mask=0000000000000002
In BB01 New Local Copy     Assertion: V04 == V02 index=#03, mask=0000000000000004

but for some reason does not rewrite the last assignment, so we end up with

[000070] ------------              *  STMT      void  (IL 0x00A...  ???)
[000011] -------N----              |  /--*  LCL_VAR   struct V01 tmp1         
[000069] -A------R---              \--*  ASG       struct (copy)
[000067] D------N----                 \--*  LCL_VAR   struct V03 tmp3         

***** BB01, stmt 8
[000022] ------------              *  STMT      void  (IL   ???...  ???)
[000063] ------------              |  /--*  LCL_VAR   struct V01 tmp1         
[000066] -A------R---              \--*  ASG       struct (copy)
[000065] D------N----                 \--*  LCL_VAR   struct V02 tmp2         

***** BB01, stmt 9
[000082] ------------              *  STMT      void  (IL   ???...  ???)
[000023] -------N----              |  /--*  LCL_VAR   struct V02 tmp2         
[000081] -A------R---              \--*  ASG       struct (copy)
[000079] D------N----                 \--*  LCL_VAR   struct V04 tmp4         

***** BB01, stmt 10
[000078] ------------              *  STMT      void  (IL   ???...  ???)
[000076] --C-G-------              \--*  CALL      void   System.Console.WriteLine
[000075] ------------ arg0            \--*  FIELD     ref    s
[000074] ------------                    \--*  ADDR      byref 
[000073] ------------                       \--*  LCL_VAR   struct V04 tmp4         

***** BB01, stmt 11
[000029] ------------              *  STMT      void  (IL 0x014...  ???)
[000028] ------------              \--*  RETURN    void  

Nothing here is address exposed but some structs are address taken.

The first copy is now dead and gets removed, but not the second one, for some reason. We should be able to swap tmp1 in for tmp2 in [000023].

And we should be able to swap tmp1 (or even tmp2) in for tmp4 in at [000073] but don't... that would actually eliminate all the copies in this example.

I opened dotnet/runtime#12098 as part of this, but never really got to the bottom of things.

AndyAyersMS commented 5 years ago

Ah, it is coming back to me now -- by the time optAssertionProp_LclVar gets a crack at [000023] it is marked as do not cse. So we'd need to catch it higher up in the tree, but optAssertionProp_Ind, when handed [00025] doesn't do anything.

AndyAyersMS commented 5 years ago

Have a prototype that shows some promise on my example, but not Ben's (yet): GeneralizeIndAssertionProp.

This completely flattens my test case; all struct ops are removed and we just print the string:

; Assembly listing for method C:Main()
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; Final local variable assignments
;
;  V00 OutArgs      [V00    ] (  1,  1   )  lclBlk (32) [rsp+0x00]   "OutgoingArgSpace"
;* V01 tmp1         [V01,T00] (  0,  0   )  struct (24) zero-ref    do-not-enreg[SFB] "NewObj constructor temp"
;* V02 tmp2         [V02    ] (  0,  0   )  struct (24) zero-ref    do-not-enreg[SB] "struct address for call/obj"
;* V03 tmp3         [V03    ] (  0,  0   )  struct (24) zero-ref    do-not-enreg[SB] "Inlining Arg"
;* V04 tmp4         [V04    ] (  0,  0   )  struct (24) zero-ref    do-not-enreg[SB] "Inlining Arg"
;  V05 cse0         [V05,T01] (  2,  2   )     ref  ->  rcx         "ValNumCSE"
;
; Lcl frame size = 40

G_M31058_IG01:
       sub      rsp, 40
       nop      

G_M31058_IG02:
       mov      rcx, 0xD1FFAB1E
       mov      rcx, gword ptr [rcx]
       call     Console:WriteLine(ref)
       nop      

G_M31058_IG03:
       add      rsp, 40
       ret      

more broadly it seems to have modest impact:

PMI Diffs for System.Private.CoreLib.dll, framework assemblies for x64 default jit
Summary:
(Lower is better)
Total bytes of diff: -23154 (-0.06% of base)
    diff is an improvement.
Top file regressions by size (bytes):
          13 : System.IO.Pipes.dasm (0.03% of base)
           9 : System.Net.WebSockets.dasm (0.02% of base)
Top file improvements by size (bytes):
       -6780 : Microsoft.CodeAnalysis.VisualBasic.dasm (-0.12% of base)
       -6438 : Microsoft.Diagnostics.Tracing.TraceEvent.dasm (-0.22% of base)
       -4908 : Microsoft.CodeAnalysis.CSharp.dasm (-0.11% of base)
       -1059 : Microsoft.CodeAnalysis.dasm (-0.07% of base)
        -682 : System.Private.Xml.dasm (-0.02% of base)
24 total files with size differences (22 improved, 2 regressed), 105 unchanged.
Top method regressions by size (bytes):
         148 ( 3.34% of base) : System.Security.Cryptography.Algorithms.dasm - KeyFormatHelper:ReadEncryptedPkcs8(ref,struct,struct,ref,byref,byref) (20 methods)
          76 ( 6.70% of base) : Microsoft.CodeAnalysis.VisualBasic.dasm - SourceMemberContainerTypeSymbol:AddEventAndAccessors(ref,ref,ref):this
          72 ( 8.85% of base) : Microsoft.CodeAnalysis.VisualBasic.dasm - SourceMemberContainerTypeSymbol:AddPropertyAndAccessors(ref,ref,ref):this
          64 ( 4.06% of base) : System.Private.CoreLib.dasm - ReadOnlyMemory`1:Equals(ref):bool:this (5 methods)
          49 ( 3.09% of base) : Microsoft.CodeAnalysis.dasm - ErrorLogger:GetIssueValue(struct):ref:this
Top method improvements by size (bytes):
        -520 (-17.34% of base) : Microsoft.CodeAnalysis.CSharp.dasm - ImportChain:TranslateImports(ref,ref):struct:this
        -508 (-2.91% of base) : Microsoft.CodeAnalysis.VisualBasic.dasm - Binder:ReportOverloadResolutionFailureForASingleCandidate(ref,ref,int,byref,struct,struct,bool,bool,bool,bool,ref,ref,bool,ref,ref):this
        -460 (-17.32% of base) : Microsoft.CodeAnalysis.VisualBasic.dasm - NamespaceScopeBuilder:BuildNamespaceScope(ref,ref,ref,struct,ref):struct
        -383 (-2.53% of base) : System.Net.Http.dasm - <SendAsyncCore>d__53:MoveNext():this
        -360 (-3.12% of base) : System.Security.Cryptography.X509Certificates.dasm - AsnWriter:WriteGeneralizedTimeCore(struct,struct,bool):this
Top method regressions by size (percentage):
          36 (14.88% of base) : Microsoft.CSharp.dasm - TypeTable:LookupAggregate(ref,ref,ref):ref
          72 ( 8.85% of base) : Microsoft.CodeAnalysis.VisualBasic.dasm - SourceMemberContainerTypeSymbol:AddPropertyAndAccessors(ref,ref,ref):this
          14 ( 8.81% of base) : Microsoft.CodeAnalysis.CSharp.dasm - StackOptimizerPass1:PushEvalStack(ref,int):this
          14 ( 8.81% of base) : Microsoft.CodeAnalysis.VisualBasic.dasm - Analyzer:PushEvalStack(ref,int):this
          76 ( 6.70% of base) : Microsoft.CodeAnalysis.VisualBasic.dasm - SourceMemberContainerTypeSymbol:AddEventAndAccessors(ref,ref,ref):this
Top method improvements by size (percentage):
         -78 (-30.23% of base) : System.Data.Common.dasm - SqlString:NotEquals(struct,struct):struct
        -215 (-26.84% of base) : Microsoft.CodeAnalysis.VisualBasic.dasm - SyntaxFactory:Token(struct,ushort,struct,ref):struct (2 methods)
        -215 (-24.66% of base) : Microsoft.CodeAnalysis.VisualBasic.dasm - SyntaxFactory:Identifier(struct,ref,struct):struct (2 methods)
         -68 (-24.03% of base) : Microsoft.CodeAnalysis.dasm - DebugSourceDocument:.ctor(ref,struct,struct,struct):this
        -212 (-22.46% of base) : Microsoft.CodeAnalysis.VisualBasic.dasm - SyntaxFactory:Identifier(struct,ref,bool,ref,int,struct):struct (2 methods)
577 total methods with size differences (546 improved, 31 regressed), 192836 unchanged.

Perhaps something like this in conjunction with less liberal use of address exposure might be interesting.

Also looks like SqlString:NotEquals(struct,struct):struct might be an interesting case study.

kunalspathak commented 3 years ago

Current code generated for windows/x64:

; InlinedAssignment():long

G_M927_IG01:
       sub      rsp, 104
       vzeroupper
                                                ;; bbWeight=1    PerfScore 1.25
G_M927_IG02:
       lea      rcx, [rsp+48H]
       call     MiniBench.Issue_12219:CreateLargeStruct():MiniBench.LargeStruct
       vmovdqu  xmm0, xmmword ptr [rsp+48H]
       vmovdqu  xmmword ptr [rsp+28H], xmm0
       vmovdqu  xmm0, xmmword ptr [rsp+58H]
       vmovdqu  xmmword ptr [rsp+38H], xmm0
       mov      rax, qword ptr [rsp+40H]
                                                ;; bbWeight=1    PerfScore 10.50
G_M927_IG03:
       add      rsp, 104
       ret
                                                ;; bbWeight=1    PerfScore 1.25
; InlinedCtor():long

G_M47270_IG01:
       sub      rsp, 136
       vzeroupper
                                                ;; bbWeight=1    PerfScore 1.25
G_M47270_IG02:
       lea      rcx, [rsp+48H]
       call     MiniBench.Issue_12219:CreateLargeStruct():MiniBench.LargeStruct
       vmovdqu  xmm0, xmmword ptr [rsp+48H]
       vmovdqu  xmmword ptr [rsp+28H], xmm0
       vmovdqu  xmm0, xmmword ptr [rsp+58H]
       vmovdqu  xmmword ptr [rsp+38H], xmm0
       vmovdqu  xmm0, xmmword ptr [rsp+28H]
       vmovdqu  xmmword ptr [rsp+68H], xmm0
       vmovdqu  xmm0, xmmword ptr [rsp+38H]
       vmovdqu  xmmword ptr [rsp+78H], xmm0
       mov      rax, qword ptr [rsp+80H]
                                                ;; bbWeight=1    PerfScore 18.50
G_M47270_IG03:
       add      rsp, 136
       ret

Current code generated for windows/arm64:

; InlinedAssignment():long

G_M927_IG01:
            stp     fp, lr, [sp,#-80]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M927_IG02:
            add     x8, fp, #48 // [V01 tmp1]
            bl      MiniBench.Issue_12219:CreateLargeStruct():MiniBench.LargeStruct
            ldp     x0, x1, [fp,#48]    // [V01 tmp1]
            stp     x0, x1, [fp,#16]    // [V03 tmp3]
            ldp     x0, x1, [fp,#64]    // [V01 tmp1+0x10]
            stp     x0, x1, [fp,#32]    // [V03 tmp3+0x10]
            ldr     x0, [fp,#40]        // [V03 tmp3+0x18]
                                                ;; bbWeight=1    PerfScore 11.50
G_M927_IG03:
            ldp     fp, lr, [sp],#80
            ret     lr
; InlinedCtor():long

G_M47270_IG01:
            stp     fp, lr, [sp,#-112]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M47270_IG02:
            add     x8, fp, #48 // [V02 tmp2]
            bl      MiniBench.Issue_12219:CreateLargeStruct():MiniBench.LargeStruct
            ldp     x0, x1, [fp,#48]    // [V02 tmp2]
            stp     x0, x1, [fp,#16]    // [V03 tmp3]
            ldp     x0, x1, [fp,#64]    // [V02 tmp2+0x10]
            stp     x0, x1, [fp,#32]    // [V03 tmp3+0x10]
            ldp     x0, x1, [fp,#16]    // [V03 tmp3]
            stp     x0, x1, [fp,#80]    // [V01 tmp1]
            ldp     x0, x1, [fp,#32]    // [V03 tmp3+0x10]
            stp     x0, x1, [fp,#96]    // [V01 tmp1+0x10]
            ldr     x0, [fp,#104]       // [V01 tmp1+0x18]
                                                ;; bbWeight=1    PerfScore 19.50
G_M47270_IG03:
            ldp     fp, lr, [sp],#112
            ret     lr

Adding the in keyword improves the code.

tannergooding commented 3 years ago

Adding the in keyword improves the code.

Could you clarify if this is on Windows or Unix? The ABIs, particularly around SIMD types here isn't the same and so there are scenarios where in will likely hurt other perf or where passing by value is expected. So we'd still want to optimize the byvalue case (particularly given the framework design guideline recommendations around using in, which is basically "don't").

kunalspathak commented 3 years ago

This is on windows.

kunalspathak commented 2 years ago

Also looks like SqlString:NotEquals(struct,struct):struct might be an interesting case study.

Here is a repro:

    [MethodImpl(MethodImplOptions.NoInlining)]
    public static bool Compare(SqlString s1, SqlString s2)
    {
        return MyCompare(s1, s2);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool MyCompare(SqlString s1, SqlString s2)
    {
        return MyCompare2(s1, s2);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool MyCompare2(SqlString s1, SqlString s2)
    {
        return s1.IsNull == s2.IsNull;
    }
; Assembly listing for method C:Compare(System.Data.SqlTypes.SqlString,System.Data.SqlTypes.SqlString):bool
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; No PGO data
; 0 inlinees with PGO data; 4 single block inlinees; 0 inlinees without PGO data
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  3,  6   )   byref  ->  rcx         single-def
;  V01 arg1         [V01,T01] (  3,  6   )   byref  ->  rdx         single-def
;# V02 OutArgs      [V02    ] (  1,  1   )  lclBlk ( 0) [rsp+00H]   "OutgoingArgSpace"
;  V03 tmp1         [V03,T02] (  2,  4   )  struct (32) [rsp+68H]   do-not-enreg[S] single-def "Inlining Arg"
;  V04 tmp2         [V04,T03] (  2,  4   )  struct (32) [rsp+48H]   do-not-enreg[S] single-def "Inlining Arg"
;  V05 tmp3         [V05,T04] (  2,  4   )  struct (32) [rsp+28H]   do-not-enreg[SF] ld-addr-op single-def "Inlining Arg"
;  V06 tmp4         [V06,T05] (  2,  4   )  struct (32) [rsp+08H]   do-not-enreg[SF] ld-addr-op single-def "Inlining Arg"
;  V07 tmp5         [V07,T06] (  2,  4   )     int  ->  rax         "impAppendStmt"
;
; Lcl frame size = 136

G_M49149_IG01:
       sub      rsp, 136
       vzeroupper 
                        ;; bbWeight=1    PerfScore 1.25
G_M49149_IG02:
       vmovdqu  ymm0, ymmword ptr[rcx]
       vmovdqu  ymmword ptr[rsp+68H], ymm0
                        ;; bbWeight=1    PerfScore 6.00
G_M49149_IG03:
       vmovdqu  ymm0, ymmword ptr[rdx]
       vmovdqu  ymmword ptr[rsp+48H], ymm0
                        ;; bbWeight=1    PerfScore 6.00
G_M49149_IG04:
       vmovdqu  ymm0, ymmword ptr[rsp+68H]
       vmovdqu  ymmword ptr[rsp+28H], ymm0
                        ;; bbWeight=1    PerfScore 5.00
G_M49149_IG05:
       vmovdqu  ymm0, ymmword ptr[rsp+48H]
       vmovdqu  ymmword ptr[rsp+08H], ymm0
                        ;; bbWeight=1    PerfScore 5.00
G_M49149_IG06:
       cmp      byte  ptr [rsp+40H], 0
       sete     al
       movzx    rax, al
       cmp      byte  ptr [rsp+20H], 0
       sete     dl
       movzx    rdx, dl
       cmp      eax, edx
       sete     al
       movzx    rax, al
                        ;; bbWeight=1    PerfScore 8.00
G_M49149_IG07:
       vzeroupper 
       add      rsp, 136
       ret    
jakobbotsch commented 1 year ago

I believe this was fixed by #64130. Codegen on main is:

; Method Program:InlinedAssignment():long
G_M53179_IG01:
       sub      rsp, 72
                        ;; size=4 bbWeight=1    PerfScore 0.25

G_M53179_IG02:
       lea      rcx, [rsp+28H]
       call     [Program:CreateLargeStruct():LargeStruct]
       mov      rax, qword ptr [rsp+40H]
                        ;; size=16 bbWeight=1    PerfScore 4.50

G_M53179_IG03:
       add      rsp, 72
       ret      
                        ;; size=5 bbWeight=1    PerfScore 1.25
; Total bytes of code: 25
; Method Program:InlinedCtor():long
G_M38338_IG01:
       sub      rsp, 72
                        ;; size=4 bbWeight=1    PerfScore 0.25

G_M38338_IG02:
       lea      rcx, [rsp+28H]
       call     [Program:CreateLargeStruct():LargeStruct]
       mov      rax, qword ptr [rsp+40H]
                        ;; size=16 bbWeight=1    PerfScore 4.50

G_M38338_IG03:
       add      rsp, 72
       ret      
                        ;; size=5 bbWeight=1    PerfScore 1.25
; Total bytes of code: 25