JIT doesn't properly optimize assignment to stackalloced span elements when used as 'out' parameters

GrabYourPitchforks commented 4 years ago

The JIT seems to include a bunch of unnecessary ceremony when using the pattern Span<T> = stackalloc T[const]; and assigning out parameters from functions to those elements. The codegen is especially noisy when using ValueTuple as a return type and deconstructing the values directly into a span.

See the examples below, with some assembly-level annotations inline.

using System;
using System.Runtime.CompilerServices;

[module: System.Runtime.CompilerServices.SkipLocalsInit]

public class C {
    // Deconstruct a ValueTuple's values into the span
    public static void WithDecompose(uint i) {
        Span<Char> chars = stackalloc Char[2];
        (chars[0], chars[1]) = Foo(i);
    }

    // Assign 'out' values to the span
    public static void WithOuts(uint i) {
        Span<Char> chars = stackalloc Char[2];
        Bar(i, out chars[0], out chars[1]);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    static (char a, char b) Foo(uint i)
    {
        return ((char)i, (char)(i >> 16));
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    static void Bar(uint i, out char a, out char b)
    {
        a = (char)i;
        b = (char)(i >> 16);

    }
}

; C.WithDecompose(UInt32)
    L0000: push esi
    L0001: sub esp, 0x10
    L0004: mov dword ptr [esp+0xc], 0xa3ec260e
    L000c: lea eax, [esp]
    L000f: mov edx, 2
    L0014: cmp edx, 0
    L0017: jbe short L0067  ; this branch is guaranteed never to be taken
    L0019: lea edx, [eax+2]
    L001c: mov word ptr [esp+4], 0  ; looks like zero-initing, even with [SkipLocalsInit]?
    L0023: mov word ptr [esp+6], 0
    L002a: movzx esi, cx
    L002d: mov [esp+4], si  ; L002a can be deleted, this can be mov word ptr [esp + 4], cx
    L0032: shr ecx, 0x10
    L0035: movzx ecx, cx
    L0038: mov [esp+6], cx  ; L0035 can be deleted, this can be mov word ptr [esp + 6], cx
    L003d: mov ecx, [esp+4]  ; looks like some register shuffling going on?
    L0041: mov [esp+8], ecx
    L0045: mov ecx, [esp+8]
    L0049: mov [eax], cx
    L004c: mov eax, [esp+0xa]  ; JIT generated an unaligned 32-bit read? (assuming esp is 4-byte aligned)
    L0050: mov [edx], ax
    L0053: cmp dword ptr [esp+0xc], 0xa3ec260e
    L005b: je short L0062
    L005d: call 0x61cb2530
    L0062: add esp, 0x10
    L0065: pop esi
    L0066: ret
    L0067: call 0x61cb16d0
    L006c: int3

; C.WithOuts(UInt32)
    L0000: sub esp, 8
    L0003: mov dword ptr [esp+4], 0xa3ec260e
    L000b: lea eax, [esp]
    L000e: mov edx, 2
    L0013: cmp edx, 0
    L0016: jbe short L0037  ; this branch is guaranteed never to be taken
    L0018: lea edx, [eax+2]  ; nit: this could be folded into the mov instruction at L0021
    L001b: mov [eax], cx
    L001e: shr ecx, 0x10
    L0021: mov [edx], cx
    L0024: cmp dword ptr [esp+4], 0xa3ec260e
    L002c: je short L0033
    L002e: call 0x61cb2530
    L0033: add esp, 8
    L0036: ret
    L0037: call 0x61cb16d0
    L003c: int3

The unaligned read at L004c is particularly suspicious, so I wonder if that's sharplab incorrectly reporting a movzx instruction as a mov instruction.

category:cq theme:stack-allocation skill-level:expert cost:large

AndyAyersMS commented 4 years ago

Taking a look.

EgorBo commented 4 years ago

btw, I'd expect

Span<byte> span = stackalloc byte[8];

and

long l = 0;
Span<byte> span = new Span<byte>((byte*)&l, 8);

to have the same codegen (at least on x86_64) but currently the latter is better.

or int for stackalloc Char[2] as in the description.

AndyAyersMS commented 4 years ago

Still looking, but a few early notes:

Jit should perhaps handle IsReferenceOrContainsReferences as an intrinsic (it is marked as such since 60ec460ee but is not recognized by the jit). As things stand now we end up inlining this and realizing it returns a constant, but we don't prune the flow graph base on this until after we're done importing and inlining. So we do some extra work that we then throw away, potentially a lot of work if that check is guarding a large swath of code.
After morph we have a degenerate bounds check, oddly nothing optimizes this away:
```
fgMorphTree BB01, STMT00003 (after)
```

[000029] -A-X-+------ ASG byref [000028] D----+-N---- +-- LCL_VAR byref V02 loc1
[000027] ---X-+------ -- COMMA byref [000022] ---X-+------ +-- ARR_BOUNDS_CHECK_Rng void
[000017] -----+------ | +-- CNS_INT int 0 [000021] -----+------ | -- CNS_INT int 2 [000025] -----+------ --* LCL_VAR byref V18 tmp14

Early prop has logic to optimize this sort of thing but it's guarded by finding an actual array allocation; here the allocation is implicit.

3. I don't see the extra zero inits in latest main so maybe some of @erozenfeld's recent work has cleaned that up.
```asm
G_M36090_IG01:
       push     esi
       sub      esp, 16
       mov      dword ptr [esp+0CH], 0xD1FFAB1E
                        ;; bbWeight=1    PerfScore 2.25
G_M36090_IG02:
       lea      eax, bword ptr [esp]
       mov      edx, 2
       cmp      edx, 0
       jbe      SHORT G_M36090_IG04
       lea      edx, bword ptr [eax+2]
       movzx    esi, cx
       mov      word  ptr [esp+04H], si
       shr      ecx, 16
       movzx    ecx, cx
       mov      word  ptr [esp+06H], cx
       mov      ecx, dword ptr [esp+04H]
       mov      dword ptr [esp+08H], ecx
       mov      ecx, dword ptr [esp+08H]
       mov      word  ptr [eax], cx
       mov      eax, dword ptr [esp+0AH]
       mov      word  ptr [edx], ax
       cmp      dword ptr [esp+0CH], 0xD1FFAB1E
       je       SHORT G_M36090_IG03
       call     CORINFO_HELP_FAIL_FAST
                        ;; bbWeight=1    PerfScore 16.00
G_M36090_IG03:
       add      esp, 16
       pop      esi
       ret      
                        ;; bbWeight=1    PerfScore 1.75
G_M36090_IG04:
       call     CORINFO_HELP_RNGCHKFAIL
       int3     
                        ;; bbWeight=0    PerfScore 0.00

AndyAyersMS commented 4 years ago

@EgorBo the main difference I see in codegen for stackalloc and &local is that the former introduces a stack integrity check. Actual access to the storage seems comparable.

We have discussed considering span-wrapped stackalloc as implicitly safe but so far still consider it as an "unsafe buffer".

AndyAyersMS commented 4 years ago

Beyond the above the main issues are

the jit won't enregister small structs with fields
the jit doesn't have the ability to build a detailed model of what happens when a piece of localloc storage is cast to another type and then is consistently accessed as that type (we'd need some form of "memory SSA" or equivalent).

Because of these the jit is conservative and can't propagate values like you might hope.

Of the above the only one that might meet the bar for 5.0 is the bounds check.

Going to mark as future.

GrabYourPitchforks commented 4 years ago

@AndyAyersMS Thank you for the detailed first look! Would it make sense for me to file a separate issue re: the bounds check?

AndyAyersMS commented 4 years ago

Sure.

Note there are a few similar looking issues already, eg #10686, #32776, #1115, but would take some digging to tell if any of them are the same sort of issue.

SingleAccretion commented 3 years ago

The bounds checking part of this will be solved when (if) #49271 is merged.

The updated codegen with those changes (Windows x64):

; Method WithOuts(int)
G_M14538_IG01:
       sub      rsp, 24
       xor      eax, eax
       mov      qword ptr [rsp+08H], rax
       mov      qword ptr [rsp+10H], rax
       mov      rax, 0xD1FFAB1E
       mov      qword ptr [rsp+10H], rax
G_M14538_IG02:
       lea      rax, bword ptr [rsp+08H]
       lea      rdx, bword ptr [rax+2]
       mov      word  ptr [rax], cx
       shr      ecx, 16
       mov      word  ptr [rdx], cx
       mov      rcx, 0xD1FFAB1E
       cmp      qword ptr [rsp+10H], rcx
       je       SHORT G_M14538_IG03
       call     CORINFO_HELP_FAIL_FAST
G_M14538_IG03:
       nop      
G_M14538_IG04:
       add      rsp, 24
       ret

Zero-initting has come back, so we shouldn't close this just yet. It is coming from this place in cogencommon.cpp: https://github.com/dotnet/runtime/blob/877a8df6716be6a7f98e95d17007619502c43b71/src/coreclr/jit/codegencommon.cpp#L4728-L4734 It evaluates to true for both of the relevant locals:

;  V03 tmp1         [V03    ] (  1,  1   )     blk ( 8) [rsp+0x08]   do-not-enreg[X] must-init addr-exposed unsafe-buffer "stackallocLocal"
;  V13 GsCookie     [V13    ] (  1,  1   )    long  ->  [rsp+0x10]   do-not-enreg[X] must-init addr-exposed "GSSecurityCookie"

Both are not tracked and not "temps". It is unclear to me if that's by design or a bug, but at least the init for the cookie seems to be redundant.

Unrelated note on GSSecurity: us not modeling it explicitly in IR leads to calls to the helper not being merged - they will be present on every return.

jakobbotsch commented 5 months ago

Codegen today looks better and is almost the same for both (some very minor ordering differences).

Codegen for WithDecompose:

; Method C:WithDecompose(uint) (FullOpts)
G_M55791_IG01:  ;; offset=0x0000
       sub      rsp, 56
       mov      rax, 0x9ABCDEF012345678
       mov      qword ptr [rsp+0x30], rax
                        ;; size=19 bbWeight=1 PerfScore 1.50

G_M55791_IG02:  ;; offset=0x0013
       lea      rax, [rsp+0x28]
       lea      rdx, bword ptr [rax+0x02]
       movzx    r8, cx
       shr      ecx, 16
       movzx    rcx, cx
       mov      word  ptr [rax], r8w
       mov      word  ptr [rdx], cx
       mov      rcx, 0x9ABCDEF012345678
       cmp      qword ptr [rsp+0x30], rcx
       je       SHORT G_M55791_IG03
       call     CORINFO_HELP_FAIL_FAST
                        ;; size=48 bbWeight=1 PerfScore 8.25

G_M55791_IG03:  ;; offset=0x0043
       nop      
                        ;; size=1 bbWeight=1 PerfScore 0.25

G_M55791_IG04:  ;; offset=0x0044
       add      rsp, 56
       ret      
                        ;; size=5 bbWeight=1 PerfScore 1.25
; Total bytes of code: 73

Codegen for WithOuts:

; Method C:WithOuts(uint) (FullOpts)
G_M795_IG01:  ;; offset=0x0000
       sub      rsp, 56
       mov      rax, 0x9ABCDEF012345678
       mov      qword ptr [rsp+0x30], rax
                        ;; size=19 bbWeight=1 PerfScore 1.50

G_M795_IG02:  ;; offset=0x0013
       lea      rax, [rsp+0x28]
       lea      rdx, bword ptr [rax+0x02]
       mov      word  ptr [rax], cx
       shr      ecx, 16
       mov      word  ptr [rdx], cx
       mov      rcx, 0x9ABCDEF012345678
       cmp      qword ptr [rsp+0x30], rcx
       je       SHORT G_M795_IG03
       call     CORINFO_HELP_FAIL_FAST
                        ;; size=40 bbWeight=1 PerfScore 7.75

G_M795_IG03:  ;; offset=0x003B
       nop      
                        ;; size=1 bbWeight=1 PerfScore 0.25

G_M795_IG04:  ;; offset=0x003C
       add      rsp, 56
       ret      
                        ;; size=5 bbWeight=1 PerfScore 1.25
; Total bytes of code: 65

We still don't propagate some address modes that would be possible to propagate, however.

dotnet / runtime

JIT doesn't properly optimize assignment to stackalloced span elements when used as 'out' parameters #38628