dotnet / runtime


Auto-vectorization of pointer indirection expr. and explicit layout #64026

Open · am11 opened this issue 2 years ago

am11 commented 2 years ago

Consider two forms of writing the fabs() math function in C:

#include <inttypes.h>

double fabs_pointer(double v)
{
    uint64_t u = *(uint64_t*)&v & 0x7FFFFFFFFFFFFFFF;
    return *(double*)&u;
}

double fabs_union(double v)
{
    union { double f; uint64_t i; } u = { v };
    u.i &= -1ULL / 2;
    return u.f;
}

This code clears the sign bit of the double's raw bit representation and returns the modified value (the mask -1ULL / 2 evaluates to 0x7FFFFFFFFFFFFFFF, i.e. all bits set except the sign bit).

Clang with -O2 -march=ivybridge gives identical codegen for both forms:

.LCPI0_0:
        .quad   0x7fffffffffffffff              # double NaN
        .quad   0x7fffffffffffffff              # double NaN
fabs_pointer:                          # @fabs_pointer
        vandps  xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
        ret
.LCPI1_0:
        .quad   0x7fffffffffffffff              # double NaN
        .quad   0x7fffffffffffffff              # double NaN
fabs_union:                             # @fabs_union
        vandps  xmm0, xmm0, xmmword ptr [rip + .LCPI1_0]
        ret

We can write the same code in C# in four different forms:

using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

public class C
{
    public static double fabs(double v)
    {
        ulong u = BitConverter.DoubleToUInt64Bits(v);
        return BitConverter.UInt64BitsToDouble(u & 0x7FFFFFFFFFFFFFFF);
    }

    public static unsafe double fabs_pointer(double v)
    {
        ulong u = *(ulong*)&v & 0x7FFFFFFFFFFFFFFF;
        return *(double*)&u;
    }

    public static double fabs_manual_vectorization(double v)
    {
        return Sse2.And(
            Vector128.CreateScalarUnsafe(v),
            Vector128.Create(0x7FFFFFFFFFFFFFFF).AsDouble()).ToScalar();
    }

    [StructLayout(LayoutKind.Explicit)]
    public struct S
    {
        [FieldOffset(0)] public double d;
        [FieldOffset(0)] public ulong u;
    }

    public static double fabs_struct(double v)
    {
        var s = new S { d = v };
        s.u &= 0x7FFFFFFFFFFFFFFF;
        return s.d;
    }
}

but RyuJIT's codegen varies considerably:

; Core CLR 6.0.21.52210 on amd64

C..ctor()
    L0000: ret

C.fabs(Double)
    L0000: vzeroupper
    L0003: vmovq rax, xmm0
    L0008: mov rdx, 0x7fffffffffffffff
    L0012: and rax, rdx
    L0015: vmovq xmm0, rax
    L001a: ret

C.fabs_pointer(Double)
    L0000: vzeroupper
    L0003: vmovsd [rsp+8], xmm0
    L0009: mov rax, 0x7fffffffffffffff
    L0013: and rax, [rsp+8]
    L0018: vmovq xmm0, rax
    L001d: ret

C.fabs_manual_vectorization(Double)
    L0000: vzeroupper
    L0003: vandpd xmm0, xmm0, [0x7ffb39e304b0]
    L000b: ret

C.fabs_struct(Double)
    L0000: sub rsp, 0x18
    L0004: vzeroupper
    L0007: vmovsd [rsp+8], xmm0
    L000d: mov rax, [rsp+8]
    L0012: mov [rsp+0x10], rax
    L0017: lea rax, [rsp+0x10]
    L001c: mov rdx, 0x7fffffffffffffff
    L0026: and [rax], rdx
    L0029: vmovsd xmm0, [rsp+0x10]
    L002f: add rsp, 0x18
    L0033: ret

In an ideal world, pointer indirection and explicit layout would give the same codegen as the manually vectorized variant.

category:cq theme:vector-codegen skill-level:expert cost:small impact:small

ghost commented 2 years ago

Tagging subscribers to this area: @JulieLeeMSFT. See info in area-owners.md if you want to be subscribed.

EgorBo commented 2 years ago

I think we need a fix for https://github.com/dotnet/runtime/issues/11413 first

SingleAccretion commented 2 years ago

I would say "not really". The lea is from #51825; perhaps that'll be easy enough to fix. But fundamentally, we need two things for unions to become "optimal":

1. BITCAST in the frontend, aka #11413.
2. Morphing accesses to overlapping fields into those BITCASTs on top of other fields.

This will unblock promotion and enregistration (at least for one-field cases such as this one).
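
To illustrate the second point, here is a sketch (an editorial illustration, not from the comment) of what morphing the overlapping-field access would conceptually produce for the one-field union above: the read of S.u after the write of S.d becomes a bitcast of the double value, so the struct needs no stack home and can stay in registers.

using System;

public static class MorphSketch
{
    // Conceptual result of the proposed morph: the overlapping-field read
    // is rewritten as a bitcast (BitConverter.DoubleToUInt64Bits) of the
    // value stored in the other field, enabling promotion/enregistration.
    public static double fabs_struct_morphed(double v)
    {
        ulong u = BitConverter.DoubleToUInt64Bits(v); // BITCAST double -> ulong
        u &= 0x7FFFFFFFFFFFFFFF;                      // clear the sign bit
        return BitConverter.UInt64BitsToDouble(u);    // BITCAST ulong -> double
    }
}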

am11 commented 2 years ago

Found a minor inefficiency in the unsafe method (fabs_pointer) with daily build 7ab7f83fb1 (7.0.100-preview.6.22322.4).

Setup (NativeAOT):

$ dotnet7 publish --use-current-runtime -c Release -o dist \
    -p:PublishAot=true -p:AllowUnsafeBlocks=true -p:NativeLib=shared -p:IlcInstructionSet=avx2

$ gdb dist/nativelib1.so -batch \
    -ex "set disassembly-flavor intel" \
    -ex "disassemble nativelib1_C__fabs" \
    -ex "disassemble nativelib1_C__fabs_pointer" \
    -ex "disassemble nativelib1_C__fabs_manual_vectorization" \
    -ex "disassemble nativelib1_C__fabs_struct"

Disassembly:

Dump of assembler code for function nativelib1_C__fabs:
   0x0000000000217590 <+0>: vzeroupper 
   0x0000000000217593 <+3>: vmovq  rax,xmm0
   0x0000000000217598 <+8>: movabs rdi,0x7fffffffffffffff
   0x00000000002175a2 <+18>:    and    rax,rdi
   0x00000000002175a5 <+21>:    vmovq  xmm0,rax
   0x00000000002175aa <+26>:    ret    
End of assembler dump.
Dump of assembler code for function nativelib1_C__fabs_pointer:
   0x00000000002175b0 <+0>: push   rax
   0x00000000002175b1 <+1>: vzeroupper 
   0x00000000002175b4 <+4>: vmovsd QWORD PTR [rsp],xmm0
   0x00000000002175b9 <+9>: movabs rax,0x7fffffffffffffff
   0x00000000002175c3 <+19>:    and    rax,QWORD PTR [rsp]
   0x00000000002175c7 <+23>:    vmovq  xmm0,rax
   0x00000000002175cc <+28>:    add    rsp,0x8
   0x00000000002175d0 <+32>:    ret    
End of assembler dump.
Dump of assembler code for function nativelib1_C__fabs_manual_vectorization:
   0x00000000002175e0 <+0>: vzeroupper 
   0x00000000002175e3 <+3>: vandpd xmm0,xmm0,XMMWORD PTR [rip+0x10fd35]        # 0x327320
   0x00000000002175eb <+11>:    ret    
End of assembler dump.
Dump of assembler code for function nativelib1_C__fabs_struct:
   0x00000000002175f0 <+0>: sub    rsp,0x18
   0x00000000002175f4 <+4>: vzeroupper 
   0x00000000002175f7 <+7>: vmovsd QWORD PTR [rsp+0x8],xmm0
   0x00000000002175fd <+13>:    mov    rax,QWORD PTR [rsp+0x8]
   0x0000000000217602 <+18>:    mov    QWORD PTR [rsp+0x10],rax
   0x0000000000217607 <+23>:    lea    rax,[rsp+0x10]
   0x000000000021760c <+28>:    movabs rdi,0x7fffffffffffffff
   0x0000000000217616 <+38>:    and    QWORD PTR [rax],rdi
   0x0000000000217619 <+41>:    vmovsd xmm0,QWORD PTR [rsp+0x10]
   0x000000000021761f <+47>:    add    rsp,0x18
   0x0000000000217623 <+51>:    ret    
End of assembler dump.

The push and the matching add rsp,0x8 in fabs_pointer look redundant.

tannergooding commented 2 years ago

I'm not sure it's worth spending cycles optimizing a pattern which has a better alternative we can push users towards instead.

This is very much a case where I think we should push devs to using the built-in APIs, as these will, across the board, be optimized and handled better in a multitude of contexts. They also won't have other issues that might crop up from the local being "address taken" or similar.
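
For instance (an illustration; the comment did not spell out specific APIs), the built-in surface already covers these bit manipulations without unsafe code or address-taken locals:

using System;

public static class BuiltInAlternatives
{
    // Built-in APIs that avoid unsafe code and address-taken locals:
    public static double Abs(double v) => Math.Abs(v);                            // abs, no bit tricks
    public static ulong ToBits(double v) => BitConverter.DoubleToUInt64Bits(v);   // raw bits
    public static double FromBits(ulong u) => BitConverter.UInt64BitsToDouble(u); // round-trip
}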

am11 commented 2 years ago

I agree that C.fabs_struct() from the code sample above is probably not worth special handling (unless there are overarching benefits to handling single-field structs in a special way). The same goes for the pointer math (a direct translation of the C algorithm) in C.fabs_pointer(); it just seems to have regressed a little since January, which is what I was pointing out.

However, C.fabs() does, in fact, use BitConverter, yet its codegen is not optimal/vectorized. Perhaps forward substitution currently does not handle those disjoint statements well?

Splitting the C.fabs_manual_vectorization() implementation into multiple statements has been handled by forward substitution since https://github.com/dotnet/runtime/pull/70587; it produces the same code as nativelib1_C__fabs_manual_vectorization shown above with AVX, and similarly compact code with the SSE series (vandpd becomes andpd).
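
Concretely, a sketch (editorial) of the multi-statement shape that forward substitution now collapses:

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

public static class ForwardSubSketch
{
    // Split across temporaries; since dotnet/runtime#70587, forward
    // substitution folds these back into a single vector expression,
    // matching the one-instruction fabs_manual_vectorization codegen.
    public static double fabs_split(double v)
    {
        Vector128<double> vec = Vector128.CreateScalarUnsafe(v);
        Vector128<double> mask = Vector128.Create(0x7FFFFFFFFFFFFFFF).AsDouble();
        Vector128<double> result = Sse2.And(vec, mask);
        return result.ToScalar();
    }
}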

tannergooding commented 2 years ago

Devs should never need to write fabs in the first place, because Math.Abs already does "the right thing".

The optimal implementation, if you were to do it "manually", would be Vector128.AndNot(Vector128.CreateScalarUnsafe(value), Vector128.CreateScalarUnsafe(-0.0)).ToScalar(), which clears the sign bit, and that does the right thing.
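
Spelled out as a complete method (a sketch based on the expression above):

using System.Runtime.Intrinsics;

public static class ManualAbsSketch
{
    // AndNot computes left & ~right; since ~(-0.0) == 0x7FFFFFFFFFFFFFFF,
    // this clears exactly the sign bit.
    public static double fabs_andnot(double value) =>
        Vector128.AndNot(
            Vector128.CreateScalarUnsafe(value),
            Vector128.CreateScalarUnsafe(-0.0)).ToScalar();
}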

tannergooding commented 2 years ago

There is probably room to improve the handling around BitConverter.DoubleToUInt64Bits and BitConverter.UInt64BitsToDouble for constant operands, but even then it's something that is likely rare overall given the common use cases.

am11 commented 2 years ago

> Math.Abs

Indeed, "possible auto-vectorization opportunities" was the actual intent of this thread, fabs is used as an example because it is simpler to demonstrate the codegen differences in four ways of implementing the same algorithm. In general, it is about operating on raw/integral representation of double or float objects.

am11 commented 1 year ago

From PR #78741, EnumConverter.cs has constructs similar to the fabs_pointer case mentioned above:

// related issues:
//    https://github.com/dotnet/runtime/issues/10315
//    https://github.com/dotnet/runtime/issues/55357
static class C
{
   static unsafe ushort narrow(ulong v) => *(ushort*)&v;
   static unsafe ulong widen(ushort v) => *(ulong*)&v;
   static unsafe ushort unchanged(ushort v) => *(ushort*)&v;
}
- ; current codegen
+ ; proposed codegen

; https://godbolt.org/z/r1acfxao6

C.narrow(UInt64)
-    mov [rsp+0x10], rdx
-    movzx eax, word ptr [rsp+0x10]
+    movzx eax, dx
    ret

C.widen(UInt16)
-    mov [rsp+0x10], edx
-    mov rax, [rsp+0x10]
+    movzx rax, dx
    ret

C.unchanged(UInt16)
    movzx eax, cx
    ret
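
For comparison (an editorial addition, not from the PR), plain integral casts express the same conversions without unsafe code and should compile to the zero-extending moves proposed above:

static class SafeCasts
{
    // Safe equivalents; the JIT can emit a single movzx for each.
    static ushort narrow(ulong v) => (ushort)v;
    static ulong widen(ushort v) => v; // implicit zero-extending widening
}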