dotnet / runtime

.NET is a cross-platform runtime for cloud, mobile, desktop, and IoT apps.
https://docs.microsoft.com/dotnet/core/
MIT License
15.45k stars 4.76k forks source link

Could the division codegen be improved on ARM64? #110212

Open neon-sunset opened 1 day ago

neon-sunset commented 1 day ago

Description

Given simple program

var u = int.Parse(args[0]);
var r = Random.Shared.Next(0, 10_000);
var a = new int[10_000];

for (var i = 0; i < 10_000; i++) {
    for (var j = 0; j < 100_000; j++) {
        a[i] += j % u;
    }
    a[i] += r;
}

Console.WriteLine(a[r]);

It appears that the loop body on ARM64 under NAOT compiles to

LAB_1000b3be8                                   XREF[1]:     1000b3c48(j)  
    mov        w3,wzr
    mov        w4,w1
LAB_1000b3bf0                                   XREF[1]:     1000b3c24(j)  
    ldr        w5,[x0, x4, LSL #0x2]
    cmp        w19,#0x0
    b.eq       LAB_1000b3c98 ;; <-- check div by zero
    cmn        w19,#0x1
    b.ne       LAB_1000b3c0c
    cmp        w3,#0x1
    b.vs       LAB_1000b3ca0 ;; <-- check overflow
LAB_1000b3c0c                                   XREF[1]:     1000b3c00(j)  
    sdiv       w6,w3,w19
    msub       w6,w6,w19,w3
    add        w5,w5,w6
    str        w5,[x0, x4, LSL #0x2]
    add        w3,w3,#0x1
    cmp        w3,w2
    b.lt       LAB_1000b3bf0
    lsl        x3,x4,#0x2
    add        x3,x0,x3
    ldr        w4,[x3]
    add        w4,w4,w20
    str        w4,[x3]
    add        w1,w1,#0x1
    mov        w3,#0x2710
    cmp        w1,w3
    b.lt       LAB_1000b3be8

Meanwhile the loop body of equivalent code in C is compiled by Clang -O3 to the following instead:

#include "stdio.h"
#include "stdlib.h"
#include "stdint.h"

int main(int argc, char **argv) {
  int u = atoi(argv[1]);
  int r = rand() % 10000;
  int32_t a[10000] = {0};
  for (int i = 0; i < 10000; i++) {
    for (int j = 0; j < 100000; j++) {
      a[i] = a[i] + j % u;
    }
    a[i] += r;
  }
  printf("%d\n", a[r]);
}
LAB_100003ec8                                   XREF[1]:     100003f10(j)  
    mov        w11,#0x0
    mov        w10,#0x0
    ldr        w12,[x22, x8, LSL #0x2]=>local_9c88
LAB_100003ed4                                   XREF[1]:     100003ef8(j)  
    sdiv       w13,w11,w19
    msub       w13,w13,w19,w11
    add        w14,w11,#0x1
    sdiv       w15,w14,w19
    msub       w14,w15,w19,w14
    add        w12,w13,w12
    add        w10,w14,w10
    add        w11,w11,#0x2
    cmp        w11,w9
    b.ne       LAB_100003ed4
    add        w10,w10,w12
    add        w10,w10,w20
    str        w10,[x22, x8, LSL #0x2]=>local_9c88
    add        x8,x8,#0x1
    cmp        x8,x21
    b.ne       LAB_100003ec8

Note - on x86_64 .NET does not emit any guards:

LAB_1000d130c                                   XREF[1]:     1000d1349(j)  
    XOR        j,j
    MOV        R8D,i
    NOP        dword ptr [RAX]
    NOP        dword ptr [RAX + RAX*0x1]
LAB_1000d1320                                   XREF[1]:     1000d1337(j)  
    MOV        EAX,j
    CDQ
    IDIV       u
    ADD        EDX,dword ptr [RDI + R8*0x4 + args+0x10]
    MOV        dword ptr [RDI + R8*0x4 + args+0x10],EDX
    INC        j
    CMP        j,0x186a0
    JL         LAB_1000d1320
    LEA        RAX,[RDI + R8*0x4 + args+0x10]
    ADD        dword ptr [RAX],r
    INC        i
    CMP        i,0x2710
    JL         LAB_1000d130c

Question - could this be improved on ARM64? Particularly around the fact that the range of j is known. Thanks!

Configuration

.NET SDK:
 Version:           9.0.100
 Commit:            59db016f11
 Workload version:  9.0.100-manifests.3068a692
 MSBuild version:   17.12.7+5b8665660

Runtime Environment:
 OS Name:     Mac OS X
 OS Version:  15.1
 OS Platform: Darwin
 RID:         osx-arm64
 Base Path:   /usr/local/share/dotnet/sdk/9.0.100/

Regression?

Likely not

dotnet-policy-service[bot] commented 1 day ago

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch See info in area-owners.md if you want to be subscribed.

EgorBo commented 1 day ago

The range of u is not known and has to be always checked I presume? Although, it can be hoisted from the loop. Overflow check can be, probably, fixed with help of IV cc @jakobbotsch

jakobbotsch commented 1 day ago

The range of u is not known and has to be always checked I presume? Although, it can be hoisted from the loop. Overflow check can be, probably, fixed with help of IV cc @jakobbotsch

Assertion prop would probably be better suited for something like this. It should be able to know that 0 <= j < 10000 inside the loop based on the comparisons made, and thus that the overflow is impossible. Or, I suppose, range check can do it as well...