Different register allocation in multiple additions

blackgeorge-boom commented 1 year ago

#include <stdio.h>

/* Returns x multiplied by itself. */
int mul(int x) { return x * x; }

/*
 * Returns mul(1) plus the sum of the six arguments.
 *
 * mul() is called before the arguments are summed, so all six argument
 * values are still needed after that call returns.
 * NOTE(review): presumably this call is the "callsite 0" named in the
 * stackmap warnings for add_6 -- confirm against the stackmap dump.
 */
int add_6(int arg1, int arg2, int arg3, int arg4, int arg5, int arg6)
{
    int res = mul(1);

    int arg_sum = arg1 + arg2 + arg3 + arg4 + arg5 + arg6;

    return res + arg_sum;
}

/* Prints the result of add_6(1, 2, 3, 4, 5, 6) followed by a newline. */
int main()
{
    printf("%d\n", add_6(1, 2, 3, 4, 5, 6));
    return 0;
}

make clean; make stackmaps-check -j10 OPT_LEVEL=-O1

WARNING: add_6: callsite 0, value locations 1/1 have different location type (3 vs. 1)
WARNING: add_6: callsite 0, value locations 1/1 have different location offset or  different constant (-40 vs. 0)
WARNING: add_6: callsite 0, value locations 2/2 have different location offset or  different constant (-36 vs. -40)
WARNING: add_6: callsite 0, value locations 3/3 have different location offset or  different constant (-32 vs. -36)
WARNING: add_6: callsite 0, value locations 4/4 have different location offset or  different constant (-28 vs. -32)
WARNING: add_6: callsite 0, value locations 5/5 have different location type (1 vs. 3)
WARNING: add_6: callsite 0, value locations 5/5 have different location offset or  different constant (0 vs. -28)
ERROR: stackmaps in 'main_aarch64_aligned.out' & 'main_x86_64_aligned.out' differ - different stack layout!
make: *** [../../common/common.mk:239: stackmaps-check] Error 1

AArch64


0000000000501040 add_6:
...
  501050:       mov w20, w0 <---
  501054:       mov w0, #0x1
  501058:       mov w19, w5 <---
...
  501064:       bl  #-0x44 <mul>
  501068:       mov w2, w19 <---
  50106c:       ldp w3, w19, [sp, #0x8] <---
  501070:       ldp w0, w1, [sp, #0x10]
  501074:       add w3, w3, w20 <---
  501078:       add w3, w3, w19 <---
...

X86

0000000000501040 <add_6>:
...
  501059:   mov    r15d,esi <---
  50105c:   mov    ebx,edi <---
  50105e:   mov    edi,0x1
  501063:   call   501020 <mul>
  501068:   mov    r8d,DWORD PTR [rbp-0x1c]
  50106c:   mov    esi,DWORD PTR [rbp-0x20]
  50106f:   mov    edi,DWORD PTR [rbp-0x24]
  501072:   mov    edx,DWORD PTR [rbp-0x28]
  501075:   lea    ecx,[r15+rbx*1] <---
...

blackgeorge-boom commented 1 year ago

AArch64

240B      %7:gpr32 = COPY killed $w8 <---
256B      %8:gpr32 = nsw ADDWrr %1:gpr32, %0:gpr32
272B      %9:gpr32 = nsw ADDWrr %8:gpr32, %2:gpr32
288B      %10:gpr32 = nsw ADDWrr %9:gpr32, %3:gpr32
304B      %11:gpr32 = nsw ADDWrr %10:gpr32, %4:gpr32
320B      %12:gpr32 = nsw ADDWrr %11:gpr32, %5:gpr32
336B      %13:gpr32 = nsw ADDWrr %12:gpr32, %7:gpr32 <---
352B      $w8 = COPY %13:gpr32

selectOrSplit GPR32:%7 [240r,336r:0)  0@240r weight:4.072580e-03 w=4.072580e-03
hints: $w8
assigning %7 to $w8: W8 [240r,336r:0)  0@240r

selectOrSplit GPR32:%13 [336r,352r:0)  0@336r weight:INF w=INF
hints: $w8
assigning %13 to $w8: W8 [336r,352r:0)  0@336r

X86

240B      %13:gr32 = COPY killed $eax <---
288B      %10:gr32 = LEA64_32r %14:gr64_with_sub_8bit, 1, %15:gr64_nosp, 0, $noreg
320B      %10:gr32 = nsw ADD32rr %10:gr32(tied-def 0), %2:gr32, implicit-def dead $eflags
352B      %10:gr32 = nsw ADD32rr %10:gr32(tied-def 0), %3:gr32, implicit-def dead $eflags
384B      %10:gr32 = nsw ADD32rr %10:gr32(tied-def 0), %4:gr32, implicit-def dead $eflags
416B      %10:gr32 = nsw ADD32rr %10:gr32(tied-def 0), %5:gr32, implicit-def dead $eflags
448B      %13:gr32 = nsw ADD32rr %13:gr32(tied-def 0), %10:gr32, implicit-def dead $eflags <---
464B      $eax = COPY %13:gr32

selectOrSplit GR32:%13 [240r,448r:0)[448r,464r:1)  0@240r 1@448r weight:6.474359e-03 w=6.474359e-03
hints: $eax
assigning %13 to $eax: AH [240r,448r:0)[448r,464r:1)  0@240r 1@448r AL [240r,448r:0)[448r,464r:1)  0@240r 1@448r HAX [240r,448r:0)[448r,464r:1)  0@240r 1@448r

selectOrSplit GR32:%10 [288r,320r:2)[320r,352r:0)[352r,384r:1)[384r,416r:3)[416r,448r:4)  0@320r 1@352r 2@288r 3@384r 4@416r weight:INF w=INF
assigning %10 to $ecx: CH [288r,320r:2)[320r,352r:0)[352r,384r:1)[384r,416r:3)[416r,448r:4)  0@320r 1@352r 2@288r 3@384r 4@416r CL [288r,320r:2)[320r,352r:0)[352r,384r:1)[384r,416r:3)[416r,448r:4)  0@320r 1@352r 2@288r 3@384r 4@416r HCX [288r,320r:2)[320r,352r:0)[352r,384r:1)[384r,416r:3)[416r,448r:4)  0@320r 1@352r 2@288r 3@384r 4@416r

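On AArch64, %7 and %13 can be assigned the same register ($w8) because their live ranges do not overlap ([240r,336r) vs. [336r,352r)), whereas on X86 %13 and %10 cannot share a register because their live ranges interfere (%13 is live over [240r,464r) while %10 is live over [288r,448r)), so %10 ends up in $ecx.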

blackgeorge-boom commented 1 year ago

To make the two allocations match, we probably need to impose the two-address instruction format on AArch64 as well (the performance impact of doing so still needs to be benchmarked).

systems-nuts / unifico

Different register allocation in multiple additions #283