llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org

Failure to leverage existing memory location when spilling #39880

Open preames opened 5 years ago

preames commented 5 years ago
Bugzilla Link 40534
Version trunk
OS Linux
CC @hfinkel,@pogo59,@qcolombet,@serguei-katkov

Extended Description

Filing this under register allocator, but I'm really not sure that's the appropriate place to fix it. The basic problem - illustrated by the test below - is that we can have a value live in a loop which changes on every iteration, but which already has a memory location available with said contents. Instead of inserting new spill slots, it would be ideal to reuse the existing locations.

This particular example was motivated by our statepoint lowering - which, unfortunately, appears to create such idioms - but the problem is visible in arbitrary IR as well. For statepoints, doing this at the MI layer would be preferred, but I see a lot of argument for this being a missed IR-level PRE-like store transform as well.
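As a distilled sketch of the idiom (hypothetical and reduced for illustration - @sketch and its shape are made up, and with only one loop-carried value no spill actually occurs; the real test below carries 15 such values to create the register pressure):

; At the store, *%p already contains %v (written in %entry or by the
; previous iteration), and nothing else writes *%p between the load and
; the next store. Under register pressure the allocator spills such a
; value to a fresh stack slot anyway, even though reloading from %p
; would produce the same bits.
declare void @clobber()

define void @sketch(i64* %p, i64 %n) {
entry:
  store i64 0, i64* %p
  br label %loop

loop:
  %iv = phi i64 [0, %entry], [%iv.next, %loop]
  %v = phi i64 [0, %entry], [%v.next, %loop]
  store i64 %v, i64* %p        ; stores back the value loaded last iteration
  call void @clobber()
  %v.next = load i64, i64* %p
  %iv.next = add i64 %iv, 1
  %done = icmp eq i64 %iv.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret void
}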

declare void @clobber()
declare void @use(...)

define void @test(i64* %p, i64 %cnt) {
entry:
  %p1 = getelementptr i64, i64* %p, i64 1
  %p2 = getelementptr i64, i64* %p, i64 2
  %p3 = getelementptr i64, i64* %p, i64 3
  %p4 = getelementptr i64, i64* %p, i64 4
  %p5 = getelementptr i64, i64* %p, i64 5
  %p6 = getelementptr i64, i64* %p, i64 6
  %p7 = getelementptr i64, i64* %p, i64 7
  %p8 = getelementptr i64, i64* %p, i64 8
  %p9 = getelementptr i64, i64* %p, i64 9
  %p10 = getelementptr i64, i64* %p, i64 10
  %p11 = getelementptr i64, i64* %p, i64 11
  %p12 = getelementptr i64, i64* %p, i64 12
  %p13 = getelementptr i64, i64* %p, i64 13
  %p14 = getelementptr i64, i64* %p, i64 14

  store i64 0, i64* %p
  store i64 0, i64* %p1
  store i64 0, i64* %p2
  store i64 0, i64* %p3
  store i64 0, i64* %p4
  store i64 0, i64* %p5
  store i64 0, i64* %p6
  store i64 0, i64* %p7
  store i64 0, i64* %p8
  store i64 0, i64* %p9
  store i64 0, i64* %p10
  store i64 0, i64* %p11
  store i64 0, i64* %p12
  store i64 0, i64* %p13
  store i64 0, i64* %p14
  br label %loop

loop:
  %iv = phi i64 [0, %entry], [%iv.next, %loop]
  %v1 = phi i64 [0, %entry], [%v2, %loop]
  %v1p1 = phi i64 [0, %entry], [%v2p1, %loop]
  %v1p2 = phi i64 [0, %entry], [%v2p2, %loop]
  %v1p3 = phi i64 [0, %entry], [%v2p3, %loop]
  %v1p4 = phi i64 [0, %entry], [%v2p4, %loop]
  %v1p5 = phi i64 [0, %entry], [%v2p5, %loop]
  %v1p6 = phi i64 [0, %entry], [%v2p6, %loop]
  %v1p7 = phi i64 [0, %entry], [%v2p7, %loop]
  %v1p8 = phi i64 [0, %entry], [%v2p8, %loop]
  %v1p9 = phi i64 [0, %entry], [%v2p9, %loop]
  %v1p10 = phi i64 [0, %entry], [%v2p10, %loop]
  %v1p11 = phi i64 [0, %entry], [%v2p11, %loop]
  %v1p12 = phi i64 [0, %entry], [%v2p12, %loop]
  %v1p13 = phi i64 [0, %entry], [%v2p13, %loop]
  %v1p14 = phi i64 [0, %entry], [%v2p14, %loop]
  store i64 %v1, i64* %p
  store i64 %v1p1, i64* %p1
  store i64 %v1p2, i64* %p2
  store i64 %v1p3, i64* %p3
  store i64 %v1p4, i64* %p4
  store i64 %v1p5, i64* %p5
  store i64 %v1p6, i64* %p6
  store i64 %v1p7, i64* %p7
  store i64 %v1p8, i64* %p8
  store i64 %v1p9, i64* %p9
  store i64 %v1p9, i64* %p10
  store i64 %v1p10, i64* %p11
  store i64 %v1p12, i64* %p12
  store i64 %v1p13, i64* %p13
  store i64 %v1p14, i64* %p14
  call void @clobber()
  %v2 = load i64, i64* %p
  %v2p1 = load i64, i64* %p1
  %v2p2 = load i64, i64* %p2
  %v2p3 = load i64, i64* %p3
  %v2p4 = load i64, i64* %p4
  %v2p5 = load i64, i64* %p5
  %v2p6 = load i64, i64* %p6
  %v2p7 = load i64, i64* %p7
  %v2p8 = load i64, i64* %p8
  %v2p9 = load i64, i64* %p9
  %v2p10 = load i64, i64* %p10
  %v2p11 = load i64, i64* %p11
  %v2p12 = load i64, i64* %p12
  %v2p13 = load i64, i64* %p13
  %v2p14 = load i64, i64* %p14

  %iv.next = add i64 %iv, 1
  %exit.cmp = icmp sgt i64 %iv, 200
  br i1 %exit.cmp, label %exit, label %loop

exit:
  call void (...) @use(i64 %v2)
  ret void
}

$ ../build/bin/opt -O1 -S loop-stld.ll | ../build/bin/llc -O3
.text
.file   "loop-stld.ll"
.globl  test                    # -- Begin function test
.p2align    4, 0x90
.type   test,@function
test:                                   # @test
.cfi_startproc

# %bb.0:                                # %entry

pushq   %rbp
.cfi_def_cfa_offset 16
pushq   %r15
.cfi_def_cfa_offset 24
pushq   %r14
.cfi_def_cfa_offset 32
pushq   %r13
.cfi_def_cfa_offset 40
pushq   %r12
.cfi_def_cfa_offset 48
pushq   %rbx
.cfi_def_cfa_offset 56
subq    $24, %rsp
.cfi_def_cfa_offset 80
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
.cfi_offset %rbp, -16
movq    %rdi, %rbx
movq    $0, 112(%rdi)
movq    $0, 104(%rdi)
movq    $0, 96(%rdi)
movq    $0, 88(%rdi)
movq    $0, 80(%rdi)
movq    $0, 72(%rdi)
movq    $0, 64(%rdi)
movq    $0, 56(%rdi)
movq    $0, 48(%rdi)
movq    $0, 40(%rdi)
movq    $0, 32(%rdi)
movq    $0, 24(%rdi)
movq    $0, 16(%rdi)
movq    $0, 8(%rdi)
movq    $0, (%rdi)
movq    $-1, %r14
xorl    %eax, %eax
movq    %rax, 8(%rsp)           # 8-byte Spill
xorl    %edi, %edi
xorl    %esi, %esi
xorl    %ecx, %ecx
xorl    %edx, %edx
xorl    %ebp, %ebp
xorl    %r12d, %r12d
xorl    %r13d, %r13d
xorl    %r10d, %r10d
xorl    %eax, %eax
xorl    %r11d, %r11d
xorl    %r15d, %r15d
xorl    %r9d, %r9d
xorl    %r8d, %r8d
.p2align    4, 0x90

.LBB0_1:                                # %loop
                                        # =>This Inner Loop Header: Depth=1

movq    %rbp, 16(%rsp)          # 8-byte Spill
movq    %rdi, %rbp
movq    8(%rsp), %rdi           # 8-byte Reload <-- PROBLEM
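                                        # ^ 8(%rsp) was filled from (%rbx) after the call below, and nothing
                                        # has written (%rbx) since, so this spill slot (and reload) merely
                                        # duplicates a value that memory already holds.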
movq    %rdi, (%rbx)
movq    %rbp, 8(%rbx)
movq    %rsi, 16(%rbx)
movq    %rcx, 24(%rbx)
movq    %rdx, 32(%rbx)
movq    16(%rsp), %rcx          # 8-byte Reload <-- PROBLEM
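                                        # ^ Same story: 40(%rbx) already holds the value reloaded from 16(%rsp).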
movq    %rcx, 40(%rbx)
movq    %r12, 48(%rbx)
movq    %r13, 56(%rbx)
movq    %r10, 64(%rbx)
movq    %rax, 72(%rbx)
movq    %rax, 80(%rbx)
movq    %r11, 88(%rbx)
movq    %r15, 96(%rbx)
movq    %r9, 104(%rbx)
movq    %r8, 112(%rbx)
callq   clobber
movq    (%rbx), %rax
movq    %rax, 8(%rsp)           # 8-byte Spill
movq    8(%rbx), %rdi
movq    16(%rbx), %rsi
movq    24(%rbx), %rcx
movq    32(%rbx), %rdx
movq    40(%rbx), %rbp
movq    48(%rbx), %r12
movq    56(%rbx), %r13
movq    64(%rbx), %r10
movq    72(%rbx), %rax
movq    80(%rbx), %r11
movq    96(%rbx), %r15
movq    104(%rbx), %r9
movq    112(%rbx), %r8
incq    %r14
cmpq    $201, %r14
jb  .LBB0_1

# %bb.2:                                # %exit

movq    8(%rsp), %rdi           # 8-byte Reload
xorl    %eax, %eax
addq    $24, %rsp
.cfi_def_cfa_offset 56
popq    %rbx
.cfi_def_cfa_offset 48
popq    %r12
.cfi_def_cfa_offset 40
popq    %r13
.cfi_def_cfa_offset 32
popq    %r14
.cfi_def_cfa_offset 24
popq    %r15
.cfi_def_cfa_offset 16
popq    %rbp
.cfi_def_cfa_offset 8
jmp use                     # TAILCALL

.Lfunc_end0:
.size   test, .Lfunc_end0-test
.cfi_endproc
                                        # -- End function

.section    ".note.GNU-stack","",@progbits
serguei-katkov commented 5 years ago

I'm sorry - below is a slightly more correct test (with phi nodes for the pointers):

declare void @clobber()
declare void @use(...)

define void @test(i64* %p, i64 %cnt, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" {
entry:
  %p1 = getelementptr i64, i64* %p, i64 1
  %p2 = getelementptr i64, i64* %p, i64 2
  %p3 = getelementptr i64, i64* %p, i64 3
  %p4 = getelementptr i64, i64* %p, i64 4
  %p5 = getelementptr i64, i64* %p, i64 5
  %p6 = getelementptr i64, i64* %p, i64 6
  %p7 = getelementptr i64, i64* %p, i64 7
  %p8 = getelementptr i64, i64* %p, i64 8
  %p9 = getelementptr i64, i64* %p, i64 9
  %p10 = getelementptr i64, i64* %p, i64 10
  %p11 = getelementptr i64, i64* %p, i64 11
  %p12 = getelementptr i64, i64* %p, i64 12
  %p13 = getelementptr i64, i64* %p, i64 13
  %p14 = getelementptr i64, i64* %p, i64 14

  store i64 0, i64* %p
  store i64 0, i64* %p1
  store i64 0, i64* %p2
  store i64 0, i64* %p3
  store i64 0, i64* %p4
  store i64 0, i64* %p5
  store i64 0, i64* %p6
  store i64 0, i64* %p7
  store i64 0, i64* %p8
  store i64 0, i64* %p9
  store i64 0, i64* %p10
  store i64 0, i64* %p11
  store i64 0, i64* %p12
  store i64 0, i64* %p13
  store i64 0, i64* %p14
  br label %loop

loop:
  %pa = phi i32 addrspace(1)* [%a, %entry], [%a1, %loop]
  %pb = phi i32 addrspace(1)* [%a, %entry], [%b1, %loop]
  %pc = phi i32 addrspace(1)* [%a, %entry], [%c1, %loop]
  %iv = phi i64 [0, %entry], [%iv.next, %loop]
  %v1 = phi i64 [0, %entry], [%v2, %loop]
  %v1p1 = phi i64 [0, %entry], [%v2p1, %loop]
  %v1p2 = phi i64 [0, %entry], [%v2p2, %loop]
  %v1p3 = phi i64 [0, %entry], [%v2p3, %loop]
  %v1p4 = phi i64 [0, %entry], [%v2p4, %loop]
  %v1p5 = phi i64 [0, %entry], [%v2p5, %loop]
  %v1p6 = phi i64 [0, %entry], [%v2p6, %loop]
  %v1p7 = phi i64 [0, %entry], [%v2p7, %loop]
  %v1p8 = phi i64 [0, %entry], [%v2p8, %loop]
  %v1p9 = phi i64 [0, %entry], [%v2p9, %loop]
  %v1p10 = phi i64 [0, %entry], [%v2p10, %loop]
  %v1p11 = phi i64 [0, %entry], [%v2p11, %loop]
  %v1p12 = phi i64 [0, %entry], [%v2p12, %loop]
  %v1p13 = phi i64 [0, %entry], [%v2p13, %loop]
  %v1p14 = phi i64 [0, %entry], [%v2p14, %loop]
  store i64 %v1, i64* %p
  store i64 %v1p1, i64* %p1
  store i64 %v1p2, i64* %p2
  store i64 %v1p3, i64* %p3
  store i64 %v1p4, i64* %p4
  store i64 %v1p5, i64* %p5
  store i64 %v1p6, i64* %p6
  store i64 %v1p7, i64* %p7
  store i64 %v1p8, i64* %p8
  store i64 %v1p9, i64* %p9
  store i64 %v1p10, i64* %p10
  store i64 %v1p11, i64* %p11
  store i64 %v1p12, i64* %p12
  store i64 %v1p13, i64* %p13
  store i64 %v1p14, i64* %p14
  %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @clobber, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %pa, i32 addrspace(1)* %pb, i32 addrspace(1)* %pc)
  %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 12)
  %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 13)
  %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 14)
  %v2 = load i64, i64* %p
  %v2p1 = load i64, i64* %p1
  %v2p2 = load i64, i64* %p2
  %v2p3 = load i64, i64* %p3
  %v2p4 = load i64, i64* %p4
  %v2p5 = load i64, i64* %p5
  %v2p6 = load i64, i64* %p6
  %v2p7 = load i64, i64* %p7
  %v2p8 = load i64, i64* %p8
  %v2p9 = load i64, i64* %p9
  %v2p10 = load i64, i64* %p10
  %v2p11 = load i64, i64* %p11
  %v2p12 = load i64, i64* %p12
  %v2p13 = load i64, i64* %p13
  %v2p14 = load i64, i64* %p14

  %iv.next = add i64 %iv, 1
  %exit.cmp = icmp sgt i64 %iv, 200
  br i1 %exit.cmp, label %exit, label %loop

exit:
  call void (...) @use(i64 %v2)
  ret void
}

declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32) #3
declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32) #3

declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)

declare i32 @"personality_function"()

attributes #1 = { uwtable }

And here is the loop:

.LBB0_1:                                # %loop
                                        # =>This Inner Loop Header: Depth=1

movq    %r11, 64(%rsp)          # 8-byte Spill
movq    8(%rsp), %r11           # 8-byte Reload
movq    %r11, (%r15)
movq    %rcx, 8(%r15)
movq    %rsi, 16(%r15)
movq    %rax, 24(%r15)
movq    %rbp, 32(%r15)
movq    %rdx, 40(%r15)
movq    %rdi, 48(%r15)
movq    64(%rsp), %rax          # 8-byte Reload
movq    %rax, 56(%r15)
movq    16(%rsp), %rax          # 8-byte Reload
movq    %rax, 64(%r15)
movq    %rbx, 72(%r15)
movq    %r8, 80(%r15)
movq    %r12, 88(%r15)
movq    %r9, 96(%r15)
movq    %r10, 104(%r15)
movq    %r13, 112(%r15)
movq    24(%rsp), %rax          # 8-byte Reload
movq    %rax, 56(%rsp)
movq    40(%rsp), %rax          # 8-byte Reload
movq    %rax, (%rsp)
movq    32(%rsp), %rax          # 8-byte Reload
movq    %rax, 48(%rsp)
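                                        # ^ These three reload/store pairs copy values from spill slots created
                                        # by the register allocator (24/40/32(%rsp)) into stack slots created
                                        # by statepoint lowering (56(%rsp), (%rsp), 48(%rsp)).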
callq   clobber

.Ltmp0:
movq    (%rsp), %rax
movq    %rax, 40(%rsp)          # 8-byte Spill
movq    48(%rsp), %rax
movq    %rax, 32(%rsp)          # 8-byte Spill
movq    56(%rsp), %rax
movq    %rax, 24(%rsp)          # 8-byte Spill
movq    (%r15), %rax
movq    %rax, 8(%rsp)           # 8-byte Spill
movq    8(%r15), %rcx
movq    16(%r15), %rsi
movq    24(%r15), %rax
movq    32(%r15), %rbp
movq    40(%r15), %rdx
movq    48(%r15), %rdi
movq    56(%r15), %r11
movq    64(%r15), %rbx
movq    %rbx, 16(%rsp)          # 8-byte Spill
movq    72(%r15), %rbx
movq    80(%r15), %r8
movq    88(%r15), %r12
incq    %r14
cmpq    $201, %r14
movq    96(%r15), %r9
movq    104(%r15), %r10
movq    112(%r15), %r13
jl  .LBB0_1

serguei-katkov commented 5 years ago

Here is the slightly updated test published by Philip. It adds usage of the statepoint intrinsic with pointers.

It shows that llc produces moves from one stack slot to another stack slot before the call, and does the same after the call.

As I understand it, Stack Slot Coloring does not help here because it is based on the LiveStacks analysis; it does nothing but reuse the results of the register allocator.

It seems the register allocator only fills in live stack intervals for the spills it introduces itself, while in this example the stack slots are introduced at the moment of statepoint intrinsic lowering.
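(For reference, the dump below comes from a debug build of llc; an invocation along these lines should reproduce it - the -debug-only value is my assumption based on the pass's name:)

$ ../build/bin/llc -O3 -debug-only=stack-slot-coloring test.ll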

Here is an output from the Stack Slot Coloring pass:

********** Stack Slot Coloring **********
********** Function: test
Spill slot intervals:
SS#3 [16r,1808B:0) 0@x weight:3.287500e+01
SS#4 [32r,1808B:0) 0@x weight:3.287500e+01
SS#5 [48r,1808B:0) 0@x weight:3.287500e+01
SS#6 [336r,896r:0)[1232r,1872r:0) 0@x weight:6.575000e+01
SS#7 [512r,1024r:0)[1360r,1808B:0) 0@x weight:6.475000e+01
SS#8 [624B,1008r:0) 0@x weight:6.375000e+01

Color spill slot intervals:
Assigning fi#6 to fi#3
Assigning fi#7 to fi#4
Assigning fi#8 to fi#5
Assigning fi#3 to fi#6
Assigning fi#4 to fi#7
Assigning fi#5 to fi#8

Spill slots after coloring:
SS#3 [16r,1808B:0) 0@x weight:6.575000e+01
SS#4 [32r,1808B:0) 0@x weight:6.475000e+01
SS#5 [48r,1808B:0) 0@x weight:6.375000e+01
SS#6 [336r,896r:0)[1232r,1872r:0) 0@x weight:3.287500e+01
SS#7 [512r,1024r:0)[1360r,1808B:0) 0@x weight:3.287500e+01
SS#8 [624B,1008r:0) 0@x weight:3.287500e+01

We can see that stack slots 0-2, introduced by statepoint lowering, are not considered at all. As a result, values keep being moved from stack slots introduced by the register allocator to stack slots introduced by statepoint instruction lowering.

I tend to think we need to update the LiveStacks analysis to fill in information for stack slots that are not introduced by the register allocator.

I guess allocas suffer from the same problem.

The modified test:

declare void @clobber()
declare void @use(...)

define void @test(i64* %p, i64 %cnt, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #1 gc "statepoint-example" {
entry:
  %p1 = getelementptr i64, i64* %p, i64 1
  %p2 = getelementptr i64, i64* %p, i64 2
  %p3 = getelementptr i64, i64* %p, i64 3
  %p4 = getelementptr i64, i64* %p, i64 4
  %p5 = getelementptr i64, i64* %p, i64 5
  %p6 = getelementptr i64, i64* %p, i64 6
  %p7 = getelementptr i64, i64* %p, i64 7
  %p8 = getelementptr i64, i64* %p, i64 8
  %p9 = getelementptr i64, i64* %p, i64 9
  %p10 = getelementptr i64, i64* %p, i64 10
  %p11 = getelementptr i64, i64* %p, i64 11
  %p12 = getelementptr i64, i64* %p, i64 12
  %p13 = getelementptr i64, i64* %p, i64 13
  %p14 = getelementptr i64, i64* %p, i64 14

  store i64 0, i64* %p
  store i64 0, i64* %p1
  store i64 0, i64* %p2
  store i64 0, i64* %p3
  store i64 0, i64* %p4
  store i64 0, i64* %p5
  store i64 0, i64* %p6
  store i64 0, i64* %p7
  store i64 0, i64* %p8
  store i64 0, i64* %p9
  store i64 0, i64* %p10
  store i64 0, i64* %p11
  store i64 0, i64* %p12
  store i64 0, i64* %p13
  store i64 0, i64* %p14
  br label %loop

loop:
  %iv = phi i64 [0, %entry], [%iv.next, %loop]
  %v1 = phi i64 [0, %entry], [%v2, %loop]
  %v1p1 = phi i64 [0, %entry], [%v2p1, %loop]
  %v1p2 = phi i64 [0, %entry], [%v2p2, %loop]
  %v1p3 = phi i64 [0, %entry], [%v2p3, %loop]
  %v1p4 = phi i64 [0, %entry], [%v2p4, %loop]
  %v1p5 = phi i64 [0, %entry], [%v2p5, %loop]
  %v1p6 = phi i64 [0, %entry], [%v2p6, %loop]
  %v1p7 = phi i64 [0, %entry], [%v2p7, %loop]
  %v1p8 = phi i64 [0, %entry], [%v2p8, %loop]
  %v1p9 = phi i64 [0, %entry], [%v2p9, %loop]
  %v1p10 = phi i64 [0, %entry], [%v2p10, %loop]
  %v1p11 = phi i64 [0, %entry], [%v2p11, %loop]
  %v1p12 = phi i64 [0, %entry], [%v2p12, %loop]
  %v1p13 = phi i64 [0, %entry], [%v2p13, %loop]
  %v1p14 = phi i64 [0, %entry], [%v2p14, %loop]
  store i64 %v1, i64* %p
  store i64 %v1p1, i64* %p1
  store i64 %v1p2, i64* %p2
  store i64 %v1p3, i64* %p3
  store i64 %v1p4, i64* %p4
  store i64 %v1p5, i64* %p5
  store i64 %v1p6, i64* %p6
  store i64 %v1p7, i64* %p7
  store i64 %v1p8, i64* %p8
  store i64 %v1p9, i64* %p9
  store i64 %v1p10, i64* %p10
  store i64 %v1p11, i64* %p11
  store i64 %v1p12, i64* %p12
  store i64 %v1p13, i64* %p13
  store i64 %v1p14, i64* %p14
  %safepoint_token = tail call token (i64, i32, void ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_isVoidf(i64 0, i32 0, void ()* @clobber, i32 0, i32 0, i32 0, i32 5, i32 0, i32 -1, i32 0, i32 0, i32 0, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c)
  %a1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 12)
  %b1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 13)
  %c1 = tail call coldcc i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token %safepoint_token, i32 12, i32 14)
  %v2 = load i64, i64* %p
  %v2p1 = load i64, i64* %p1
  %v2p2 = load i64, i64* %p2
  %v2p3 = load i64, i64* %p3
  %v2p4 = load i64, i64* %p4
  %v2p5 = load i64, i64* %p5
  %v2p6 = load i64, i64* %p6
  %v2p7 = load i64, i64* %p7
  %v2p8 = load i64, i64* %p8
  %v2p9 = load i64, i64* %p9
  %v2p10 = load i64, i64* %p10
  %v2p11 = load i64, i64* %p11
  %v2p12 = load i64, i64* %p12
  %v2p13 = load i64, i64* %p13
  %v2p14 = load i64, i64* %p14

  %iv.next = add i64 %iv, 1
  %exit.cmp = icmp sgt i64 %iv, 200
  br i1 %exit.cmp, label %exit, label %loop

exit:
  call void (...) @use(i64 %v2)
  ret void
}

declare i32 addrspace(1)* @llvm.experimental.gc.relocate.p1i32(token, i32, i32) #3
declare i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(token, i32, i32) #3

declare token @llvm.experimental.gc.statepoint.p0f_isVoidf(i64, i32, void ()*, i32, i32, ...)

declare i32 @"personality_function"()

attributes #1 = { uwtable }

The llc result (without the stack map):

.text
.file   "test.ll"
.globl  test                    # -- Begin function test
.p2align    4, 0x90
.type   test,@function
test:                                   # @test
.cfi_startproc

# %bb.0:                                # %entry

pushq   %rbp
.cfi_def_cfa_offset 16
pushq   %r15
.cfi_def_cfa_offset 24
pushq   %r14
.cfi_def_cfa_offset 32
pushq   %r13
.cfi_def_cfa_offset 40
pushq   %r12
.cfi_def_cfa_offset 48
pushq   %rbx
.cfi_def_cfa_offset 56
subq    $72, %rsp
.cfi_def_cfa_offset 128
.cfi_offset %rbx, -56
.cfi_offset %r12, -48
.cfi_offset %r13, -40
.cfi_offset %r14, -32
.cfi_offset %r15, -24
.cfi_offset %rbp, -16
movq    %r8, 40(%rsp)           # 8-byte Spill
movq    %rcx, 32(%rsp)          # 8-byte Spill
movq    %rdx, 24(%rsp)          # 8-byte Spill
movq    %rdi, %r14
movq    $0, (%rdi)
movq    $0, 8(%rdi)
movq    $0, 16(%rdi)
movq    $0, 24(%rdi)
movq    $0, 32(%rdi)
movq    $0, 40(%rdi)
movq    $0, 48(%rdi)
movq    $0, 56(%rdi)
movq    $0, 64(%rdi)
movq    $0, 72(%rdi)
movq    $0, 80(%rdi)
movq    $0, 88(%rdi)
movq    $0, 96(%rdi)
movq    $0, 104(%rdi)
movq    $0, 112(%rdi)
movq    $-1, %r13
xorl    %eax, %eax
movq    %rax, 8(%rsp)           # 8-byte Spill
xorl    %eax, %eax
xorl    %ecx, %ecx
xorl    %edx, %edx
xorl    %esi, %esi
xorl    %ebp, %ebp
xorl    %edi, %edi
xorl    %r9d, %r9d
xorl    %ebx, %ebx
movq    %rbx, 16(%rsp)          # 8-byte Spill
xorl    %r10d, %r10d
xorl    %ebx, %ebx
xorl    %r15d, %r15d
xorl    %r12d, %r12d
xorl    %r8d, %r8d
xorl    %r11d, %r11d
.p2align    4, 0x90

.LBB0_1:                                # %loop
                                        # =>This Inner Loop Header: Depth=1

movq    %r9, 48(%rsp)           # 8-byte Spill
movq    8(%rsp), %r9            # 8-byte Reload
movq    %r9, (%r14)
movq    %rax, 8(%r14)
movq    %rcx, 16(%r14)
movq    %rdx, 24(%r14)
movq    %rsi, 32(%r14)
movq    %rbp, 40(%r14)
movq    %rdi, 48(%r14)
movq    48(%rsp), %rax          # 8-byte Reload
movq    %rax, 56(%r14)
movq    16(%rsp), %rax          # 8-byte Reload
movq    %rax, 64(%r14)
movq    %r10, 72(%r14)
movq    %rbx, 80(%r14)
movq    %r15, 88(%r14)
movq    %r12, 96(%r14)
movq    %r8, 104(%r14)
movq    %r11, 112(%r14)
movq    40(%rsp), %rax          # 8-byte Reload
movq    %rax, 64(%rsp)
movq    24(%rsp), %rax          # 8-byte Reload
movq    %rax, (%rsp)
movq    32(%rsp), %rax          # 8-byte Reload
movq    %rax, 56(%rsp)
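                                        # ^ Again, values are copied from regalloc spill slots (40/24/32(%rsp))
                                        # into the statepoint lowering slots (64(%rsp), (%rsp), 56(%rsp)).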
callq   clobber

.Ltmp0:
movq    (%r14), %rax
movq    %rax, 8(%rsp)           # 8-byte Spill
movq    8(%r14), %rax
movq    16(%r14), %rcx
movq    24(%r14), %rdx
movq    32(%r14), %rsi
movq    40(%r14), %rbp
movq    48(%r14), %rdi
movq    56(%r14), %r9
movq    64(%r14), %rbx
movq    %rbx, 16(%rsp)          # 8-byte Spill
movq    72(%r14), %r10
movq    80(%r14), %rbx
movq    88(%r14), %r15
movq    96(%r14), %r12
movq    104(%r14), %r8
movq    112(%r14), %r11
incq    %r13
cmpq    $201, %r13
jl  .LBB0_1

# %bb.2:                                # %exit

movq    8(%rsp), %rdi           # 8-byte Reload
xorl    %eax, %eax
callq   use
addq    $72, %rsp
.cfi_def_cfa_offset 56
popq    %rbx
.cfi_def_cfa_offset 48
popq    %r12
.cfi_def_cfa_offset 40
popq    %r13
.cfi_def_cfa_offset 32
popq    %r14
.cfi_def_cfa_offset 24
popq    %r15
.cfi_def_cfa_offset 16
popq    %rbp
.cfi_def_cfa_offset 8
retq

.Lfunc_end0:
.size   test, .Lfunc_end0-test
.cfi_endproc
                                        # -- End function

preames commented 5 years ago

Quentin,

The symptom is that we end up with code which shuffles values between a stack slot and the non-stack memory location. We could have avoided generating the stack spill slot entirely and instead reloaded from the non-stack memory location if needed. (A reload isn't needed in the example.)

So, for this particular example, it isn't a stack colouring problem. However, I think the actual case I reduced this from - statepoint lowering - is a stack colouring problem since all of the memory locations are stack slots. Thanks for that observation.

qcolombet commented 5 years ago

I'm not sure I understand the problem.

  1. Is the problem that we use more spill slots than we need?
  2. Is the problem that we insert more spill code than we should?

For #1, regalloc has one spill slot per live range. It is the job of the stack coloring pass to merge those.

Could you check what the debug output of that pass says?

For #2, this would indeed be a regalloc improvement.

pogo59 commented 5 years ago

Keeping better track of existing stack homes can also help debug info. DWARF 5 has a "default location" notion, which is where the debugger should look if there's no specific (typically register) location attached to a given instruction range. Stack homes are the ideal candidate for a default location.