Quuxplusone / LLVMBugzillaTest

0 stars 0 forks source link

Bad register allocation compiling GCC inline assembly (ARM) #16325

Open Quuxplusone opened 11 years ago

Quuxplusone commented 11 years ago
Bugzilla Link PR16326
Status NEW
Importance P normal
Reported by Jonathan Lennox (lennox@cs.columbia.edu)
Reported on 2013-06-13 19:46:39 -0700
Last modified on 2017-11-13 09:05:59 -0800
Version trunk
Hardware PC All
CC codeman.consulting@gmail.com, echristo@gmail.com, llvm-bugs@lists.llvm.org, mcrosier@codeaurora.org
Fixed by commit(s)
Attachments reg-overload-small.c (725 bytes, application/octet-stream)
reg-overload-large.c (803 bytes, application/octet-stream)
Blocks PR27197
Blocked by
See also
Created attachment 10677
File that (nearly) overloads LLVM's ARM registers.

The register allocation for operand constraints in GCC-style inline assembly is
very poor, leading to inefficient code or, in extreme cases, to compilation failures.

Consider reg-overload-small.c (attached).

LLVM generates the following code.  Notice the shuffling of values between the
stack and registers before the inline assembly block.

_foo:
@ BB#0:                                 @ %entry
    push    {r4, r5, r6, r7, lr}
    add r7, sp, #12
    push.w  {r8, r10, r11}
    sub sp, #32
    add.w   lr, r7, #8
    add.w   r8, sp, #12
    ldm.w   lr, {r4, r9, r12, lr}
    add.w   r10, sp, #16
    add.w   r11, sp, #20
    str.w   r12, [sp, #4]
    add.w   r12, sp, #4
    str.w   lr, [sp]
    add.w   lr, sp, #8
    add r6, sp, #24
    add r5, sp, #28
    str.w   r9, [sp, #8]
    mov r9, sp
    str r0, [sp, #28]
    str r1, [sp, #24]
    str r2, [sp, #20]
    str r3, [sp, #16]
    str r4, [sp, #12]
    @ InlineAsm Start
    ldr r0, [r5]
    ldr r1, [r0]
    ldr r2, [r6]
    ldr r3, [r11]
    ldr r4, [r10]
    add r1, r1, r2
    add r1, r1, r3
    add r1, r1, r4
    ldr r2, [r8]
    ldr r3, [lr]
    ldr r4, [r12]
    add r1, r1, r2
    add r1, r1, r3
    add r1, r1, r4
    ldr r2, [r9]
    add r1, r1, r2
    str r1, [r0]

    @ InlineAsm End
    add sp, #32
    pop.w   {r8, r10, r11}
    pop {r4, r5, r6, r7, pc}

gcc-4.2, by contrast, generates:

_foo:
    @ args = 16, pretend = 0, frame = 16
    @ frame_needed = 0, uses_anonymous_args = 0
    @ link register save eliminated.
    push    {r4}
    sub sp, sp, #16
    @ lr needed for prologue
    str r0, [sp, #12]
    str r1, [sp, #8]
    str r2, [sp, #4]
    str r3, [sp]
    ldr r0, [sp, #12]
    ldr r1, [r0]
    ldr r2, [sp, #8]
    ldr r3, [sp, #4]
    ldr r4, [sp]
    add r1, r1, r2
    add r1, r1, r3
    add r1, r1, r4
    ldr r2, [sp, #20]
    ldr r3, [sp, #24]
    ldr r4, [sp, #28]
    add r1, r1, r2
    add r1, r1, r3
    add r1, r1, r4
    ldr r2, [sp, #32]
    add r1, r1, r2
    str r1, [r0]

    add sp, sp, #16
    pop {r4}
    bx  lr

If we change the inline assembly to add one more register, as in
reg-overload-large.c, LLVM fails to compile the code entirely:

$ clang -O3 -c -arch armv7 -save-temps reg-overload-large.c
reg-overload-large.c:5:4: error: ran out of registers during register allocation
   "ldr r0, %[val]\n\t"
   ^
1 error generated.

gcc-4.2 also compiles this code perfectly well.

This is clang tip:

clang -v
clang version 3.4 (trunk 183951) (llvm/trunk 183950)
Target: x86_64-apple-darwin12.4.0
Thread model: posix
Quuxplusone commented 11 years ago

Attached reg-overload-small.c (725 bytes, application/octet-stream): File that (nearly) overloads LLVM's ARM registers.

Quuxplusone commented 11 years ago

Attached reg-overload-large.c (803 bytes, application/octet-stream): File that overloads LLVM's ARM registers entirely.

Quuxplusone commented 11 years ago

Thanks, Jonathan. I have not had time to dig into this, but at first glance it looks like you're specifying an 'm' constraint, but expecting the compiler to directly use the input registers. I believe clang is "doing the right thing" here.

If you use an 'r' constraint do you get the output you expect?

Quuxplusone commented 11 years ago
Some of the inputs are in registers, but others are in sp-relative memory.

Basically, for the args on the stack, rather than passing [sp, #28] to satisfy
my "m" constraint (as GCC does), LLVM is doing add r5, sp, #28, and then
passing [r5] for my constraint.

So I suppose all the constraints should be "rm", to give the compiler the most
flexibility.

When I do that, clang's output is about the same:

_foo:
@ BB#0:                                 @ %entry
        push    {r4, r5, r6, r7, lr}
        add     r7, sp, #12
        push.w  {r8, r10, r11}
        sub     sp, #32
        add.w   lr, r7, #8
        add.w   r8, sp, #12
        ldm.w   lr, {r4, r9, r12, lr}
        add.w   r10, sp, #16
        add.w   r11, sp, #20
        str     r0, [sp, #28]
        add     r6, sp, #24
        str     r1, [sp, #24]
        add     r5, sp, #28
        str     r2, [sp, #20]
        str     r3, [sp, #16]
        str     r4, [sp, #12]
        str.w   r9, [sp, #8]
        mov     r9, sp
        str.w   r12, [sp, #4]
        add.w   r12, sp, #4
        str.w   lr, [sp]
        add.w   lr, sp, #8
        @ InlineAsm Start
        ldr r0, [r5]
        ldr r1, [r0]
        ldr r2, [r6]
        ldr r3, [r11]
        ldr r4, [r10]
        add r1, r1, r2
        add r1, r1, r3
        add r1, r1, r4
        ldr r2, [r8]
        ldr r3, [lr]
        ldr r4, [r12]
        add r1, r1, r2
        add r1, r1, r3
        add r1, r1, r4
        ldr r2, [r9]
        add r1, r1, r2
        str r1, [r0]

        @ InlineAsm End
        add     sp, #32
        pop.w   {r8, r10, r11}
        pop     {r4, r5, r6, r7, pc}

Whereas gcc's is even better than before:

_foo:
        @ args = 16, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        push    {r4, r5, lr}
        mov     r5, r0
        mov     r9, r1
        mov     lr, r2
        mov     ip, r3
        ldr r0, r5
        ldr r1, [r0]
        ldr r2, r9
        ldr r3, lr
        ldr r4, ip
        add r1, r1, r2
        add r1, r1, r3
        add r1, r1, r4
        ldr r2, [sp, #12]
        ldr r3, [sp, #16]
        ldr r4, [sp, #20]
        add r1, r1, r2
        add r1, r1, r3
        add r1, r1, r4
        ldr r2, [sp, #24]
        add r1, r1, r2
        str r1, [r0]

        pop     {r4, r5, pc}
        .subsections_via_symbols

Even with this change (or changing all the constraints to "r" only), in the
"large" case clang still fails to compile the code.
Quuxplusone commented 7 years ago
I tried setting the inputs to "=m" and the outputs to "", and removing the
extraneous clobbers (the listed registers are not expected to survive anyway).
Register usage stayed down, and since those arguments weren't marked as outputs,
they were never saved on the stack.

foo:
    .fnstart
@ BB#0:
    push    {r4, r5, r6, r10, r11, lr}
    add r11, sp, #16
    sub sp, sp, #36
    str r0, [r11, #-20]
    mov r12, sp
    ldr r0, [r11, #8]
    add lr, sp, #4
    str r1, [r11, #-24]
    add r1, sp, #20
    str r2, [sp, #24]
    add r2, sp, #8
    str r0, [sp, #16]
    add r4, sp, #24
    ldr r0, [r11, #12]
    sub r5, r11, #24
    str r3, [sp, #20]
    add r3, sp, #12
    sub r6, r11, #20
    str r0, [sp, #12]
    ldr r0, [r11, #16]
    str r0, [sp, #8]
    ldr r0, [r11, #20]
    str r0, [sp, #4]
    ldr r0, [r11, #24]
    str r0, [sp]
    add r0, sp, #16
    @APP
    ldr r0, [r6]
    ldr r1, [r0]
    ldr r2, [r5]
    ldr r3, [r4]
    ldr r4, [r1]
    add r1, r1, r2
    add r1, r1, r3
    add r1, r1, r4
    ldr r2, [r0]
    ldr r3, [r3]
    ldr r4, [r2]
    add r1, r1, r2
    add r1, r1, r3
    add r1, r1, r4
    ldr r2, [lr]
    ldr r3, [r12]
    add r1, r1, r2
    add r1, r1, r3
    str r1, [r0]

    @NO_APP
    sub sp, r11, #16
    pop {r4, r5, r6, r10, r11, pc}
.Lfunc_end0:

I managed to get rid of most uses of the upper registers, but now
it's using a frame pointer.   I managed to make it worse, at least.  :-)
Quuxplusone commented 7 years ago

From what I can tell, at least, the GCC side has no more idea of what's going on here than we do. We ignore r7 and r11 entirely in the input / output list if a frame pointer is being used, and while turning on -fomit-frame-pointer fixes this, it just allocates space past SP and uses that as a second stack frame instead. Good times.