[ARM] divmod decomposition prevents __aeabi_idivmod #58212

Open easyaspi314 opened 1 year ago

easyaspi314 commented 1 year ago

LLVM will always decompose div + rem on ARM to div + mul + sub when optimizations are enabled.

This makes sense on targets with hardware division, but on targets without it, it breaks the conversion to __aeabi_[u]idivmod:

define void @divmod(i32 %num, i32 %den, ptr %out0) {
  %quo = udiv i32 %num, %den
  %rem = urem i32 %num, %den
  store i32 %quo, ptr %out0, align 4
  %out1 = getelementptr i32, ptr %out0, i32 1
  store i32 %rem, ptr %out1, align 4
  ret void
}
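
For reference, this corresponds to roughly the following C (my reconstruction, not from the original report):

#include <stdint.h>

void divmod(uint32_t num, uint32_t den, uint32_t *out0) {
    out0[0] = num / den;   // %quo = udiv
    out0[1] = num % den;   // %rem = urem
}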

With --target=armv5te-none-eabi -O0 -fomit-frame-pointer:

divmod:
    push    {r11, lr}
    sub     sp, sp, #8
    str     r2, [sp, #4]
    bl      __aeabi_uidivmod
    ldr     r2, [sp, #4]
    str     r0, [r2]
    str     r1, [r2, #4] 
    add     sp, sp, #8 
    pop     {r11, pc}
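
Note that the single call already produces both results: per the ARM RTABI, __aeabi_uidivmod returns the quotient in r0 and the remainder in r1, which is why the -O0 code can simply store both registers:

// Conceptual C-level view (struct and parameter names are mine). The
// RTABI spells the real declaration with ARM's __value_in_regs
// qualifier, because under plain AAPCS an 8-byte struct would
// otherwise be returned through memory:
//
//   typedef struct { unsigned quot; unsigned rem; } uidiv_return;
//   __value_in_regs uidiv_return __aeabi_uidivmod(unsigned num, unsigned den);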

With --target=armv5te-none-eabi -O3:

divmod:
    push    {r4, r5, r6, lr}
    mov     r4, r2
    mov     r5, r1
    mov     r6, r0
    bl      __aeabi_uidiv
    mul     r1, r0, r5
    sub     r1, r6, r1
    stm     r4, {r0, r1}
    pop     {r4, r5, r6, pc}

This is because LLVM "decomposes" the div+rem pair into this:

define void @divmod(i32 %num, i32 %den, ptr nocapture writeonly %out0) local_unnamed_addr #0 {
  %num.frozen = freeze i32 %num
  %den.frozen = freeze i32 %den
  %quo = udiv i32 %num.frozen, %den.frozen
  %1 = mul i32 %quo, %den.frozen
  %rem.decomposed = sub i32 %num.frozen, %1
  store i32 %quo, ptr %out0, align 4
  %out1 = getelementptr i32, ptr %out0, i32 1
  store i32 %rem.decomposed, ptr %out1, align 4
  ret void
}
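
The decomposition comes from the DivRemPairs pass (llvm/lib/Transforms/Scalar/DivRemPairs.cpp), which decides purely on whether TargetTransformInfo::hasDivRemOp() returns true; ARM does not appear to report one. A paraphrased sketch of the relevant logic (not the verbatim source):

// For each matched {div, rem} pair with identical operands:
if (TTI.hasDivRemOp(Ty, IsSigned)) {
  // The target claims a combined divrem operation: hoist div and rem
  // next to each other so instruction selection can merge them.
} else {
  // No divrem op reported: rewrite rem as num - (quo * den). The pass
  // freezes the operands first (hence %num.frozen / %den.frozen above)
  // so the duplicated uses cannot observe different poison values.
  Value *Mul = Builder.CreateMul(Quo, FrozenDen);
  Value *Sub = Builder.CreateSub(FrozenNum, Mul);
  Rem->replaceAllUsesWith(Sub);
}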

If this optimization didn't occur, it would emit something much cleaner, without having to keep copies of r0 and r1 alive across the call:

divmod:
    push    {r4, lr}
    mov     r4, r2
    bl      __aeabi_uidivmod
    stm     r4!, {r0, r1}
    pop     {r4, pc}

Additionally, udiv+urem already generates udiv+mls at -O0 on targets with hardware division, so this optimization pass is detrimental either way unless the division can be strength-reduced to shifts and multiplies (see the sketch below).
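
For example (my illustration), with a constant divisor the division is strength-reduced to multiplies and shifts instead of a libcall, so the mul+sub decomposition of the remainder is cheap and the pass is a clear win:

#include <stdint.h>

// No libcall needed: the udiv by a constant becomes a magic-number
// multiply-high + shift (or a plain shift for powers of two), and the
// decomposed remainder adds just one mul + sub on top of that.
void divmod10(uint32_t num, uint32_t *out) {
    out[0] = num / 10;   // multiply-high + shift, no __aeabi_uidiv call
    out[1] = num % 10;   // num - quotient*10
}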

llvmbot commented 1 year ago

@llvm/issue-subscribers-backend-arm

easyaspi314 commented 1 year ago

This also happens with 64-bit arithmetic, which is ridiculous, because there is only __aeabi_[u]ldivmod (there is no divide-only 64-bit helper), so the decomposition is 100% redundant: the call returns the remainder either way.
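
In C terms (my reconstruction, assuming unsigned operands):

#include <stdint.h>

void divmod64(uint64_t num, uint64_t den, uint64_t *out0) {
    out0[0] = num / den;   // lowers to one __aeabi_uldivmod call...
    out0[1] = num % den;   // ...which already returns the remainder
}

With --target=armv5te-none-eabi -O3 this currently compiles to: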

divmod64:
    push    {r4, r5, r6, r7, r11, lr}
    mov     r4, r3
    mov     r5, r2
    mov     r6, r1
    mov     r7, r0
    bl      __aeabi_uldivmod
    umull   r2, r3, r0, r5
    mla     r3, r0, r4, r3
    ldr     r4, [sp, #24]
    subs    r2, r7, r2
    mla     r3, r1, r5, r3
    sbc     r3, r6, r3
    stm     r4, {r0, r1, r2, r3}
    pop     {r4, r5, r6, r7, r11, pc}

Versus what it could emit:

divmod64:
    push    {r4, lr}
    bl      __aeabi_uldivmod
    ldr     r4, [sp, #8]
    stm     r4, {r0, r1, r2, r3}
    pop     {r4, pc}