llvm / llvm-project

The LLVM Project is a collection of modular and reusable compiler and toolchain technologies.
http://llvm.org

Suboptimal code generation in arm64 pointer arithmetic #53877

Open uncleasm opened 2 years ago

uncleasm commented 2 years ago

The problem is probably only seen on arm64, where the opportunity to use post-index addressing instructions for pointer access is missed.

Given

#include <cstdint>
using v4 = float __attribute__((vector_size(16)));
v4* sum(v4 *src, v4 &dst, int64_t stride_xm, int64_t stride_xp) {
    auto a = *reinterpret_cast<v4 *>(reinterpret_cast<char *>(src) + stride_xm);
    auto c = *reinterpret_cast<v4 *>(reinterpret_cast<char *>(src) + stride_xp);
    auto b = *src++;
    auto d = *src++;
    dst = (a + c) + (b + d);
    return src;
}

we see good code with the post-index addressing form ldp q2, q3, [x0], #32:

sum(float __vector(4)*, float __vector(4)&, long, long):                      // @sum(float __vector(4)*, float __vector(4)&, long, long)
        ldr     q0, [x0, x2]
        ldr     q1, [x0, x3]
        ldp     q2, q3, [x0], #32
        fadd    v0.4s, v0.4s, v1.4s
        fadd    v1.4s, v2.4s, v3.4s
        fadd    v0.4s, v0.4s, v1.4s
        str     q0, [x1]
        ret

However, when this fragment is used for, say, three separate rows

void convolution(v4 *src0, v4 *src1, v4 *src2, int64_t stride_xm, int64_t stride_xp, v4 *dst, int w) {
    do {
        v4 a,b,c;
        src0 = sum(src0, a, stride_xm, stride_xp);
        src1 = sum(src1, b, stride_xm, stride_xp);
        src2 = sum(src2, c, stride_xm, stride_xp);
        *dst++ = (a+b+c);
    } while (--w);
}

the code generator has chosen to allocate nine registers to hold all of these pointers:

convolution(float __vector(4)*, float __vector(4)*, float __vector(4)*, long, long, float __vector(4)*, int):      // @convolution(float __vector(4)*, float __vector(4)*, float __vector(4)*, long, long, float __vector(4)*, int)
        mov     x8, xzr  // avoidable
        add     x9, x2, x4   // avoidable
        add     x10, x2, x3  // avoidable
        add     x11, x1, x4  // avoidable
        add     x12, x1, x3  // avoidable
        add     x13, x0, x3  // avoidable
        add     x14, x0, x4  // avoidable
.LBB2_1:                                // =>This Inner Loop Header: Depth=1
        ldr     q0, [x13, x8]
        add     x16, x1, x8  // avoidable
        ldr     q1, [x14, x8]
        add     x15, x0, x8  // avoidable
        ldr     q4, [x12, x8]
        subs    w6, w6, #1
        ldr     q5, [x11, x8]
        ldp     q2, q3, [x15]
        fadd    v0.4s, v0.4s, v1.4s
        add     x15, x2, x8  // avoidable
        fadd    v4.4s, v4.4s, v5.4s
        ldp     q6, q1, [x16]
        fadd    v2.4s, v2.4s, v3.4s
        fadd    v0.4s, v0.4s, v2.4s
        fadd    v1.4s, v6.4s, v1.4s
        ldr     q3, [x10, x8]
        ldp     q5, q6, [x15]
        fadd    v1.4s, v4.4s, v1.4s
        ldr     q7, [x9, x8]
        add     x8, x8, #32  // avoidable
        fadd    v0.4s, v0.4s, v1.4s
        fadd    v2.4s, v3.4s, v7.4s
        fadd    v3.4s, v5.4s, v6.4s
        fadd    v2.4s, v2.4s, v3.4s
        fadd    v0.4s, v0.4s, v2.4s
        str     q0, [x5], #16
        b.ne    .LBB2_1
        ret

There are some 11 avoidable instructions, which could be eliminated by using the register-offset form ldr vector_reg, [base_reg, offset_reg] and the post-index form ldp vec0, vec1, [base_reg], #32 instead.
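
For comparison, here is a hand-written sketch of the inner loop using only the addressing forms described above. This is not compiler output, and the register assignments are illustrative only; it just shows that the three moving base pointers plus the two stride registers are enough:

.LBB2_1:
        ldr     q0, [x0, x3]            // row 0: src0 + stride_xm
        ldr     q1, [x0, x4]            // row 0: src0 + stride_xp
        ldp     q2, q3, [x0], #32       // row 0: *src0++, *src0++ (post-index)
        fadd    v0.4s, v0.4s, v1.4s
        fadd    v2.4s, v2.4s, v3.4s
        fadd    v0.4s, v0.4s, v2.4s     // row 0 sum
        ldr     q1, [x1, x3]            // row 1
        ldr     q4, [x1, x4]
        ldp     q5, q6, [x1], #32
        fadd    v1.4s, v1.4s, v4.4s
        fadd    v5.4s, v5.4s, v6.4s
        fadd    v1.4s, v1.4s, v5.4s     // row 1 sum
        ldr     q4, [x2, x3]            // row 2
        ldr     q7, [x2, x4]
        ldp     q16, q17, [x2], #32
        fadd    v4.4s, v4.4s, v7.4s
        fadd    v16.4s, v16.4s, v17.4s
        fadd    v4.4s, v4.4s, v16.4s    // row 2 sum
        fadd    v0.4s, v0.4s, v1.4s
        fadd    v0.4s, v0.4s, v4.4s
        subs    w6, w6, #1
        str     q0, [x5], #16           // post-index store to dst
        b.ne    .LBB2_1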

llvmbot commented 2 years ago

@llvm/issue-subscribers-backend-aarch64

bcl5980 commented 2 years ago

We can disable LSR to work around this issue, but I'm not sure how to fix it properly. https://godbolt.org/z/szcMK9sqh
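
For reference, LSR can be disabled from the clang driver with the hidden backend flag -mllvm -disable-lsr (this is an assumption about what the Godbolt link above does; the flag itself is defined in TargetPassConfig).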

bcl5980 commented 2 years ago

It looks like ldp/stp don't support a scaled (register-offset) addressing mode, so we need to add some code so that AArch64 does not report a scaled addressing mode as legal for ldp/stp candidates. I will try to add a patch to fix it later. https://reviews.llvm.org/D124014
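
For context, a quick reference of the relevant AArch64 addressing forms (the legal forms below are the ones already used in the listings above):

        ldr     q0, [x0, x2]          // LDR with register offset: legal
        ldr     q0, [x0, #32]         // LDR with immediate offset: legal
        ldp     q0, q1, [x0, #32]     // LDP with scaled immediate offset: legal
        ldp     q0, q1, [x0], #32     // LDP post-index: legal
        // ldp  q0, q1, [x0, x2]      // LDP with register offset: no such encoding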

Update: I abandoned the patch as it was not correct. For now I have no further ideas on how to fix this issue.