RISC-V generates worse code than AArch64 for simple memset style loop at -Os

hiraditya commented 1 year ago

Derived from: https://github.com/llvm/llvm-project/issues/66652

#include<stdint.h>
#include<stddef.h>
void fill_i16(int16_t* a, int16_t v, size_t l) {
  for (size_t i = 0; i < l; i++) a[i] = v;
}

riscv-clang -Os -march=rv64gcv_zba_zbb_zbs

fill_i16:                               # @fill_i16
        beqz    a2, .LBB0_5
        not     a4, a2
        csrr    a7, vlenb
        bgeu    a4, a7, .LBB0_3
.LBB0_2:                                # =>This Inner Loop Header: Depth=1
        sh      a1, 0(a0)
        addi    a2, a2, -1
        addi    a0, a0, 2
        bnez    a2, .LBB0_2
        j       .LBB0_5
.LBB0_3:
        li      a4, 0
        srli    a6, a7, 3
        neg     a5, a7
        add     a3, a7, a2
        addi    a3, a3, -1
        and     a5, a5, a3
        vsetvli a3, zero, e16, m2, ta, ma
        vmv.v.x v8, a1
        slli    a1, a6, 4
        vsetvli zero, zero, e64, m8, ta, ma
        vid.v   v16
.LBB0_4:                                # =>This Inner Loop Header: Depth=1
        vsaddu.vx       v24, v16, a4
        vmsltu.vx       v0, v24, a2
        vse16.v v8, (a0), v0.t
        add     a4, a4, a7
        add     a0, a0, a1
        bne     a5, a4, .LBB0_4
.LBB0_5:
        ret

arm-clang -Os -march=armv8-a+sve

fill_i16:                               // @fill_i16
        cbz     x2, .LBB0_3
        cnth    x8
        mov     z0.h, w1
        mov     x10, xzr
        subs    x9, x2, x8
        csel    x9, xzr, x9, lo
        whilelo p0.h, xzr, x2
.LBB0_2:                                // =>This Inner Loop Header: Depth=1
        st1h    { z0.h }, p0, [x0, x10, lsl #1]
        whilelo p0.h, x10, x9
        add     x10, x10, x8
        b.mi    .LBB0_2
.LBB0_3:
        ret

hiraditya commented 1 year ago

cc: @alexey-bataev @nikolaypanchenko

llvmbot commented 1 year ago

@llvm/issue-subscribers-backend-risc-v

Derived from: https://github.com/llvm/llvm-project/issues/66652 ```cpp #include<stdint.h> #include<stddef.h> void fill_i16(int16_t* a, int16_t v, size_t l) { for (size_t i = 0; i < l; i++) a[i] = v; } ``` riscv-clang -Os -march=rv64gcv_zba_zbb_zbs ```asm fill_i16: # @fill_i16 beqz a2, .LBB0_5 not a4, a2 csrr a7, vlenb bgeu a4, a7, .LBB0_3 .LBB0_2: # =>This Inner Loop Header: Depth=1 sh a1, 0(a0) addi a2, a2, -1 addi a0, a0, 2 bnez a2, .LBB0_2 j .LBB0_5 .LBB0_3: li a4, 0 srli a6, a7, 3 neg a5, a7 add a3, a7, a2 addi a3, a3, -1 and a5, a5, a3 vsetvli a3, zero, e16, m2, ta, ma vmv.v.x v8, a1 slli a1, a6, 4 vsetvli zero, zero, e64, m8, ta, ma vid.v v16 .LBB0_4: # =>This Inner Loop Header: Depth=1 vsaddu.vx v24, v16, a4 vmsltu.vx v0, v24, a2 vse16.v v8, (a0), v0.t add a4, a4, a7 add a0, a0, a1 bne a5, a4, .LBB0_4 .LBB0_5: ret ``` arm-clang -Os -march=armv8-a+sve ```asm fill_i16: // @fill_i16 cbz x2, .LBB0_3 cnth x8 mov z0.h, w1 mov x10, xzr subs x9, x2, x8 csel x9, xzr, x9, lo whilelo p0.h, xzr, x2 .LBB0_2: // =>This Inner Loop Header: Depth=1 st1h { z0.h }, p0, [x0, x10, lsl #1] whilelo p0.h, x10, x9 add x10, x10, x8 b.mi .LBB0_2 .LBB0_3: ret ```

dzaima commented 1 year ago

For reference, here's a version of the function in question, utilizing VL: (written using intrinsics, compiler explorer)

fill_i16:                               # @fill_i16
        vsetvli a3, zero, e16, m2, ta, ma
        beqz    a2, .LBB0_3
        vmv.v.x v8, a1
.LBB0_2:                                # =>This Inner Loop Header: Depth=1
        vsetvli a1, a2, e16, m2, ta, ma
        vse16.v v8, (a0)
        sub     a2, a2, a1
        sh1add  a0, a1, a0
        bnez    a2, .LBB0_2
.LBB0_3:
        ret

Besides the autovectorizer not knowing how to utilize VL (it just does masking; I belive https://reviews.llvm.org/D99750 is work towards VL usage?), it also has a scalar path even though the vectorized path can handle any length input by itself.

edit: perhaps this is a slightly better way to do this, making the initial broadcast not needlessly always be VL=VLMAX, but just what's needed for the first/all, iterations. But this adds a dependency on the length, probably slightly increasing latency; tradeoffs.

nikolaypanchenko commented 1 year ago

@alexey-bataev 's patch (https://reviews.llvm.org/D99750) should be sufficient as is for this code, however loops that have arithmetic operations compiler will emit lots of vsetvlis. Follow up patches will help to clean them up and eventually we should be able to generate this code:

# %bb.1:                                # %vector.ph                                                                                                                                                                                                                                                                                                                                                                                                                                                   
  li  a3, 0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
  vsetvli a4, zero, e16, m1, ta, ma                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
  vmv.v.x v8, a1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
.LBB0_2:                                # %vector.body                                                                                                                                                                                                                                                                                                                                                                                                                                                 
                                        # =>This Inner Loop Header: Depth=1                                                                                                                                                                                                                                                                                                                                                                                                                            
  sub a1, a2, a3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
  vsetvli a1, a1, e16, m1, ta, ma                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
  sh1add  a4, a3, a0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
  add a3, a3, a1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
  vse16.v v8, (a4)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
  bne a3, a2, .LBB0_2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  
.LBB0_3:                                # %for.cond.cleanup                                                                                                                                                                                                                                                                                                                                                                                                                                            
  ret

(code is generated by our compiler with no remainder loop)

asb commented 4 months ago

I think the memset_pattern line of work should provide a path to generating simple code for this type of loop.

hiraditya commented 4 months ago

I think the memset_pattern line of work should provide a path to generating simple code for this type of loop.

Yeah, that should fix this issue. I believe the patch is here: #97583 ?

hiraditya commented 4 months ago

Cc: @topperc FYI.

llvm / llvm-project

RISC-V generates worse code than AArch64 for simple memset style loop at -Os #67595