riscv-non-isa / riscv-toolchain-conventions

Documenting the expected behaviour and supported command-line switches for GNU and LLVM based RISC-V toolchains
https://jira.riscv.org/browse/RVG-4
Creative Commons Attribution 4.0 International
144 stars 34 forks source link

Is it possible that we can add -mprefer-agnostic compile option for VSETVL optimization of RVV? #37

Open zhongjuzhe opened 1 year ago

zhongjuzhe commented 1 year ago

Consider this following case:

https://godbolt.org/z/oTWvrsGhE

Code:

void f (void * restrict in, void * restrict out, int n, int cond)
{
  size_t vl = 101;
  for (size_t i = 0; i < n; i++)
    {
      vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
      __riscv_vse8_v_i8mf8 (out + i, v, vl);
    }

  for (size_t i = 0; i < n; i++)
    {
      vuint8mf8_t index = __riscv_vle8_v_u8mf8 (in + i + 300, vl);
      vfloat32mf2_t v = __riscv_vle32_v_f32mf2 (in + i + 600, vl);
      v = __riscv_vle32_v_f32mf2_tu (v, in + i + 800, vl);
      __riscv_vsoxei8_v_f32mf2 (out + i + 200, index, v, vl);
    }
}

GCC by default enables VTYPE && POLICY fusion of vsetvli as long as they are compatible:

f:
        beq     a2,zero,.L1
        li      a3,101
        mv      a4,a1
        add     a6,a0,a2
        mv      a5,a0
        vsetvli zero,a3,e32,mf2,tu,ma
.L3:
        vle8.v  v1,0(a5)
        addi    a5,a5,1
        vse8.v  v1,0(a4)
        addi    a4,a4,1
        bne     a5,a6,.L3
        addi    a0,a0,300
        addi    a1,a1,200
        add     a2,a0,a2
.L5:
        addi    a4,a0,300
        vle32.v v1,0(a4)
        addi    a4,a0,500
        vle8.v  v2,0(a0)
        vle32.v v1,0(a4)
        addi    a0,a0,1
        vsoxei8.v       v1,(a1),v2
        addi    a1,a1,1
        bne     a2,a0,.L5
.L1:
        ret

I believe most of the cases, that GCC codegen is better.

However, for some vendor RVV CPUs which have vector register renaming && special vsetvli optimization (vsetvli execution latency consumes almost 0 cycles most of the time), I believe the following codegen is better:

f:
        beq     a2,zero,.L1
        li      a3,101
        mv      a4,a1
        add     a6,a0,a2
        mv      a5,a0
        vsetvli zero,a3,e32,mf2,tu,ma
.L3:
        vle8.v  v1,0(a5)
        addi    a5,a5,1
        vse8.v  v1,0(a4)
        addi    a4,a4,1
        bne     a5,a6,.L3
        addi    a0,a0,300
        addi    a1,a1,200
        add     a2,a0,a2
.L5:
       vsetvli zero, a3, e8, mf8, ta, ma
        addi    a4,a0,300
        vle32.v v1,0(a4)
        addi    a4,a0,500
        vle8.v  v2,0(a0)
       vsetvli zero, zero, e32, mf2, tu, ma
        vle32.v v1,0(a4)
        addi    a0,a0,1
        vsoxei8.v       v1,(a1),v2
        addi    a1,a1,1
        bne     a2,a0,.L5
.L1:
        ret

I think fusing VTYPE is always optimal, for example: https://godbolt.org/z/dfx93jzrv

code:

void f (void * restrict in, void * restrict out, int n, int cond)
{
  size_t vl = 101;
  for (size_t i = 0; i < n; i++)
    {
      vint8mf8_t v = __riscv_vle8_v_i8mf8 (in + i, vl);
      __riscv_vse8_v_i8mf8 (out + i, v, vl);
    }

  for (size_t i = 0; i < n; i++)
    {
      vuint8mf8_t index = __riscv_vle8_v_u8mf8 (in + i + 300, vl);
      vfloat32mf2_t v = __riscv_vle32_v_f32mf2 (in + i + 600, vl);
      __riscv_vsoxei8_v_f32mf2 (out + i + 200, index, v, vl);
    }
}

optimal codegen:

f:
        beq     a2,zero,.L1
        li      a3,101
        mv      a4,a1
        add     a6,a2,a0
        mv      a5,a0
        vsetvli zero,a3,e32,mf2,ta,ma
.L3:
        vle8.v  v1,0(a5)
        addi    a5,a5,1
        vse8.v  v1,0(a4)
        addi    a4,a4,1
        bne     a5,a6,.L3
        addi    a0,a0,300
        addi    a1,a1,200
        add     a2,a0,a2
.L5:
        addi    a4,a0,300
        vle8.v  v1,0(a0)
        vle32.v v2,0(a4)
        addi    a0,a0,1
        vsoxei8.v       v2,(a1),v1
        addi    a1,a1,1
        bne     a0,a2,.L5
.L1:
        ret

However, policy fusion is not always optimal. Is it reasonable to add such a compile option (-mprefer-agnostic) to disable tail policy && mask policy fusion in vsetvli?

Thanks

kito-cheng commented 1 year ago

That's highly dependent on the uarch, so I would prefer to just tie it to -mtune like other cost models in GCC, but I think it's harmless to add it in GCC first to see if it's useful, then implement it in LLVM, and then document that option here.

Personally I would prefer not to document those optimization options in this repo, since those flags are compiler-dependent, and to document only the necessary common interfaces here, like -march, -mabi and -mcmodel.

JeffreyALaw commented 1 year ago

Agreed. This is going to be dependent on multiple features of the uarch.

So I think the question is whether or not any such implementations exist or will exist in the near future. If not, then let's not complicate things right now. If it looks like such architectures are on the horizon, then we might as well be prepared for them.

I don't think this will affect Veyron V2.