riscv-collab / riscv-gnu-toolchain

GNU toolchain for RISC-V, including GCC
Other
3.52k stars 1.16k forks source link

rvv-intrinsic opt issue at -march=rv64gcv0p7 #1106

Closed haolongzhangm closed 2 years ago

haolongzhangm commented 2 years ago
#include "riscv_vector.h"                                                                                                                                                                                     
#include <cstdio>                                                                                                                                                                                             

// Reproducer, variant 1: every vector value is held in its own plain
// vfloat32m1_t variable. Loads the 16 floats of `src` in four groups, adds the
// groups pairwise, stores both sums into `dst` and prints the 8 results.
int main() {

  float src[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};

  vfloat32m1_t t0, t1, t2, t3, ret0, ret1;

  // Four loads starting at src[0], src[4], src[8] and src[12].
  // NOTE(review): the trailing 4 is presumably the element count (vl) --
  // confirm against the toolchain's riscv_vector.h.
  t0 = vle32_v_f32m1(src, 4);
  t1 = vle32_v_f32m1(src + 4, 4);
  t2 = vle32_v_f32m1(src + 8, 4);
  t3 = vle32_v_f32m1(src + 12, 4);

  // Pairwise adds: (t0, t1) and (t2, t3).
  ret0 = vfadd_vv_f32m1(t0, t1, 4);
  ret1 = vfadd_vv_f32m1(t2, t3, 4);

  float dst [8] = {0};

  // First sum goes to dst[0..3], second sum to dst[4..7].
  vse32_v_f32m1(dst, ret0, 4);
  vse32_v_f32m1(dst + 4, ret1, 4);

  // Print all 8 results on one line, separated by spaces.
  for (size_t i = 0; i < 8; i++) {
      printf("%f ", dst[i]);
  }

  printf("\n");

  return 0;
}

   10508:       0087f7d7                vsetvli a5,a5,e32,m1,d1
   1050c:       181c                    addi    a5,sp,48
   1050e:       0207f207                vle.v   v4,(a5)
   10512:       1004                    addi    s1,sp,32
   10514:       009c                    addi    a5,sp,64
   10516:       0207f087                vle.v   v1,(a5)
   1051a:       0204f107                vle.v   v2,(s1)
   1051e:       089c                    addi    a5,sp,80
   10520:       0207f187                vle.v   v3,(a5)
   10524:       02221157                vfadd.vv        v2,v2,v4
   10528:       021190d7                vfadd.vv        v1,v1,v3
   1052c:       e802                    sd      zero,16(sp)
   1052e:       ec02                    sd      zero,24(sp)
   10530:       e002                    sd      zero,0(sp)
   10532:       e402                    sd      zero,8(sp)
   10534:       081c                    addi    a5,sp,16
   10536:       840a                    mv      s0,sp
   10538:       6941                    lui     s2,0x10
   1053a:       02017127                vse.v   v2,(sp)
   1053e:       0207f0a7                vse.v   v1,(a5)
   10542:       00042787                flw     fa5,0(s0) # ffffffffffffd000 <__global_pointer$+0xfffffffffffea800>
   10546:       67090513                addi    a0,s2,1648 # 10670 <__libc_csu_fini+0x4>
   1054a:       420787d3                fcvt.d.s        fa5,fa5
   1054e:       0411                    addi    s0,s0,4
   10550:       e20785d3                fmv.x.d a1,fa5
   10554:       f6dff0ef                jal     ra,104c0 <printf@plt>
   10558:       fe8495e3                bne     s1,s0,10542 <main+0x72>
   1055c:       4529                    li      a0,10
   1055e:       f53ff0ef                jal     ra,104b0 <putchar@plt>
   10562:       70e6                    ld      ra,120(sp)
   10564:       7446                    ld      s0,112(sp)
   10566:       74a6                    ld      s1,104(sp)
   10568:       7906                    ld      s2,96(sp)
   1056a:       4501                    li      a0,0
   1056c:       6109                    addi    sp,sp,128

use vcreate_
// Reproducer, variant 2: same loads, adds, stores and printing as the plain
// vfloat32m1_t variant, but the vectors are kept in tuple types
// (vfloat32m1x4_t / vfloat32m1x2_t) and accessed through vset_*/vget_*.
int main() {

  float src[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};

  //vfloat32m1_t t0, t1, t2, t3, ret0, ret1;

  // Both tuples are declared without an initializer; the vcreate_* form is
  // left commented out by the author.
  // NOTE(review): src4/dst2 are passed to vset_* below before any field has
  // been written -- confirm that the intrinsics allow an uninitialized tuple.
  vfloat32m1x4_t src4;// = vcreate_f32m1x4(t0, t1, t2, t3);
  vfloat32m1x2_t dst2;// = vcreate_f32m1x2(ret0, ret1);

  // Fill fields 0..3 of src4 with loads from src[0], src[4], src[8], src[12].
  src4 = vset_f32m1x4(src4, 0, vle32_v_f32m1(src, 4));
  src4 = vset_f32m1x4(src4, 1, vle32_v_f32m1(src + 4, 4));
  src4 = vset_f32m1x4(src4, 2, vle32_v_f32m1(src + 8, 4));
  src4 = vset_f32m1x4(src4, 3, vle32_v_f32m1(src + 12, 4));

  // dst2 field 0 = src4 field 0 + field 1; dst2 field 1 = field 2 + field 3.
  dst2 = vset_f32m1x2(dst2, 0, vfadd_vv_f32m1(vget_f32m1x4_f32m1(src4, 0), vget_f32m1x4_f32m1(src4, 1), 4));
  dst2 = vset_f32m1x2(dst2, 1, vfadd_vv_f32m1(vget_f32m1x4_f32m1(src4, 2), vget_f32m1x4_f32m1(src4, 3), 4));

  float dst [8] = {0};

  // Field 0 goes to dst[0..3], field 1 to dst[4..7].
  vse32_v_f32m1(dst, vget_f32m1x2_f32m1(dst2, 0), 4);
  vse32_v_f32m1(dst + 4, vget_f32m1x2_f32m1(dst2, 1), 4);

  // Print all 8 results on one line, separated by spaces.
  for (size_t i = 0; i < 8; i++) {
      printf("%f ", dst[i]);
  }

  printf("\n");

  return 0;
}

10508:       0087f7d7                vsetvli a5,a5,e32,m1,d1                                                                                                                                               
   1050c:       1004                    addi    s1,sp,32                                                                                                                                                      
   1050e:       0204f087                vle.v   v1,(s1)                                                                                                                                                       
   10512:       c2002f73                csrr    t5,vl                                                                                                                                                         
   10516:       c2102ff3                csrr    t6,vtype                                                                                                                                                      
   1051a:       00807057                vsetvli zero,zero,e32,m1,d1                                                                                                                                           
   1051e:       5e008257                vmv.v.v v4,v1                                                                                                                                                         
   10522:       81ff7057                vsetvl  zero,t5,t6                                                                                                                                                    
   10526:       32002057                vmv.x.s zero,v0                                                                                                                                                       
   1052a:       181c                    addi    a5,sp,48                                                                                                                                                      
   1052c:       0207f087                vle.v   v1,(a5)                                                                                                                                                       
   10530:       c2002f73                csrr    t5,vl                                                                                                                                                         
   10534:       c2102ff3                csrr    t6,vtype                                                                                                                                                      
   10538:       00807057                vsetvli zero,zero,e32,m1,d1                                                                                                                                           
   1053c:       5e0082d7                vmv.v.v v5,v1                                                                                                                                                         
   10540:       81ff7057                vsetvl  zero,t5,t6                                                                                                                                                    
   10544:       32002057                vmv.x.s zero,v0                                                                                                                                                       
   10548:       009c                    addi    a5,sp,64                                                                                                                                                      
   1054a:       0207f087                vle.v   v1,(a5)                                                                                                                                                       
   1054e:       c2002f73                csrr    t5,vl                                                                                                                                                         
   10552:       c2102ff3                csrr    t6,vtype                                                                                                                                                      
   10556:       00807057                vsetvli zero,zero,e32,m1,d1                                                                                                                                           
   1055a:       5e008357                vmv.v.v v6,v1                                                                                                                                                         
   1055e:       81ff7057                vsetvl  zero,t5,t6                                                                                                                                                    
   10562:       32002057                vmv.x.s zero,v0                                                                                                                                                       
   10566:       089c                    addi    a5,sp,80
   10568:       0207f087                vle.v   v1,(a5)
   1056c:       e802                    sd      zero,16(sp)
   1056e:       c2002f73                csrr    t5,vl
   10572:       c2102ff3                csrr    t6,vtype
   10576:       00807057                vsetvli zero,zero,e32,m1,d1
   1057a:       5e0083d7                vmv.v.v v7,v1
   1057e:       81ff7057                vsetvl  zero,t5,t6
   10582:       32002057                vmv.x.s zero,v0
   10586:       ec02                    sd      zero,24(sp)
   10588:       c2002f73                csrr    t5,vl
   1058c:       c2102ff3                csrr    t6,vtype
   10590:       00807057                vsetvli zero,zero,e32,m1,d1
   10594:       5e020457                vmv.v.v v8,v4
   10598:       81ff7057                vsetvl  zero,t5,t6
   1059c:       32002057                vmv.x.s zero,v0
   105a0:       c2002f73                csrr    t5,vl
   105a4:       c2102ff3                csrr    t6,vtype
   105a8:       00807057                vsetvli zero,zero,e32,m1,d1
   105ac:       5e0284d7                vmv.v.v v9,v5
   105b0:       81ff7057                vsetvl  zero,t5,t6
   105b4:       32002057                vmv.x.s zero,v0
   105b8:       c2002f73                csrr    t5,vl
 105bc:       c2102ff3                csrr    t6,vtype                                                                                                                                                      
   105c0:       00807057                vsetvli zero,zero,e32,m1,d1                                                                                                                                           
   105c4:       5e0300d7                vmv.v.v v1,v6
   105c8:       81ff7057                vsetvl  zero,t5,t6
   105cc:       32002057                vmv.x.s zero,v0
   105d0:       c2002f73                csrr    t5,vl
   105d4:       c2102ff3                csrr    t6,vtype
   105d8:       00807057                vsetvli zero,zero,e32,m1,d1
   105dc:       5e0382d7                vmv.v.v v5,v7
   105e0:       81ff7057                vsetvl  zero,t5,t6
   105e4:       32002057                vmv.x.s zero,v0
   105e8:       081c                    addi    a5,sp,16
   105ea:       840a                    mv      s0,sp
   105ec:       6941                    lui     s2,0x10
   105ee:       02849257                vfadd.vv        v4,v8,v9
   105f2:       021290d7                vfadd.vv        v1,v1,v5
   105f6:       e002                    sd      zero,0(sp)
   105f8:       e402                    sd      zero,8(sp)
   105fa:       c2002f73                csrr    t5,vl
   105fe:       c2102ff3                csrr    t6,vtype
   10602:       00807057                vsetvli zero,zero,e32,m1,d1
   10606:       5e020157                vmv.v.v v2,v4
   1060a:       81ff7057                vsetvl  zero,t5,t6
   1060e:       32002057                vmv.x.s zero,v0
   10612:       c2002f73                csrr    t5,vl
   10616:       c2102ff3                csrr    t6,vtype
   1061a:       00807057                vsetvli zero,zero,e32,m1,d1
   1061e:       5e0081d7                vmv.v.v v3,v1
   10622:       81ff7057                vsetvl  zero,t5,t6
   10626:       32002057                vmv.x.s zero,v0
   1062a:       c2002f73                csrr    t5,vl
   1062e:       c2102ff3                csrr    t6,vtype
   10632:       00807057                vsetvli zero,zero,e32,m1,d1
   10636:       5e010257                vmv.v.v v4,v2
   1063a:       81ff7057                vsetvl  zero,t5,t6
   1063e:       32002057                vmv.x.s zero,v0
   10642:       c2002f73                csrr    t5,vl
   10646:       c2102ff3                csrr    t6,vtype
   1064a:       00807057                vsetvli zero,zero,e32,m1,d1
   1064e:       5e0180d7                vmv.v.v v1,v3
   10652:       81ff7057                vsetvl  zero,t5,t6
   10656:       32002057                vmv.x.s zero,v0
   1065a:       02017227                vse.v   v4,(sp)
   1065e:       0207f0a7                vse.v   v1,(a5)
   10662:       00042787                flw     fa5,0(s0) # ffffffffffffd000 <__global_pointer$+0xfffffffffffea800>
   10666:       79090513                addi    a0,s2,1936 # 10790 <__libc_csu_fini+0x4>
   1066a:       420787d3                fcvt.d.s        fa5,fa5
   1066e:       0411                    addi    s0,s0,4
   10670:       e20785d3                fmv.x.d a1,fa5
   10674:       e4dff0ef                jal     ra,104c0 <printf@plt>
haolongzhangm commented 2 years ago
zhongjuzhe commented 2 years ago

RVV 0.7 is no longer maintained. The latest actively maintained RVV GCC is the branch "rvv-next", which supports the official RVV 1.0 intrinsics and auto-vectorization. The same situation applies to LLVM.

haolongzhangm commented 2 years ago

@zhongjuzhe But there are many boards that only support the v0.7 standard; if we build with 1.0, the program will crash because of unknown instructions. How can this be fixed?

What is more, a lot of software has LTS versions, which means issues need to be fixed on those versions too.

E.g. the latest Ubuntu is now 22.04, but the community also fixes issues found on 20.04.

what is more, linux kernel is the same logic

why riscv-gnu-toolchain do not?

Just asking users to use rvv-next or the latest version is not a good idea.

What is more, the request is not to modify the RVV 0.7 specification; it is to fix an optimization issue in riscv-gnu-toolchain.

cmuellner commented 2 years ago

In general non-upstream branches (incl. the one mentioned for RVV 0.7) are not supported/maintained. The RVV 0.7 specification is not a ratified extension (but we all know it is implemented in available HW). Given the support for the ratified RVV 1.0 extension is not upstream as of now, there is no chance of getting RVV 0.7 upstream. Therefore nobody is looking at this at the moment.

haolongzhangm commented 2 years ago

There is no need to release RVV 0.7; just fix the optimization issue that occurs when the user sets -march=rv64gcv0p7 on the latest release (rvv-next?).

I think this is an optimization-pass issue, not an RVV 0.7 issue.

cmuellner commented 2 years ago

As I said, nobody is working with these branches anymore as everyone is working on branches that target upstream.

haolongzhangm commented 2 years ago

so "target upstream" can not do the fix?

kito-cheng commented 2 years ago

Not sure which toolchain you are using, but currently we are targeting RVV 1.0 only for upstream (the FSF GCC). It is not upstream yet, but the review is in progress.

haolongzhangm commented 2 years ago

@kito-cheng Maybe you can just build the test code with "-march=rv64gcv0p7" and with "-march=rv64gcv" and compare.

kito-cheng commented 2 years ago

@haolongzhangm could you post your gcc version info by riscv64-unknown-elf-gcc --version? that sounds like a T-head toolchain?

haolongzhangm commented 2 years ago

@kito-cheng

riscv64-unknown-linux-gnu-g++ (Xuantie-900 linux-5.10.4 glibc gcc Toolchain V2.2.6 B-20220516) 10.2.0 Copyright (C) 2020 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

kito-cheng commented 2 years ago

Hmmm, I think you should report this to the T-Head folks rather than here, @rjiejie

haolongzhangm commented 2 years ago

@kito-cheng Thanks for your suggestion.

@rjiejie Do I need to create a new issue, or can we just follow up on this one?

rjiejie commented 2 years ago

> @kito-cheng so thanks for you suggest
>
> @rjiejie need i create a new issue or not ? or just follow this one

Following up here is fine too. We have done some optimization for this and will ship it in version v2.6.1; you can download the toolchain from https://occ.t-head.cn/community/download?id=4073475960903634944 once it is released :)

haolongzhangm commented 2 years ago

@rjiejie I tested Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.0-20220715.tar.gz, and the issue is even more severe.

You can test with the example code at the top of this issue.

haolongzhangm commented 2 years ago

So is there any baseline test code for RVV API performance testing?

haolongzhangm commented 2 years ago

What's more, does Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.0-20220715.tar.gz raise its libc dependency to GLIBC_2.33?

malaterre commented 2 years ago

> The latest actively maintained RVV GCC is the branch "rvv-next" which support the official RVV1.0 intrinsics and auto-vectorization.

For some reason I can find the old rvv-0.7x branch but none seems to match "rvv-next":

Could someone please point me to the latest branch (the one in the process of being merged into upstream GCC)?

Thanks

zhongjuzhe commented 2 years ago

You should pull the latest riscv-gnu-toolchain; there is no rvv-0.7x branch any more. The only RVV branch is called "rvv-next" in riscv-gnu-toolchain. You should also pull the latest "riscv-gcc-rvv-next" branch in riscv-gcc.

The rvv-next branch supports the full RVV 1.0 feature set, and it is going to be merged into upstream GCC. I am working on splitting rvv-next into small patches for upstream GCC so that reviewers can review them easily. So far, part of the rvv-next code has already been merged into upstream GCC, but for full support you still need to use rvv-next.

TommyMurphyTM1234 commented 2 years ago

Questions relating to the non-standard, pre-ratification v0.7 vector extension used/implemented in some RISC-V cores (see here for example: https://github.com/riscv/riscv-v-spec/issues/667), and how to build the latest rvv-next branch, have been answered.