fix vector mode support problem when V/P coexist

The current implementation cannot use RVP instructions because vector-mode data is not allowed to be placed in GPRs; see https://github.com/pz9115/riscv-gcc/blob/riscv-gcc-experimental-v/gcc/config/riscv/riscv.c#L4986-#L4997 for more info. (This PR also includes an RVP fix from commit 8a51f69b54fab099c5cd1f8e54ded052f0c79a55.)

Here is a test case to reproduce the error.

test_rvp.c

#include <rvp_intrinsic.h>
#include <stdlib.h>
#include <stdint.h>

/* 8-byte GCC vector types of signed / unsigned short elements
   (four lanes when short is 16 bits, as the "16x4" names suggest).  */
typedef short int16x4_t __attribute__((vector_size (8)));
typedef unsigned short uint16x4_t __attribute__((vector_size (8)));

/* Pass both 32-bit scalar operands to the __rv__add16 intrinsic and return
   its result.  Marked noinline so the call stays a separate function.
   NOTE(review): __rv__add16 is presumably the RVP packed 16-bit add from
   <rvp_intrinsic.h> -- confirm against the intrinsic header.  */
static __attribute__ ((noinline))
uint32_t dda (uint32_t ra, uint32_t rb)
{
  return __rv__add16 (ra, rb);
}

/* Pass two uint16x4_t vector operands to the __rv__v_uadd16 intrinsic and
   return its uint16x4_t result.  Marked noinline so the call stays a
   separate function.
   NOTE(review): presumably the unsigned, vector-typed form of the RVP
   16-bit add -- confirm against <rvp_intrinsic.h>.  */
static __attribute__ ((noinline))
uint16x4_t ddau_v (uint16x4_t ra, uint16x4_t rb)
{
  return __rv__v_uadd16 (ra, rb);
}

/* Pass two int16x4_t vector operands to the __rv__v_sadd16 intrinsic and
   return its int16x4_t result.  Marked noinline so the call stays a
   separate function.
   NOTE(review): presumably the signed, vector-typed form of the RVP
   16-bit add -- confirm against <rvp_intrinsic.h>.  */
static __attribute__ ((noinline))
int16x4_t ddas_v (int16x4_t ra, int16x4_t rb)
{
  return __rv__v_sadd16 (ra, rb);
}

This patch fixes the problem and allows the RVP and RVV intrinsics to coexist and be used together. The RVV test cases I used (build config: --with-arch=rv64gcv_zpn_zpsf --with-abi=lp64d) are from https://github.com/riscv-non-isa/rvv-intrinsic-doc/tree/master/examples:

common.h

#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Fill a[0..n-1] with pseudo-random doubles taken from rand().
//
// Each element is a fraction in [0, 1] (one rand() draw divided by RAND_MAX)
// plus a whole number in [0, 999] (the next draw modulo 1000).  The two draws
// are made in separate statements: inside a single expression the order of
// the two rand() calls is unspecified, so the generated data could differ
// between compilers for the same seed.
// Does nothing when n <= 0.
void gen_rand_1d(double *a, int n) {
  for (int i = 0; i < n; ++i) {
    const int frac_draw = rand();   // first draw -> fractional part
    const int whole_draw = rand();  // second draw -> integer part
    a[i] = (double)frac_draw / (double)RAND_MAX + (double)(whole_draw % 1000);
  }
}

// Write a random NUL-terminated string of length n - 1 into s[0..n-1].
//
// s must have room for n chars.  When n <= 0 there is no room even for the
// terminator, so the buffer is left untouched (previously s[n - 1] was
// written, which is out of bounds).
void gen_string(char *s, int n) {
  if (n <= 0)
    return;
  // rand() % 127 is 0..126, so every generated char is in 1..127 and can
  // never be the terminator.
  for (int i = 0; i < n - 1; ++i)
    s[i] = (char)(rand() % 127) + 1;
  s[n - 1] = '\0';
}

// Fill the n x m matrix ar (an array of n row pointers, each row holding m
// doubles) with pseudo-random values taken from rand(), row by row.
//
// Each element is a fraction in [0, 1] (one rand() draw divided by RAND_MAX)
// plus a whole number in [0, 999] (the next draw modulo 1000).  The two draws
// are made in separate statements: inside a single expression the order of
// the two rand() calls is unspecified, so the generated data could differ
// between compilers for the same seed.
void gen_rand_2d(double **ar, int n, int m) {
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < m; ++j) {
      const int frac_draw = rand();   // first draw -> fractional part
      const int whole_draw = rand();  // second draw -> integer part
      ar[i][j] =
          (double)frac_draw / (double)RAND_MAX + (double)(whole_draw % 1000);
    }
  }
}

// Print the string a to stdout as a C declaration of the form
//   const char *<name> = "<a>";
// followed by an empty line.  The characters of a are emitted verbatim
// (no escaping), one at a time up to the terminating NUL.
void print_string(const char *a, const char *name) {
  printf("const char *%s = \"", name);
  for (const char *cursor = a; *cursor != '\0'; ++cursor)
    putchar(*cursor);
  printf("\"\n");
  puts("");
}

// Print a[0..n-1] to stdout as a C array initializer:
//   <type> <name>[<n>] = {
// followed by the values formatted with "%06.2f", comma separated, with a
// line break after every tenth value.  The closing "};" is emitted together
// with the final element, so for n <= 0 only the header and a blank line
// are printed.
void print_array_1d(double *a, int n, const char *type, const char *name) {
  printf("%s %s[%d] = {\n", type, name, n);
  for (int idx = 0; idx < n; ++idx) {
    const char *suffix;
    if (idx == n - 1)
      suffix = "};\n";
    else
      suffix = ",";
    printf("%06.2f%s", a[idx], suffix);
    // Break the line after the 10th, 20th, ... value.
    if (idx % 10 == 9)
      puts("");
  }
  puts("");
}

// Print the n x m matrix a (array of row pointers) to stdout as a C
// initializer headed by
//   <type> <name>[<n>][<m>] = {
// Values use "%06.2f" and are comma separated; each row ends with a line
// break, and the final row ends with "};".  A blank line follows.
void print_array_2d(double **a, int n, int m, const char *type,
                    const char *name) {
  printf("%s %s[%d][%d] = {\n", type, name, n, m);
  for (int row = 0; row < n; ++row) {
    for (int col = 0; col < m; ++col) {
      printf("%06.2f", a[row][col]);
      if (col != m - 1) {
        putchar(',');
        continue;
      }
      // End of a row: close the initializer after the last row, otherwise
      // separate the rows with a comma and a newline.
      if (row == n - 1)
        puts("};");
      else
        puts(",");
    }
  }
  puts("");
}

// Return true when actual is within relErr of golden.
// Note: despite the parameter name, the tolerance is applied to the absolute
// difference |actual - golden|, and the comparison is strict (<).
bool double_eq(double golden, double actual, double relErr) {
  const double deviation = fabs(actual - golden);
  return deviation < relErr;
}

// Return true when every pair golden[i], actual[i] for i in [0, n) passes
// double_eq with a tolerance of 1e-6; stop at the first mismatch.
// Returns true when n <= 0.
bool compare_1d(double *golden, double *actual, int n) {
  int idx = 0;
  while (idx < n) {
    if (!double_eq(golden[idx], actual[idx], 1e-6))
      return false;
    ++idx;
  }
  return true;
}

// Return true when the first n chars of golden and actual are identical.
// Exactly n chars are compared; a NUL does not end the comparison early.
// Returns true when n <= 0.
bool compare_string(const char *golden, const char *actual, int n) {
  for (int remaining = n; remaining > 0; --remaining) {
    if (*golden++ != *actual++)
      return false;
  }
  return true;
}

// Return true when every element pair of the n x m matrices golden and
// actual (arrays of row pointers) passes double_eq with a tolerance of 1e-6.
// Elements are checked row by row; the first mismatch returns false.
bool compare_2d(double **golden, double **actual, int n, int m) {
  for (int row = 0; row < n; ++row) {
    for (int col = 0; col < m; ++col) {
      const bool close = double_eq(golden[row][col], actual[row][col], 1e-6);
      if (!close)
        return false;
    }
  }
  return true;
}

// Allocate an n x m matrix of doubles as an array of n row pointers, each
// pointing at its own block of m doubles.  The contents are uninitialized.
//
// Returns NULL when n or m is negative or when any allocation fails; on a
// partial failure every row allocated so far is released first.  The caller
// owns the result and must free() each row and then the row-pointer array.
double **alloc_array_2d(int n, int m) {
  if (n < 0 || m < 0)
    return NULL;

  double **rows = malloc(sizeof *rows * (size_t)n);
  if (rows == NULL)
    return NULL;

  for (int i = 0; i < n; ++i) {
    rows[i] = malloc(sizeof **rows * (size_t)m);
    // malloc(0) may legitimately return NULL, so only a NULL for a
    // non-empty row counts as a failure.
    if (rows[i] == NULL && m > 0) {
      while (i > 0)
        free(rows[--i]);
      free(rows);
      return NULL;
    }
  }
  return rows;
}

// Set ar[0..n-1] to 1.  Does nothing when n <= 0.
void init_array_one_1d(double *ar, int n) {
  for (int remaining = n; remaining > 0; --remaining)
    *ar++ = 1;
}

// Set every element of the n x m matrix ar (array of row pointers) to 1.
void init_array_one_2d(double **ar, int n, int m) {
  for (int row = 0; row < n; ++row) {
    int col = 0;
    while (col < m) {
      ar[row][col] = 1;
      ++col;
    }
  }
}

test_rvv.c

#include "common.h"
#include <riscv_vector.h>

// index arithmetic
// Scalar reference implementation: for each i in [0, n),
//   a[i] = b[i] + i * c[i]
// with the index converted to double before the multiplication.
void index_golden(double *a, double *b, double *c, int n) {
  int i = 0;
  while (i < n) {
    a[i] = b[i] + (double)i * c[i];
    ++i;
  }
}

// RVV-intrinsic counterpart of index_golden: stores b[i] + i * c[i] into a
// for n elements, processed in strips of vl elements per loop iteration.
// NOTE(review): the descriptions of the individual intrinsics below follow
// their names in the RVV intrinsic API; confirm against <riscv_vector.h>.
void index_(double *a, double *b, double *c, int n) {
  // vec_i holds the element indices as 32-bit unsigned values; it is
  // created once with the e32/m1 maximum vector length.
  size_t vlmax = vsetvlmax_e32m1();
  vuint32m1_t vec_i = vid_v_u32m1(vlmax);
  // Each pass handles vl elements, then advances all three pointers and
  // reduces the remaining count n by vl.
  for (size_t vl; n > 0; n -= vl, a += vl, b += vl, c += vl) {
    // Ask for up to n elements at e64/m2; vl is what this pass will handle.
    vl = vsetvl_e64m2(n);

    // Indices as doubles (u32m1 -> f64m2, presumably a widening convert).
    vfloat64m2_t vec_i_double = vfwcvt_f_xu_v_f64m2(vec_i, vl);

    // Load the current strip of b and c.
    vfloat64m2_t vec_b = vle64_v_f64m2(b, vl);
    vfloat64m2_t vec_c = vle64_v_f64m2(c, vl);

    // a = b + c * i for this strip, then store it.
    vfloat64m2_t vec_a =
        vfadd_vv_f64m2(vec_b, vfmul_vv_f64m2(vec_c, vec_i_double, vl), vl);
    vse64_v_f64m2(a, vec_a, vl);

    // Move the indices forward by the number of elements just processed.
    vec_i = vadd_vx_u32m1(vec_i, vl, vl);
  }
}

// Test driver: seed rand(), generate two random length-N input vectors, run
// the scalar reference and the vector implementation on them, and print
// "pass" when the two results agree under compare_1d, "fail" otherwise.
int main() {
  const int N = 31;
  const uint32_t seed = 0xdeadbeef;
  double addend[N], scale[N];
  double expected[N], observed[N];

  // Inputs are drawn in this order so the rand() sequence stays the same.
  srand(seed);
  gen_rand_1d(addend, N);
  gen_rand_1d(scale, N);

  // Reference result first, then the implementation under test.
  index_golden(expected, addend, scale, N);
  index_(observed, addend, scale, N);

  if (compare_1d(expected, observed, N))
    puts("pass");
  else
    puts("fail");
}

test_rvv2.c

#include "common.h"
#include <riscv_vector.h>

// matrix multiplication
// A[n][o], B[m][o] --> C[n][m];
// Scalar reference implementation.  a is n x o, b is m x o and c is n x m
// (all arrays of row pointers); each output element is the dot product of a
// row of a with a row of b:
//   c[i][j] = sum over k in [0, o) of a[i][k] * b[j][k]
// The sum is accumulated directly in c[i][j], starting from 0.
void matmul_golden(double **a, double **b, double **c, int n, int m, int o) {
  for (int row = 0; row < n; ++row) {
    for (int col = 0; col < m; ++col) {
      double *cell = &c[row][col];
      *cell = 0;
      int k = 0;
      while (k < o) {
        *cell += a[row][k] * b[col][k];
        ++k;
      }
    }
  }
}

// RVV-intrinsic counterpart of matmul_golden: for every (i, j) it stores the
// dot product of row i of a and row j of b (both of length o) into c[i][j].
// NOTE(review): the descriptions of the individual intrinsics below follow
// their names in the RVV intrinsic API; confirm against <riscv_vector.h>.
void matmul(double **a, double **b, double **c, int n, int m, int o) {
  // Maximum vector length for e64/m1; used for the accumulator setup and
  // for the final reduction.
  size_t vlmax = vsetvlmax_e64m1();
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < m; ++j) {
      double *ptr_a = &a[i][0];
      double *ptr_b = &b[j][0];
      int k = o;  // elements of the two rows still to be processed
      // vec_s accumulates the per-lane partial sums; vec_zero is an
      // all-zero vector passed to the reduction below.
      vfloat64m1_t vec_s = vfmv_v_f_f64m1(0, vlmax);
      vfloat64m1_t vec_zero = vfmv_v_f_f64m1(0, vlmax);
      // Strip-mine the two rows: each pass handles vl elements, then
      // advances both pointers and reduces k by vl.
      for (size_t vl; k > 0; k -= vl, ptr_a += vl, ptr_b += vl) {
        vl = vsetvl_e64m1(k);

        vfloat64m1_t vec_a = vle64_v_f64m1(ptr_a, vl);
        vfloat64m1_t vec_b = vle64_v_f64m1(ptr_b, vl);

        // vec_s += vec_a * vec_b on the first vl lanes.
        // NOTE(review): the reduction below reads all vlmax lanes of vec_s,
        // so this presumably relies on lanes past vl keeping their previous
        // values -- confirm the tail policy of this intrinsic.
        vec_s = vfmacc_vv_f64m1(vec_s, vec_a, vec_b, vl);
      }

      // Reduce the partial sums over vlmax lanes, then move the result
      // (presumably element 0 of vec_sum) to a scalar and store it.
      vfloat64m1_t vec_sum;
      vec_sum = vfredsum_vs_f64m1_f64m1(vec_zero, vec_s, vec_zero, vlmax);
      double sum = vfmv_f_s_f64m1_f64(vec_sum);
      c[i][j] = sum;
    }
  }
}

// Test driver: seed rand(), build random N x O and M x O inputs, run the
// scalar reference and the vector implementation, and print "pass" when the
// two N x M results agree under compare_2d, "fail" otherwise.  All four
// matrices are released before returning.
int main() {
  const int N = 8;
  const int M = 8;
  const int O = 7;
  uint32_t seed = 0xdeadbeef;
  srand(seed);

  // data gen
  double **A = alloc_array_2d(N, O);
  double **B = alloc_array_2d(M, O);
  gen_rand_2d(A, N, O);
  gen_rand_2d(B, M, O);

  // compute
  double **golden = alloc_array_2d(N, M);
  double **actual = alloc_array_2d(N, M);
  matmul_golden(A, B, golden, N, M, O);
  matmul(A, B, actual, N, M, O);

  // compare
  puts(compare_2d(golden, actual, N, M) ? "pass" : "fail");

  // release: A, golden and actual have N rows each, B has M rows
  for (int i = 0; i < N; ++i) {
    free(A[i]);
    free(golden[i]);
    free(actual[i]);
  }
  for (int i = 0; i < M; ++i)
    free(B[i]);
  free(A);
  free(B);
  free(golden);
  free(actual);
}

pz9115 / riscv-gcc

fix vector mode support problem when V/P coexist #18