Question about widening and reinterpreting inside a loop

tomhepworth commented 1 year ago

I want to vectorise the following function where I accumulate unsigned chars into an int. The width needs extending to avoid overflow and it needs to be a signed int for some other reasons.

int byte_mac(unsigned char a[], unsigned char b[], int len) {
  int sum = 0;
  for (int i = 0; i < len; i++) {
    sum += a[i] * b[i];
  }
  return sum;
}

In order to "cast" the values from u8 to i32 I am trying to use vzext and the reinterpret intrinsic. My implementation follows:

int byte_mac_vec(unsigned char *a, unsigned char *b, int len) {
  size_t vlmax = __riscv_vsetvlmax_e8m1();
  vint32m4_t vec_s = __riscv_vmv_v_x_i32m4(0, vlmax);
  vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vlmax);
  int k = len;
  for (size_t vl; k > 0; k -= vl, a += vl, b += vl) {
    vl = __riscv_vsetvl_e8m1(k);

    vuint8m1_t a8s = __riscv_vle8_v_u8m1(a, vl);
    vuint8m1_t b8s = __riscv_vle8_v_u8m1(b, vl);
    vuint32m4_t a8s_extended = __riscv_vzext_vf4_u32m4(a8s, vl);
    vuint32m4_t b8s_extended = __riscv_vzext_vf4_u32m4(a8s, vl);

    vint32m4_t a8s_as_i32 = __riscv_vreinterpret_v_u32m4_i32m4(a8s_extended);
    vint32m4_t b8s_as_i32 = __riscv_vreinterpret_v_u32m4_i32m4(b8s_extended);

    vec_s = __riscv_vmacc_vv_i32m4(vec_s, a8s_as_i32, b8s_as_i32, vl);
  }

  vint32m1_t vec_sum = __riscv_vredsum_vs_i32m4_i32m1(vec_s, vec_zero, vlmax);
  int sum = __riscv_vmv_x_s_i32m1_i32(vec_sum);

  return sum;
}

However I get different results for my byte_mac and byte_mac_vec functions. byte_mac_vec does not seem to be responding to a reduced vl in the final iterations of the loop, and reads a full vector of values even though (as I understand it from the various examples) if vl is reduced it should not read any further.

Am I doing something wrong here? Does the widening and reinterpreting somehow mess up the way the vector registers are filled?

Full code:

#include <math.h>
#include <riscv_vector.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define N 50

int byte_mac(unsigned char a[], unsigned char b[], int len) {
  int sum = 0;
  for (int i = 0; i < len; i++) {
    sum += a[i] * b[i];
  }
  return sum;
}

int byte_mac_vec(unsigned char *a, unsigned char *b, int len) {
  size_t vlmax = __riscv_vsetvlmax_e8m1();
  vint32m4_t vec_s = __riscv_vmv_v_x_i32m4(0, vlmax);
  vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vlmax);
  int k = len;
  for (size_t vl; k > 0; k -= vl, a += vl, b += vl) {
    vl = __riscv_vsetvl_e8m1(k);

    vuint8m1_t a8s = __riscv_vle8_v_u8m1(a, vl);
    vuint8m1_t b8s = __riscv_vle8_v_u8m1(b, vl);
    vuint32m4_t a8s_extended = __riscv_vzext_vf4_u32m4(a8s, vl);
    vuint32m4_t b8s_extended = __riscv_vzext_vf4_u32m4(a8s, vl);

    vint32m4_t a8s_as_i32 = __riscv_vreinterpret_v_u32m4_i32m4(a8s_extended);
    vint32m4_t b8s_as_i32 = __riscv_vreinterpret_v_u32m4_i32m4(b8s_extended);

    vec_s = __riscv_vmacc_vv_i32m4(vec_s, a8s_as_i32, b8s_as_i32, vl);
  }

  vint32m1_t vec_sum = __riscv_vredsum_vs_i32m4_i32m1(vec_s, vec_zero, vlmax);
  int sum = __riscv_vmv_x_s_i32m1_i32(vec_sum);

  return sum;
}

int main() {
  unsigned char a[N] = {
      1, 2, 3,  4, 5, 6,  7, 8, 9, 10, 1, 2, 3,  4, 5, 6,  7,
      8, 9, 10, 1, 2, 3,  4, 5, 6, 7,  8, 9, 10, 1, 2, 3,  4,
      5, 6, 7,  8, 9, 10, 1, 2, 3, 4,  5, 6, 7,  8, 9, 10,
  };

  unsigned char b[N] = {
      1, 2, 3,  4, 5, 6,  7, 8, 8, 10, 1, 2, 3,  4, 5, 6,  7,
      8, 9, 10, 1, 2, 3,  4, 5, 6, 7,  8, 9, 10, 1, 2, 3,  4,
      5, 6, 7,  8, 9, 10, 1, 2, 3, 4,  5, 6, 7,  8, 9, 10,
  };

  int len = 50;

  int v1 = byte_mac(a, b, len);
  int v2 = byte_mac_vec(a, b, len);

  printf("%d %d\n", v1, v2);
  return 0;
}

topperc commented 1 year ago

Which compiler are you using? What execution environment are you using?

The __riscv_vmacc_vv_i32m4 in the loop needs to be the tail undisturbed version __riscv_vmacc_vv_i32m4_tu so that the upper elements on the last iteration are preserved from previous iterations.

The __riscv_vredsum_vs_i32m4_i32m1 after the loop should use __riscv_vsetvl_e32m4(len) as its vl to handle the case where len is less than vlmax for e32m4.

tomhepworth commented 1 year ago

Thanks for the fast response! :) I am compiling with llvm clang v17 and executing with spike --isa rv64gcv pk ...

I'm not sure I follow why it needs to be tail undisturbed. This is the first time I've used vector intrinsics like this so apologies if I am missing something obvious.

The updated function below still has the same issue

int byte_mac_vec(unsigned char *a, unsigned char *b, int len) {
  size_t vlmax = __riscv_vsetvlmax_e8m1();
  vint32m4_t vec_s = __riscv_vmv_v_x_i32m4(0, vlmax);
  vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vlmax);
  int k = len;
  for (size_t vl; k > 0; k -= vl, a += vl, b += vl) {
    vl = __riscv_vsetvl_e8m1(k);

    vuint8m1_t a8s = __riscv_vle8_v_u8m1(a, vl);
    vuint8m1_t b8s = __riscv_vle8_v_u8m1(b, vl);
    vuint32m4_t a8s_extended = __riscv_vzext_vf4_u32m4(a8s, vl);
    vuint32m4_t b8s_extended = __riscv_vzext_vf4_u32m4(a8s, vl);

    vint32m4_t a8s_as_i32 = __riscv_vreinterpret_v_u32m4_i32m4(a8s_extended);
    vint32m4_t b8s_as_i32 = __riscv_vreinterpret_v_u32m4_i32m4(b8s_extended);

    vec_s = __riscv_vmacc_vv_i32m4_tu(vec_s, a8s_as_i32, b8s_as_i32, vl);
  }

  vint32m1_t vec_sum = __riscv_vredsum_vs_i32m4_i32m1(vec_s, vec_zero, __riscv_vsetvl_e32m4(len));
  int sum = __riscv_vmv_x_s_i32m1_i32(vec_sum);

  return sum;
}

topperc commented 1 year ago

There's a typo on this line vuint32m4_t b8s_extended = __riscv_vzext_vf4_u32m4(a8s, vl); The argument should be b8s

Compiling with -Wall does give a warning for b8s being unused.

tomhepworth commented 1 year ago

I hang my head in shame

Thanks so much!

riscv-non-isa / rvv-intrinsic-doc

Question about widening and reinterpreting inside a loop #218