The cycle-accurate simulation of the following code produces incorrect output:
#include <snrt.h>
#include <printf.h>
#define f64 double
#define i32 int
#define B 2
#define N 32
/*
 * Scale x in place by per-row factors taken from y, then clear y.
 *
 * x is indexed as B consecutive rows of N doubles (element (b, n) lives at
 * x[b * N + n]); each element of row b is multiplied by y[b]. After the
 * scaling loop has finished, y[0] .. y[B-1] are overwritten with 0.
 *
 * The function is marked noinline, so the call in main() stays a real call.
 *
 * NOTE(review): the surrounding report suspects missing INT/FPU
 * synchronization. The reported mismatch (x[31] computed as 0 instead of 96)
 * would be consistent with y[0] being cleared before the last multiply read
 * it -- not confirmed from this source alone.
 */
void __attribute__((noinline)) my_func(double* x, double* y) {
/* Column-major walk: for each column n, scale the entry of every row b. */
for (int n = 0; n < N; n++) {
for (int b = 0; b < B; b++) {
x[b * N + n] *= y[b];
}
}
/* In program order these stores come after every read of y[b] above; the
 * reference values computed in main() assume the original y was used for
 * all multiplications. */
for (int b = 0; b < B; b++) {
y[b] = 0;
}
}
int main() {
unsigned tid = snrt_cluster_core_idx();
if (tid != 0) {
return 0;
}
double* y = (f64*) snrt_l1alloc(B * sizeof(f64));
double* x = (f64*) snrt_l1alloc(B * N * sizeof(f64));
double* z = (f64*) snrt_l1alloc(B * N * sizeof(f64));
y[0] = 3.0;
y[1] = 2.0;
for (int n = 0; n < N; n++) {
for (int b = 0; b < B; b++) {
x[b * N + n] = n + 1;
z[b * N + n] = (n + 1) * y[b];
}
}
my_func(x, y);
i32 ok = 1;
for (int i = 0; i < B * N; i++) {
if ((x[i] - z[i]) * (x[i] - z[i]) > 1e-3) {
printf("Error: mismatch at dst, %d, %f (computed) != %f (expected) \n", (int)i, (double)x[i], (double)z[i]);
ok = 0;
break;
}
}
if (ok) {
printf("success, exitting...\n");
return 0;
} else {
printf("FAILURE, exitting...\n");
return 1;
}
}
Observed output:
Error: mismatch at dst, 31, 0.000000 (computed) != 96.000000 (expected)
The issue is suspected to come from missing synchronization between the integer (INT) and floating-point (FPU) units. The generated assembly (https://godbolt.org/z/z3oEz4aen) shows that the compiler does not emit any synchronization between them at all.
The output of cycle-accurate simulation for this code is not correct:
Observed output:
The issue is suspected to come from the lack of synchronization between INT and FPU units. It can be seen from the assembly https://godbolt.org/z/z3oEz4aen that no synchronization is even supposed to happen.