pulp-platform / snitch_cluster

An energy-efficient RISC-V floating-point compute cluster.
https://pulp-platform.github.io/snitch_cluster/
Apache License 2.0
52 stars 55 forks source link

Synchronization between FPU and INT pipelines for arbitrary C code #84

Closed and-ivanov closed 9 months ago

and-ivanov commented 10 months ago

The output of cycle-accurate simulation for this code is not correct:

#include <snrt.h>
#include <printf.h>
#define f64 double
#define i32 int

#define B 2
#define N 32

/*
 * Scales x in place by y, then clears y.
 *
 * x is addressed as x[b * N + n] for b in [0, B) and n in [0, N); every
 * element is multiplied by y[b]. Once all products have been written back,
 * y[0] .. y[B-1] are set to 0.
 *
 * NOTE(review): noinline presumably keeps the multiply loop and the zeroing
 * of y together in one separately compiled body, so the reproducer's
 * instruction sequence does not depend on the caller -- confirm.
 */
void __attribute__((noinline)) my_func(double* x, double* y) {
    // n is the outer index: for each n, y[b] is read again for every b.
    for (int n = 0; n < N; n++) {
        for (int b = 0; b < B; b++) {
            x[b * N + n] *= y[b];
        }
    }
    // Clear y only after every product above has been computed. In the
    // reported failure the last product reads as 0, i.e. it appears to have
    // seen this zeroing already.
    for (int b = 0; b < B; b++) {
        y[b] = 0;
    }
}

/*
 * Test driver for my_func.
 *
 * Only the caller for which snrt_cluster_core_idx() returns 0 runs the test;
 * every other caller returns 0 immediately. The test fills x with n + 1,
 * precomputes the expected products in z, calls my_func(x, y) and compares x
 * against z element by element.
 *
 * Returns 0 when all elements match (or when this is not core 0), 1 on the
 * first mismatch.
 */
int main() {
    unsigned tid = snrt_cluster_core_idx();
    if (tid != 0) {
        return 0;
    }
    // Three buffers obtained from snrt_l1alloc: B scale factors, and two
    // B * N arrays (x = data to scale, z = expected result).
    // NOTE(review): the allocation results are used without a NULL check.
    double* y = (f64*) snrt_l1alloc(B * sizeof(f64));
    double* x = (f64*) snrt_l1alloc(B * N * sizeof(f64));
    double* z = (f64*) snrt_l1alloc(B * N * sizeof(f64));
    y[0] = 3.0;
    y[1] = 2.0;
    // z must be computed here, before my_func runs, because my_func zeroes y.
    for (int n = 0; n < N; n++) {
        for (int b = 0; b < B; b++) {
            x[b * N + n] = n + 1;
            z[b * N + n] = (n + 1) * y[b];
        }
    }
    my_func(x, y);
    i32 ok = 1;
    for (int i = 0; i < B * N; i++) {
        // Squared difference against a 1e-3 tolerance; stop at the first
        // mismatch so only one error line is printed.
        if ((x[i] - z[i]) * (x[i] - z[i]) > 1e-3) {
            printf("Error: mismatch at dst, %d, %f (computed) != %f (expected) \n", (int)i, (double)x[i], (double)z[i]);
            ok = 0;
            break;
        }
    }
    if (ok) {
        printf("success, exitting...\n");
        return 0;
    } else {
        printf("FAILURE, exitting...\n");
        return 1;
    }
}

Observed output:

Error: mismatch at dst, 31, 0.000000 (computed) != 96.000000 (expected)

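The issue is suspected to come from a lack of synchronization between the INT and FPU pipelines. The generated assembly (https://godbolt.org/z/z3oEz4aen) shows that the compiler emits no synchronization between the floating-point loads/stores and the integer stores that follow them, so no ordering is enforced in software.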

# Compiler output for my_func(x, y): a0 is used as the base of x, a1 as the
# base of y. Both loops are fully unrolled; each 4-instruction group below is
# one "x[i] *= y[b]" (load y[b], load x[i], multiply, store x[i]).
# NOTE(review): the offsets (row stride of 32 bytes, 8 multiply groups) match
# N = 4, not the N = 32 of the C listing -- presumably the listing was
# generated with a smaller N; confirm against the godbolt link.
my_func:                                # @my_func
        fld     ft0, 0(a1)  # everything below goes to FPU 
        fld     ft1, 0(a0)
        fmul.d  ft0, ft0, ft1
        fsd     ft0, 0(a0)              # x[0] *= y[0]
        fld     ft0, 8(a1)
        fld     ft1, 32(a0)
        fmul.d  ft0, ft0, ft1
        fsd     ft0, 32(a0)             # x[4] *= y[1]
        fld     ft0, 0(a1)
        fld     ft1, 8(a0)
        fmul.d  ft0, ft0, ft1
        fsd     ft0, 8(a0)              # x[1] *= y[0]
        fld     ft0, 8(a1)
        fld     ft1, 40(a0)
        fmul.d  ft0, ft0, ft1
        fsd     ft0, 40(a0)             # x[5] *= y[1]
        fld     ft0, 0(a1)
        fld     ft1, 16(a0)
        fmul.d  ft0, ft0, ft1
        fsd     ft0, 16(a0)             # x[2] *= y[0]
        fld     ft0, 8(a1)
        fld     ft1, 48(a0)
        fmul.d  ft0, ft0, ft1
        fsd     ft0, 48(a0)             # x[6] *= y[1]
        fld     ft0, 0(a1)
        fld     ft1, 24(a0)
        fmul.d  ft0, ft0, ft1
        fsd     ft0, 24(a0)             # x[3] *= y[0]
        fld     ft0, 8(a1)
        fld     ft1, 56(a0)
        fmul.d  ft0, ft0, ft1
        fsd     ft0, 56(a0)             # x[7] *= y[1]
        # The zeroing of y is emitted as four 32-bit integer stores covering
        # bytes 0..15 of y (two doubles), with no fence or other ordering
        # instruction between them and the FP loads of 0(a1) / 8(a1) above.
        sw      zero, 12(a1)  # everything below goes to INT
        sw      zero, 8(a1)
        sw      zero, 4(a1)
        sw      zero, 0(a1)
        ret
colluca commented 9 months ago

This issue is currently being addressed in https://github.com/pulp-platform/snitch_cluster/pull/90.

colluca commented 9 months ago

Solved in https://github.com/pulp-platform/snitch_cluster/pull/90.