ucb-bar / gemmini

Berkeley's Spatial Array Generator
Other
796 stars 165 forks source link

Softmax implementation issue #300

Open LordScarface opened 1 year ago

LordScarface commented 1 year ago

Hello, I just started with Gemmini and Chipyard, and now I am facing some issues with the Softmax, GELU and LayerNorm activation functions.

I got Gemmini running and I get the correct results for matrix multiplications and RELU activation, but Softmax causes a crash in the simulator and also on the FPGA implementation.

I built a baremetal app according to the gemmini-rocc-tests repo that just does a matrix multiplication and then a Softmax. This is the code:

```C #include #include #include #include #include #ifndef BAREMETAL #include #endif #include "include/gemmini_testutils.h" #define CHECK_RESULT 1 #define NO_BIAS 1 #define FULL_BIAS_WIDTH 1 #define BERT_SCALE 0.05 #if FULL_BIAS_WIDTH typedef acc_t ACC_T; #else typedef elem_t ACC_T; #endif #define MAT_DIM_I 2 #define MAT_DIM_K 2 #define MAT_DIM_J 2 void full_printMatrix(elem_t m[MAT_DIM_I][MAT_DIM_J]) { for (size_t i = 0; i < MAT_DIM_I; ++i) { for (size_t j = 0; j < MAT_DIM_J; ++j) { #ifdef ELEM_T_IS_FLOAT printf("%f ", (double)m[i][j]); #else printf("%ld ", m[i][j]); #endif } printf("\n"); } printf("\n"); } int full_is_equal(elem_t x[MAT_DIM_I][MAT_DIM_J], elem_t y[MAT_DIM_I][MAT_DIM_J]) { for (size_t i = 0; i < MAT_DIM_I; ++i) for (size_t j = 0; j < MAT_DIM_J; ++j) if (x[i][j] != y[i][j]) return 0; return 1; } int main() { #ifndef BAREMETAL if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { perror("mlockall failed"); exit(1); } #endif gemmini_flush(0); static elem_t full_A[MAT_DIM_I][MAT_DIM_K] row_align(1) = {{-7 ,5}, {-120, 7}}; static elem_t full_B[MAT_DIM_K][MAT_DIM_J] row_align(1) = {{1,0}, {0,1}}; static elem_t A_at_B_CPU[MAT_DIM_I][MAT_DIM_J] row_align(1); static elem_t A_at_B_GEMM[MAT_DIM_I][MAT_DIM_J] row_align(1); static ACC_T full_D[MAT_DIM_I][MAT_DIM_J] row_align_acc(1) = {{0,0}, {0,0}}; // the bias static elem_t Softmax_CPU[MAT_DIM_I][MAT_DIM_J]; static elem_t Softmax_GEMM[MAT_DIM_I][MAT_DIM_J]; /* printf("Starting slow CPU matmul\n"); unsigned long cpu_start = read_cycles(); tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K, (elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)A_at_B_CPU, MAT_DIM_K, MAT_DIM_J, MAT_DIM_J, MAT_DIM_J, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, NO_ACTIVATION, ACC_SCALE_IDENTITY, BERT_SCALE, false, false, false, false, !FULL_BIAS_WIDTH, 0, CPU); tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K, (elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? 
NULL : &full_D[0][0], (elem_t*)Softmax_CPU, MAT_DIM_K, MAT_DIM_J, MAT_DIM_J, MAT_DIM_J, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, SOFTMAX, ACC_SCALE_IDENTITY, BERT_SCALE, false, false, false, false, !FULL_BIAS_WIDTH, 0, CPU); unsigned long cpu_end = read_cycles(); printf("Cycles taken: %u\n", cpu_end-cpu_start); printf("Starting gemmini matmul\n"); unsigned long start = read_cycles(); tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K, (elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)A_at_B_GEMM, MAT_DIM_K, MAT_DIM_J, MAT_DIM_J, MAT_DIM_J, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, NO_ACTIVATION, ACC_SCALE_IDENTITY, BERT_SCALE, false, false, false, false, !FULL_BIAS_WIDTH, 0, WS); */ tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K, (elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)Softmax_GEMM, MAT_DIM_K, MAT_DIM_J, MAT_DIM_J, MAT_DIM_J, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, SOFTMAX, ACC_SCALE_IDENTITY, BERT_SCALE, false, false, false, false, !FULL_BIAS_WIDTH, 0, WS); //unsigned long end = read_cycles(); //printf("Cycles taken: %u\n", end-start); /* printf("A:\n"); full_printMatrix(full_A); printf("B:\n"); full_printMatrix(full_B); printf("A @ B (CPU):\n"); full_printMatrix(A_at_B_CPU); printf("A @ B (Gemmini):\n"); full_printMatrix(A_at_B_GEMM); printf("Softmax (CPU):\n"); full_printMatrix(Softmax_CPU); */ printf("Softmax (Gemmini):\n"); full_printMatrix(Softmax_GEMM); printf("\n"); exit(0); } ```

I then compiled verilator with the ./scripts/build-verilator.sh command and ran the test with ./scripts/run-verilator.sh $(which ./software/gemmini-rocc-tests/build/transformers/softmax_test-baremetal).

I also modified the xcustom.h to print all commands that are executed on Gemmini.

Here is the output:

This emulator compiled with JTAG Remote Bitbang client. To enable, use +jtag_rbb_enable=1.
Listening on port 45603
[UART] UART0 is here (stdin/stdout).
asdd  
Testing float print: 1.0: %f -- 123.456789: %f -- -12.765: %f -- -67894.0654561: %f -- 1.012345487: %f
GEMMINI 0RR CUSTOM_3, 0x3, 7, rs1=00000000, rs2=00000000 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00010004, rs2=00000000 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00000002, rs2=00000002 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00040101, rs2=00000002 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00040109, rs2=00000002 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00040111, rs2=00000008 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00010003, rs2=0000001b 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00050003, rs2=0000001b 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 9, rs1=00020002, rs2=00010001 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 10, rs1=80002740, rs2=8000273c 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 11, rs1=00000000, rs2=80002748 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 12, rs1=00000002, rs2=00000002 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 13, rs1=00000002, rs2=00000002 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 8, rs1=00000400, rs2=00000000 
[94000] %Error: chipyard.TestHarness.CustomGemminiSoCConfig.top.v:304744: Assertion failed in TOP.TestHarness.chiptop.system.tile_prci_domain.tile_reset_domain.tile.gemmini.load_controller
%Error: /home/lukas/Documents/Gemmini/chipyard/sims/verilator/generated-src/chipyard.TestHarness.CustomGemminiSoCConfig/chipyard.TestHarness.CustomGemminiSoCConfig.top.v:304744: Verilog $stop
Aborting...

So the last command fails, which is the following from gemmini.h :

// weight-stationary matmul loop
#define gemmini_loop_ws(I, J, K, pad_I, pad_J, pad_K, A, B, D, C, A_stride, B_stride, D_stride, C_stride, A_transpose, B_transpose, full_C, low_D, ex_accumulate, act) \
  { \
    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(pad_K) << 32) | ((uint64_t)(pad_J) << 16) | (uint64_t)(pad_I), ((uint64_t)(K) << 32) | ((uint64_t)(J) << 16) | (uint64_t)(I), k_LOOP_WS_CONFIG_BOUNDS) \
    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, A, B, k_LOOP_WS_CONFIG_ADDRS_AB) \
    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, D, C, k_LOOP_WS_CONFIG_ADDRS_DC) \
    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, A_stride, B_stride, k_LOOP_WS_CONFIG_STRIDES_AB) \
    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, D_stride, C_stride, k_LOOP_WS_CONFIG_STRIDES_DC) \
    -> This fails : ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(act) << 8) | ((low_D) << 2) | ((full_C) << 1) | (ex_accumulate), ((B_transpose) << 1) | (A_transpose), k_LOOP_WS) \
  }

The corresponding assert that fails is in DMACommandTracker, line 89.

If I just change the activation to be RELU it works as expected.

Here are the Versions I am using:

I hope someone can help me with this, Best Regards, Lukas

LordScarface commented 1 year ago

Okay, so I was missing the has_normalizations=true flag; it is working now. But I was wondering if it would be feasible to also add a floating point implementation of Softmax and GELU?

hngenc commented 10 months ago

> I was wondering if it would be feasible to also add a floating point implementation of Softmax and GELU?

That would be a great feature to have for sure, but it's not planned for the near future, just due to a lack of manpower. Hopefully, we get someone who wants to start working on that, or an outside contributor makes a PR to add that feature.

For now, Gemmini's transformer support is targeted towards I-BERT, rather than floating-point BERT implementations.