Open JamesKenneyImperas opened 4 years ago
The simplest change that should be made is to transpose the j/k loops:
before: for i for j t = 0 for k t += a[i,k] * b[k,j] end k c[i,j] = t end j end i
after: for i c[i,*] = 0 for k ta = a[i,k] for j c[i,j] += ta * b[k,j] end j end k end i
this makes it both cache-friendly and easily vectorized because it iterates over B[] in row-major order and accesses A as scalars.
if the rows are short enough, you can tile the k loop on top of this, so you get some temporal re-use out of rows in B, but it has diminishing returns.
if the rows are super-long, then you need to tile across j as well.
g
On Thu, Jun 11, 2020 at 6:09 AM JamesKenneyImperas notifications@github.com wrote:
I've attached an attempt at a new example that could be included in the example directory here - matrix multiplication of matrices containing single-precision floats. I think something like this might be useful to show, because it is a bit more complicated than the simple memcpy-style examples but less inscrutable than the partial sgemm.S one. I've also included a C test harness to demonstrate use.
I'm sure there are much better or more correct ways to implement matrix multiplication, but hopefully this is a useful starting point. The example is:
.text .balign 4 .global matmulFloat_v
void matmulFloat_v(
size_t n, // a0: A columns, B rows
size_t m, // a1: A rows
size_t p, // a2: B columns
const float *A, // a3: A is m x n matrix
const float *B, // a4: B is n x p matrix
float *C // a5: C is m x p matrix
) {
int i, j, k;
#
for (i = 0; i < m; i++) {
for (j = 0; j < p; j++) {
float c = 0;
for (k = 0; k < n; k++) {
c += A[i*n+k] * B[k*p+j];
}
C[i*p+j] = c;
}
}
}
define n a0
define m a1
define p a2
define A a3
define B a4
define C a5
define i t0
define j t1
define k t2
define ksub t3
define bstride t4
define tmp t5
matmulFloat_v:
    li      t0, 0
    slli    t4, a2, 2               # calculate B block stride
1:
    li      t1, 0
2:
    li      t2, 0
    li      t5, 1
    vsetvli t5, t5, e32,m1,ta,ma    # set vector length 1
    vxor.vv v0, v0, v0              # initialize c
3:
    sub     t3, a0, t2              # calculate remaining k
    vsetvli t3, t3, e32,m8,ta,ma    # prepare for next k block
    mul     t5, t0, a0              # calculate A block base address
    add     t5, t5, t2
    slli    t5, t5, 2
    add     t5, t5, a3
    vle32.v v8, (t5)                # load A block
    mul     t5, t2, a2              # calculate B block base address
    add     t5, t5, t1
    slli    t5, t5, 2
    add     t5, t5, a4
    vlse32.v v16, (t5), t4          # load B block
    vfmul.vv v8, v8, v16            # multiply N elements
    vfredsum.vs v0, v8, v0          # accumulate result in element 0 of v0
    add     t2, t2, t3
    bne     t2, a0, 3b              # do next k block
    li      t5, 1
    vsetvli t5, t5, e32,m1,ta,ma    # set vector length 1
    mul     t5, t0, a2              # calculate C block address
    add     t5, t5, t1
    slli    t5, t5, 2
    add     t5, t5, a5
    vse32.v v0, (t5)                # store C block element
    addi    t1, t1, 1
    bne     t1, a2, 2b              # do next j
    addi    t0, t0, 1
    bne     t0, a1, 1b              # do next i
    ret
The algorithm implemented is shown in the C-style comment. It multiplies m x n matrix A and n x p matrix B, writing the result to m x p matrix C. The vector code uses unit stride and strided loads, register grouping, floating point multiply and floating point reduction operations. The C test harness is this:
#include <stdio.h>
#include <stddef.h>
// // Enable vector extension and floating point // extern void init_v(void);
// // Vector implementation // extern void matmulFloat_v( size_t n, // A columns, B rows size_t m, // A rows size_t p, // B columns const float *A, // A is m x n matrix const float *B, // B is n x p matrix float *C // C is m x p matrix );
// // Non-vector implementation // static void matmulFloat( size_t n, // A columns, B rows size_t m, // A rows size_t p, // B columns const float *A, // A is m x n matrix const float *B, // B is n x p matrix float *C // C is m x p matrix ) { int i, j, k;
for (i = 0; i < m; i++) { for (j = 0; j < p; j++) { float c = 0; for (k = 0; k < n; k++) { c += A[i*n+k] * B[k*p+j]; } C[i*p+j] = c; } }
}
// // Exercise matrix multiplication // void testMatMul (int n, int m, int p) {
float A[m][n]; float B[n][p]; float C[m][p]; float aVal = 0.0; float bVal = 0.0; int r, c; printf("\n-----------------------------------------\n"); printf("TEST MATRIX MULTIPLICATION N=%d M=%d P=%d\n", n, m, p); printf("-----------------------------------------\n"); // fill A with known values printf("\n--\nA:\n--\n"); for (r = 0; r < m; r++) { printf(" "); for (c = 0; c < n; c++) { A[r][c] = aVal; aVal += 0.25; printf(" %7.2f", A[r][c]); } printf("\n"); } // fill B with known values printf("\n--\nB:\n--\n"); for (r = 0; r < n; r++) { printf(" "); for (c = 0; c < p; c++) { B[r][c] = bVal; bVal += 1; printf(" %7.2f", B[r][c]); } printf("\n"); } // do matrix multiplication if(1) { matmulFloat_v(n, m, p, &A[0][0], &B[0][0], &C[0][0]); } else { matmulFloat(n, m, p, &A[0][0], &B[0][0], &C[0][0]); } // show result printf("\n--\nC:\n--\n"); for (r = 0; r < m; r++) { printf(" "); for (c = 0; c < p; c++) { printf(" %9.2f", C[r][c]); } printf("\n"); }
}
// // Main routine // int main(int argc, char ** argv) {
init_v(); testMatMul(2, 4, 6); testMatMul(18, 31, 23); return -1;
}
(The init_v function enables floating point and vector instructions. This may or may not be needed depending on the simulation environment:
init_v: li t0, 1<<9+1<<13 csrs mstatus, t0 ret
)
The C code will perform a multiply using either the vector implementation or a traditional non-vector one so the results can be compared (change the if(1) line in the obvious way). The test function testMatMul defines and fills appropriately-sized local matrixes with a simple pattern of values so the results can be seen. For example the testMatMul(2, 4, 6) call produces this output:
TEST MATRIX MULTIPLICATION N=2 M=4 P=6
-- A:
0.00 0.25 0.50 0.75 1.00 1.25 1.50 1.75
-- B:
0.00 1.00 2.00 3.00 4.00 5.00 6.00 7.00 8.00 9.00 10.00 11.00
-- C:
1.50 1.75 2.00 2.25 2.50 2.75 4.50 5.75 7.00 8.25 9.50 10.75 7.50 9.75 12.00 14.25 16.50 18.75 10.50 13.75 17.00 20.25 23.50 26.75
The test function will accept any values of n, m and p sizes, so this can be used to investigate the performance of the algorithm on any shape input.
The algorithm is not optimized in any way, so I am sure it can be improved.
— You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub https://github.com/riscv/riscv-v-spec/issues/495, or unsubscribe https://github.com/notifications/unsubscribe-auth/ABMAPWMIFSVNVMQDPFWF62LRWDJQLANCNFSM4N3MX6ZA .
Thanks Guy - I'll transpose the loops and see what that looks like. I wasn't really trying to make a fully-optimized algorithm but simply an example and test harness for new users to play with to help gain understanding of how the instructions work, to supplement the existing examples in this repository. I think having two candidate implementations is even better.
Is there a better forum for this thread? Perhaps an existing set of standard examples somewhere else that I am unaware of?
Just to finish off here, I've attached a second version of matmulFloat_v
with transposed j
/k
loops as Guy suggested. It works for values of p
<=VL
when LMUL
=8 (e.g. p
<=64 when VLEN
=512), and otherwise branches to a slower version (e.g. the previous implementation in this thread, renamed to matmulFloat_slow
). This isn't an area I know anything about (as is clear from the code!) and I repeat that my objective is simply to help new users get started with something they can experiment with to easily see the effect of changing matrix sizes with the vector instructions.
.text
.balign 4
.global matmulFloat_v
# void matmulFloat_v(
# size_t n, // a0: A columns, B rows
# size_t m, // a1: A rows
# size_t p, // a2: B columns
# const float *A, // a3: A is m x n matrix
# const float *B, // a4: B is n x p matrix
# float *C // a5: C is m x p matrix
# ) {
# int i, j, k;
#
# for (i = 0; i < m; i++) {
# c[i,*] = 0;
# for (k = 0; k < n; k++) {
# float tmpf = A[i*n+k];
# for (j = 0; j < p; j++) {
# c[i,j] += tmpf * B[k*p+j];
# }
# }
# }
# }
#
# n a0
# m a1
# p a2
# A a3
# B a4
# C a5
# i t0
# j t1
# k t2
# tmp t3
# tmpf ft0
matmulFloat_v:
vsetvli t3, a2, e32,m8,ta,ma # set vector length p
bne t3, a2, matmulFloat_slow # use slow algorithm if p>VL
li t0, 0
1:
vxor.vv v0, v0, v0 # c[i,*] = 0
li t2, 0
2:
flw ft0, (a3); # load A value
addi a3, a3, 4 # increment A base
mul t3, t2, a2 # calculate B block base address
slli t3, t3, 2
add t3, t3, a4
vle32.v v8, (t3); # load B block
vfmacc.vf v0, ft0, v8 # c[i,*] += ta * b[k,*]
addi t2, t2, 1
bne t2, a0, 2b # do next k
mul t3, t0, a2 # calculate C block base address
slli t3, t3, 2
add t3, t3, a5
vse32.v v0, (t3); # store C block
addi t0, t0, 1
bne t0, a1, 1b # do next i
ret
Just to finish off here, I've attached a second version of
matmulFloat_v
with transposedj
/k
loops as Guy suggested. It works for values ofp
<=VL
whenLMUL
=8 (e.g.p
<=64 whenVLEN
=512), and otherwise branches to a slower version (e.g. the previous implementation in this thread, renamed tomatmulFloat_slow
). This isn't an area I know anything about (as is clear from the code!) and I repeat that my objective is simply to help new users get started with something they can experiment with to easily see the effect of changing matrix sizes with the vector instructions..text .balign 4 .global matmulFloat_v # void matmulFloat_v( # size_t n, // a0: A columns, B rows # size_t m, // a1: A rows # size_t p, // a2: B columns # const float *A, // a3: A is m x n matrix # const float *B, // a4: B is n x p matrix # float *C // a5: C is m x p matrix # ) { # int i, j, k; # # for (i = 0; i < m; i++) { # c[i,*] = 0; # for (k = 0; k < n; k++) { # float tmpf = A[i*n+k]; # for (j = 0; j < p; j++) { # c[i,j] += tmpf * B[k*p+j]; # } # } # } # } # # n a0 # m a1 # p a2 # A a3 # B a4 # C a5 # i t0 # j t1 # k t2 # tmp t3 # tmpf ft0 matmulFloat_v: vsetvli t3, a2, e32,m8,ta,ma # set vector length p bne t3, a2, matmulFloat_slow # use slow algorithm if p>VL li t0, 0 1: vxor.vv v0, v0, v0 # c[i,*] = 0 li t2, 0 2: flw ft0, (a3); # load A value addi a3, a3, 4 # increment A base mul t3, t2, a2 # calculate B block base address slli t3, t3, 2 add t3, t3, a4 vle32.v v8, (t3); # load B block vfmacc.vf v0, ft0, v8 # c[i,*] += ta * b[k,*] addi t2, t2, 1 bne t2, a0, 2b # do next k mul t3, t0, a2 # calculate C block base address slli t3, t3, 2 add t3, t3, a5 vse32.v v0, (t3); # store C block addi t0, t0, 1 bne t0, a1, 1b # do next i ret
there is a problem in sgemm.S, when I try to compile the test_sgemm.c and sgemm.S to elf file, there are errors about "operand must be a symbol with %tprel_add modifier", I notice that this is your diy_sgemm.S, did you meet with this error when compiling?
sgemm_nn:
# addi sp, sp, -FRAMESIZE
# sd s0, OFFSET(sp)
# sd s1, OFFSET(sp)
# sd s2, OFFSET(sp)
Hi William,
As you've noticed on this thread and in your earlier thread https://github.com/riscv/riscv-v-spec/issues/450, the sgemm.S
example is incomplete in several ways. One of which is the logic for saving and restoring the callee-saved registers (s0
, s1
, etc.). In particular, as you've noticed, the example expects you, the programmer, to determine the correct values of FRAMESIZE
and OFFSET
(actually, you'd need three different OFFSET
s...). To learn about the correct choices, you should study the (System V) RISC-V psABI.
You'll also notice that @JamesKenneyImperas 's example sidesteps this problem by not using any saved registers.
Best, Nick Knight
EDIT: I'll give you a hint. For the prologue code, you could do:
addi sp, sp, -32
sd s0, 0(sp)
sd s1, 8(sp)
sd s2, 16(sp)
(Note that `sd`/`ld` move 8 bytes each, so the save slots must be 8 bytes apart, and the psABI keeps `sp` 16-byte aligned, so the frame rounds up to 32 bytes.) And for the epilogue code, you could do:
exit:
ld s0, 0(sp)
ld s1, 8(sp)
ld s2, 16(sp)
addi sp, sp, 32
ret
EDIT 2: I'll leave it as a challenge to you to figure out how to deal with the OFFSET
in the line
# Convert elements strides to byte strides.
ld cstride, OFFSET(sp) # Get arg from stack frame
I've attached an attempt at a new example that could be included in the
example
directory here - matrix multiplication of matrices containing single-precision floats. I think something like this might be useful to show, because it is a bit more complicated than the simplememcpy
-style examples but less inscrutable than the partialsgemm.S
one. I've also included a C test harness to demonstrate use.I'm sure there are much better or more correct ways to implement matrix multiplication, but hopefully this is a useful starting point. The example is:
The algorithm implemented is shown in the C-style comment. It multiplies m x n matrix A and n x p matrix B, writing the result to m x p matrix C. The vector code uses unit stride and strided loads, register grouping, floating point multiply and floating point reduction operations. The C test harness is this:
(The
init_v
function enables floating point and vector instructions. This may or may not be needed depending on the simulation environment:)
The C code will perform a multiply using either the vector implementation or a traditional non-vector one so the results can be compared (change the if(1) line in the obvious way). The test function testMatMul defines and fills appropriately-sized local matrixes with a simple pattern of values so the results can be seen. For example the testMatMul(2, 4, 6) call produces this output:
The test function will accept any values of n, m and p sizes, so this can be used to investigate the performance of the algorithm on any shape input.
The algorithm is not optimized in any way, so I am sure it can be improved.