riscvarchive / riscv-v-spec

Working draft of the proposed RISC-V V vector extension
https://jira.riscv.org/browse/RVG-122
Creative Commons Attribution 4.0 International
968 stars 272 forks source link

Matrix Multiplication Example #495

Open JamesKenneyImperas opened 4 years ago

JamesKenneyImperas commented 4 years ago

I've attached an attempt at a new example that could be included in the example directory here - matrix multiplication of matrices containing single-precision floats. I think something like this might be useful to show, because it is a bit more complicated than the simple memcpy-style examples but less inscrutable than the partial sgemm.S one. I've also included a C test harness to demonstrate use.

I'm sure there are much better or more correct ways to implement matrix multiplication, but hopefully this is a useful starting point. The example is:

.text
.balign 4
.global matmulFloat_v

# void matmulFloat_v(
#     size_t       n,     // a0: A columns, B rows
#     size_t       m,     // a1: A rows
#     size_t       p,     // a2: B columns
#     const float *A,     // a3: A is m x n matrix
#     const float *B,     // a4: B is n x p matrix
#     float       *C      // a5: C is m x p matrix
# ) {
#     int i, j, k;
# 
#     for (i = 0; i < m; i++) {
#         for (j = 0; j < p; j++) {
#             float c = 0;
#             for (k = 0; k < n; k++) {
#                 c += A[i*n+k] * B[k*p+j];
#             }
#             C[i*p+j] = c;
#         }
#     }
# }

#define n       a0
#define m       a1
#define p       a2
#define A       a3
#define B       a4
#define C       a5

#define i       t0
#define j       t1
#define k       t2
#define ksub    t3
#define bstride t4
#define tmp     t5

matmulFloat_v:
    li t0, 0
    slli t4, a2, 2                  # calculate B block stride
1:
    li t1, 0
2:
    li t2, 0
    li t5, 1
  vsetvli t5, t5, e32,m1,ta,ma      # set vector length 1
  vxor.vv v0, v0, v0                # initialize c
3:
    sub t3, a0, t2                  # calculate remaining k
  vsetvli t3, t3, e32,m8,ta,ma      # prepare for next k block

    mul t5, t0, a0                  # calculate A block base address
    add t5, t5, t2
    slli t5, t5, 2
    add t5, t5, a3

  vle32.v v8, (t5);                 # load A block

    mul t5, t2, a2                  # calculate B block base address
    add t5, t5, t1
    slli t5, t5, 2
    add t5, t5, a4

  vlse32.v v16, (t5), t4;           # load B block

  vfmul.vv v8, v8, v16              # multiply N elements
  vfredsum.vs v0, v8, v0            # accumulate result in element 0 of v0

    add t2, t2, t3
    bne t2, a0, 3b                  # do next k block

    li t5, 1
  vsetvli t5, t5, e32,m1,ta,ma      # set vector length 1

  vse32.v v0, (a5);                 # store C block element

    addi a5, a5, 4                  # increment C block address

    addi t1, t1, 1
    bne t1, a2, 2b                  # do next j

    addi t0, t0, 1
    bne t0, a1, 1b                  # do next i

    ret

The algorithm implemented is shown in the C-style comment. It multiplies m x n matrix A and n x p matrix B, writing the result to m x p matrix C. The vector code uses unit stride and strided loads, register grouping, floating point multiply and floating point reduction operations. The C test harness is this:

#include <stdio.h>
#include <stdlib.h>

//
// Enable vector extension and floating point
//
extern void init_v(void);

//
// Vector implementation
//
extern void matmulFloat_v(
    size_t       n,     // A columns, B rows
    size_t       m,     // A rows
    size_t       p,     // B columns
    const float *A,     // A is m x n matrix
    const float *B,     // B is n x p matrix
    float       *C      // C is m x p matrix
);

//
// Non-vector implementation
//
static void matmulFloat(
    size_t       n,     // A columns, B rows
    size_t       m,     // A rows
    size_t       p,     // B columns
    const float *A,     // A is m x n matrix
    const float *B,     // B is n x p matrix
    float       *C      // C is m x p matrix
) {
    int i, j, k;

    for (i = 0; i < m; i++) {
        for (j = 0; j < p; j++) {
            float c = 0;
            for (k = 0; k < n; k++) {
                c += A[i*n+k] * B[k*p+j];
            }
            C[i*p+j] = c;
        }
    }
}

//
// Exercise matrix multiplication
//
void testMatMul (int n, int m, int p) {

    float A[m][n];
    float B[n][p];
    float C[m][p];
    float aVal = 0.0;
    float bVal = 0.0;
    int   r, c;

    printf("\n-----------------------------------------\n");
    printf("TEST MATRIX MULTIPLICATION N=%d M=%d P=%d\n", n, m, p);
    printf("-----------------------------------------\n");

    // fill A with known values
    printf("\n--\nA:\n--\n");
    for (r = 0; r < m; r++) {
        printf("   ");
        for (c = 0; c < n; c++) {
            A[r][c] = aVal;
            aVal += 0.25;
            printf(" %7.2f", A[r][c]);
        }
        printf("\n");
    }

    // fill B with known values
    printf("\n--\nB:\n--\n");
    for (r = 0; r < n; r++) {
        printf("   ");
        for (c = 0; c < p; c++) {
            B[r][c] = bVal;
            bVal += 1;
            printf(" %7.2f", B[r][c]);
        }
        printf("\n");
    }

    // do matrix multiplication
    if(1) {
        matmulFloat_v(n, m, p, &A[0][0], &B[0][0], &C[0][0]);
    } else {
        matmulFloat(n, m, p, &A[0][0], &B[0][0], &C[0][0]);
    }

    // show result
    printf("\n--\nC:\n--\n");
    for (r = 0; r < m; r++) {
        printf("   ");
        for (c = 0; c < p; c++) {
            printf(" %9.2f", C[r][c]);
        }
        printf("\n");
    }
}

//
// Main routine
//
int main(int argc, char ** argv) {

    init_v();

    testMatMul(2, 4, 6);
    testMatMul(18, 31, 23);

    return -1;
}

(The init_v function enables floating point and vector instructions. This may or may not be needed depending on the simulation environment:

init_v:
    li t0, 1<<9+1<<13
    csrs mstatus, t0
    ret

)

The C code will perform a multiply using either the vector implementation or a traditional non-vector one so the results can be compared (change the if(1) line in the obvious way). The test function testMatMul defines and fills appropriately-sized local matrixes with a simple pattern of values so the results can be seen. For example the testMatMul(2, 4, 6) call produces this output:

-----------------------------------------
TEST MATRIX MULTIPLICATION N=2 M=4 P=6
-----------------------------------------

--
A:
--
       0.00    0.25
       0.50    0.75
       1.00    1.25
       1.50    1.75

--
B:
--
       0.00    1.00    2.00    3.00    4.00    5.00
       6.00    7.00    8.00    9.00   10.00   11.00

--
C:
--
         1.50      1.75      2.00      2.25      2.50      2.75
         4.50      5.75      7.00      8.25      9.50     10.75
         7.50      9.75     12.00     14.25     16.50     18.75
        10.50     13.75     17.00     20.25     23.50     26.75

The test function will accept any values of n, m and p sizes, so this can be used to investigate the performance of the algorithm on any shape input.

The algorithm is not optimized in any way, so I am sure it can be improved.

vbx-glemieux commented 4 years ago

The simplest change that should be made is to transpose the j/k loops:

before: for i for j t = 0 for k t += a[i,k] * b[k,j] end k c[i,j] = t end j end i

after: for i c[i,] = 0 for k ta = a[i,k] for j c[i,j] += ta b[k,j] end j end k end i

this makes it both cache-friendly and easily vectorized because it iterates over B[] in row-major order and accesses A as scalars.

if the rows are short enough, you can tile the k loop on top of this, so you get some temporal re-use out of rows in B, but it has diminishing returns.

if the rows are super-long, then you need to tile across j as well.

g

On Thu, Jun 11, 2020 at 6:09 AM JamesKenneyImperas notifications@github.com wrote:

I've attached an attempt at a new example that could be included in the example directory here - matrix multiplication of matrices containing single-precision floats. I think something like this might be useful to show, because it is a bit more complicated than the simple memcpy-style examples but less inscrutable than the partial sgemm.S one. I've also included a C test harness to demonstrate use.

I'm sure there are much better or more correct ways to implement matrix multiplication, but hopefully this is a useful starting point. The example is:

.text .balign 4 .global matmulFloat_v

void matmulFloat_v(

size_t n, // a0: A columns, B rows

size_t m, // a1: A rows

size_t p, // a2: B columns

const float *A, // a3: A is m x n matrix

const float *B, // a4: B is n x p matrix

float *C // a5: B is m x p matrix

) {

int i, j, k;

#

for (i = 0; i < m; i++) {

for (j = 0; j < p; j++) {

float c = 0;

for (k = 0; k < n; k++) {

c += A[in+k] B[k*p+j];

}

C[i*p+j] = c;

}

}

}

define n a0

define m a1

define p a2

define A a3

define B a4

define C a5

define i t0

define j t1

define k t2

define ksub t3

define bstride t4

define tmp t5

matmulFloat_v: li t0, 0 slli t4, a2, 2 # calculate B block stride 1: li t1, 0 2: li t2, 0 li t5, 1 vsetvli t5, t5, e32,m1,ta,ma # set vector length 1 vxor.vv v0, v0, v0 # initialize c 3: sub t3, a0, t2 # calculate remaining k vsetvli t3, t3, e32,m8,ta,ma # prepare for next k block

mul t5, t0, a0                  # calculate A block base address
add t5, t5, t2
slli t5, t5, 2
add t5, t5, a3

vle32.v v8, (t5); # load A block

mul t5, t2, a2                  # calculate B block base address
add t5, t5, t1
slli t5, t5, 2
add t5, t5, a4

vlse32.v v16, (t5), t4; # load B block

vfmul.vv v8, v8, v16 # multiply N elements vfredsum.vs v0, v8, v0 # accumulate result in element 0 of v0

add t2, t2, t3
bne t2, a0, 3b                  # do next k block

li t5, 1

vsetvli t5, t5, e32,m1,ta,ma # set vector length 1

mul t5, t0, a2                  # calculate C block address
add t5, t5, t1
slli t5, t5, 2
add t5, t5, a5

vse32.v v0, (t5); # store C block element

addi t1, t1, 1
bne t1, a2, 2b                  # do next j

addi t0, t0, 1
bne t0, a1, 1b                  # do next i

ret

The algorithm implemented is shown in the C-style comment. It multiplies m x n matrix A and n x p matrix B, writing the result to m x p matrix C. The vector code uses unit stride and strided loads, register grouping, floating point multiply and floating point reduction operations. The C test harness is this:

include

include

// // Enable vector extension and floating point // extern void init_v(void);

// // Vector implementation // extern void matmulFloat_v( size_t n, // A columns, B rows size_t m, // A rows size_t p, // B columns const float A, // A is m x n matrix const float B, // B is n x p matrix float *C // B is m x p matrix );

// // Non-vector implementation // static void matmulFloat( size_t n, // A columns, B rows size_t m, // A rows size_t p, // B columns const float A, // A is m x n matrix const float B, // B is n x p matrix float *C // B is m x p matrix ) { int i, j, k;

for (i = 0; i < m; i++) {
    for (j = 0; j < p; j++) {
        float c = 0;
        for (k = 0; k < n; k++) {
            c += A[i*n+k] * B[k*p+j];
        }
        C[i*p+j] = c;
    }
}

}

// // Exercise matrix multiplication // void testMatMul (int n, int m, int p) {

float A[m][n];
float B[n][p];
float C[m][p];
float aVal = 0.0;
float bVal = 0.0;
int   r, c;

printf("\n-----------------------------------------\n");
printf("TEST MATRIX MULTIPLICATION N=%d M=%d P=%d\n", n, m, p);
printf("-----------------------------------------\n");

// fill A with known values
printf("\n--\nA:\n--\n");
for (r = 0; r < m; r++) {
    printf("   ");
    for (c = 0; c < n; c++) {
        A[r][c] = aVal;
        aVal += 0.25;
        printf(" %7.2f", A[r][c]);
    }
    printf("\n");
}

// fill B with known values
printf("\n--\nB:\n--\n");
for (r = 0; r < n; r++) {
    printf("   ");
    for (c = 0; c < p; c++) {
        B[r][c] = bVal;
        bVal += 1;
        printf(" %7.2f", B[r][c]);
    }
    printf("\n");
}

// do matrix multiplication
if(1) {
    matmulFloat_v(n, m, p, &A[0][0], &B[0][0], &C[0][0]);
} else {
    matmulFloat(n, m, p, &A[0][0], &B[0][0], &C[0][0]);
}

// show result
printf("\n--\nC:\n--\n");
for (r = 0; r < m; r++) {
    printf("   ");
    for (c = 0; c < p; c++) {
        printf(" %9.2f", C[r][c]);
    }
    printf("\n");
}

}

// // Main routine // int main(int argc, char ** argv) {

init_v();

testMatMul(2, 4, 6);
testMatMul(18, 31, 23);

return -1;

}

(The init_v function enables floating point and vector instructions. This may or may not be needed depending on the simulation environment:

init_v: li t0, 1<<9+1<<13 csrs mstatus, t0 ret

)

The C code will perform a multiply using either the vector implementation or a traditional non-vector one so the results can be compared (change the if(1) line in the obvious way). The test function testMatMul defines and fills appropriately-sized local matrixes with a simple pattern of values so the results can be seen. For example the testMatMul(2, 4, 6) call produces this output:


TEST MATRIX MULTIPLICATION N=2 M=4 P=6

-- A:

   0.00    0.25
   0.50    0.75
   1.00    1.25
   1.50    1.75

-- B:

   0.00    1.00    2.00    3.00    4.00    5.00
   6.00    7.00    8.00    9.00   10.00   11.00

-- C:

     1.50      1.75      2.00      2.25      2.50      2.75
     4.50      5.75      7.00      8.25      9.50     10.75
     7.50      9.75     12.00     14.25     16.50     18.75
    10.50     13.75     17.00     20.25     23.50     26.75

The test function will accept any values of n, m and p sizes, so this can be used to investigate the performance of the algorithm on any shape input.

The algorithm is not optimized in any way, so I am sure it can be improved.

— You are receiving this because you are subscribed to this thread. Reply to this email directly, view it on GitHub https://github.com/riscv/riscv-v-spec/issues/495, or unsubscribe https://github.com/notifications/unsubscribe-auth/ABMAPWMIFSVNVMQDPFWF62LRWDJQLANCNFSM4N3MX6ZA .

JamesKenneyImperas commented 4 years ago

Thanks Guy - I'll transpose the loops and see what that looks like. I wasn't really trying to make a fully-optimized algorithm but simply an example and test harness for new users to play with to help gain understanding of how the instructions work, to supplement the existing examples in this repository. I think having two candidate implementations is even better.

Is there a better forum for this thread? Perhaps an existing set of standard examples somewhere else that I am unaware of?

JamesKenneyImperas commented 4 years ago

Just to finish off here, I've attached a second version of matmulFloat_v with transposed j/k loops as Guy suggested. It works for values of p<=VL when LMUL=8 (e.g. p<=64 when VLEN=512), and otherwise branches to a slower version (e.g. the previous implementation in this thread, renamed to matmulFloat_slow). This isn't an area I know anything about (as is clear from the code!) and I repeat that my objective is simply to help new users get started with something they can experiment with to easily see the effect of changing matrix sizes with the vector instructions.

.text
.balign 4
.global matmulFloat_v

# void matmulFloat_v(
#     size_t       n,     // a0: A columns, B rows
#     size_t       m,     // a1: A rows
#     size_t       p,     // a2: B columns
#     const float *A,     // a3: A is m x n matrix
#     const float *B,     // a4: B is n x p matrix
#     float       *C      // a5: C is m x p matrix
# ) {
#     int i, j, k;
# 
#     for (i = 0; i < m; i++) {
#         c[i,*] = 0;
#         for (k = 0; k < n; k++) {
#             float tmpf = A[i*n+k];
#             for (j = 0; j < p; j++) {
#                 c[i,j] += tmpf * B[k*p+j];
#             }
#         }
#     }
# }
#
#   n       a0
#   m       a1
#   p       a2
#   A       a3
#   B       a4
#   C       a5
#   i       t0
#   j       t1
#   k       t2
#   tmp     t3
#   tmpf    ft0

matmulFloat_v:

  vsetvli t3, a2, e32,m8,ta,ma          # set vector length p
    bne t3, a2, matmulFloat_slow        # use slow algorithm if p>VL

    li t0, 0

1:
  vxor.vv v0, v0, v0                    # c[i,*] = 0

    li t2, 0

2:

    flw ft0, (a3);                      # load A value
    addi a3, a3, 4                      # increment A base

    mul t3, t2, a2                      # calculate B block base address
    slli t3, t3, 2
    add t3, t3, a4

  vle32.v v8, (t3);                     # load B block

  vfmacc.vf v0, ft0, v8                 # c[i,*] += ta * b[k,*]

    addi t2, t2, 1
    bne t2, a0, 2b                      # do next k

    mul t3, t0, a2                      # calculate C block base address
    slli t3, t3, 2
    add t3, t3, a5

  vse32.v v0, (t3);                     # store C block

    addi t0, t0, 1
    bne t0, a1, 1b                      # do next i

    ret
WilliamWangPeng commented 4 years ago

Just to finish off here, I've attached a second version of matmulFloat_v with transposed j/k loops as Guy suggested. It works for values of p<=VL when LMUL=8 (e.g. p<=64 when VLEN=512), and otherwise branches to a slower version (e.g. the previous implementation in this thread, renamed to matmulFloat_slow). This isn't an area I know anything about (as is clear from the code!) and I repeat that my objective is simply to help new users get started with something they can experiment with to easily see the effect of changing matrix sizes with the vector instructions.

.text
.balign 4
.global matmulFloat_v

# void matmulFloat_v(
#     size_t       n,     // a0: A columns, B rows
#     size_t       m,     // a1: A rows
#     size_t       p,     // a2: B columns
#     const float *A,     // a3: A is m x n matrix
#     const float *B,     // a4: B is n x p matrix
#     float       *C      // a5: C is m x p matrix
# ) {
#     int i, j, k;
# 
#     for (i = 0; i < m; i++) {
#         c[i,*] = 0;
#         for (k = 0; k < n; k++) {
#             float tmpf = A[i*n+k];
#             for (j = 0; j < p; j++) {
#                 c[i,j] += tmpf * B[k*p+j];
#             }
#         }
#     }
# }
#
#   n       a0
#   m       a1
#   p       a2
#   A       a3
#   B       a4
#   C       a5
#   i       t0
#   j       t1
#   k       t2
#   tmp     t3
#   tmpf    ft0

matmulFloat_v:

  vsetvli t3, a2, e32,m8,ta,ma          # set vector length p
    bne t3, a2, matmulFloat_slow        # use slow algorithm if p>VL

    li t0, 0

1:
  vxor.vv v0, v0, v0                    # c[i,*] = 0

    li t2, 0

2:

    flw ft0, (a3);                      # load A value
    addi a3, a3, 4                      # increment A base

    mul t3, t2, a2                      # calculate B block base address
    slli t3, t3, 2
    add t3, t3, a4

  vle32.v v8, (t3);                     # load B block

  vfmacc.vf v0, ft0, v8                 # c[i,*] += ta * b[k,*]

    addi t2, t2, 1
    bne t2, a0, 2b                      # do next k

    mul t3, t0, a2                      # calculate C block base address
    slli t3, t3, 2
    add t3, t3, a5

  vse32.v v0, (t3);                     # store C block

    addi t0, t0, 1
    bne t0, a1, 1b                      # do next i

    ret

there is a problem in sgemm.S, when I try to compile the test_sgemm.c and sgemm.S to elf file, there are errors about "operand must be a symbol with %tprel_add modifier", I notice that this is your diy_sgemm.S, did you meet with this error when compiling?

sgemm_nn:
#    addi sp, sp, -FRAMESIZE
#    sd s0, OFFSET(sp)
#    sd s1, OFFSET(sp)
#    sd s2, OFFSET(sp)  

image

nick-knight commented 4 years ago

Hi William,

As you've noticed on this thread and in your earlier thread https://github.com/riscv/riscv-v-spec/issues/450, the sgemm.S example is incomplete in several ways. One of which is the logic for saving and restoring the callee-saved registers (s0, s1, etc.). In particular, as you've noticed, the example expects you, the programmer, to determine the correct values of FRAMESIZE and OFFSET (actually, you'd need three different OFFSETs...). To learn about the correct choices, you should study the (System V) RISC-V psABI.

You'll also notice that @JamesKenneyImperas 's example sidesteps this problem by not using any saved registers.

Best, Nick Knight

EDIT: I'll give you a hint. For the prologue code, you could do:

    addi sp, sp, -16
    sd s0, 0(sp)
    sd s1, 4(sp)
    sd s2, 8(sp)

and for the epilogue code, you could do:

exit:
    ld s0, 0(sp)
    ld s1, 4(sp)
    ld s2, 8(sp)
    addi sp, sp, 16
    ret

EDIT 2: I'll leave it as a challenge to you to figure out how to deal with the OFFSET in the line

    # Convert elements strides to byte strides.
    ld cstride, OFFSET(sp)   # Get arg from stack frame