ysh329 / OpenCL-101

Learn OpenCL step by step.
123 stars 31 forks source link

Benchmark ACL and analyse its optimization strategy #21

Open ysh329 opened 6 years ago

ysh329 commented 6 years ago

Benchmark

// Our GEMM, which doesn't use the ACL strategy
float4: 1024x1024x1024 0.259872 s  8.263633 GFLOPS
half4:  1024x1024x1024 0.145462 s 14.763193 GFLOPS
// ACL:
FP32:   1024x1024x1024 0.084823 s 25.3419 GFLOPS
FP16:   1024x1024x1024 0.039247 s 54.7705 GFLOPS
// OpenBLAS OMP 1 A72 
FP32:   1024x1024x1024 0.193891 s 11.075726 GFLOPS
FP16:    not supported

The strategies ACL uses that we don't have:

  1. Matrix transpose for B and interleaving for A (according to the comments in the code, this operation reshapes A and enables blocking);
  2. Using native built-in functions: vload, vstore, fma, etc.

ACL GEMM

#if defined(COLS_B) && defined(ALPHA)
/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
 *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
 *
 * @attention The width of matrix B and the alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
                                                         IMAGE_DECLARATION(dst))
{
    /* src_addr.s0 = address of matrix A */
    /* src_addr.s1 = address of matrix B */

    /* Compute address for matrix A and B.
     * Each workitem produces one 4x4 block of dst: global id 1 selects the
     * row of interleaved A blocks, global id 0 selects the row of transposed
     * B blocks. Both addresses are computed in bytes via the Y strides. */
    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
                                                                        (src1_stride_y));

    /* Add offset_first_element_in_bytes */
    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    /* Divide by 4 in order to get the src_addr in unit of float
     * (>> 2 == / sizeof(float); converts byte offsets into float indices
     * usable with the __global float* casts below). */
    src_addr = src_addr >> 2;

    /* Compute end row address for matrix B: the workitem walks COLS_B floats
     * along the reshaped B row, and the same number along the reshaped A row. */
    int end_row_mtx_b = src_addr.s1 + COLS_B;

    /* Reset accumulators: c00..c30 hold the four rows of the 4x4 output block. */
    float4 c00 = 0.0f;
    float4 c10 = 0.0f;
    float4 c20 = 0.0f;
    float4 c30 = 0.0f;

    /* Main loop, unrolled x2: consumes 8 floats of A and 8 floats of B per
     * iteration (two rank-1 updates of the 4x4 accumulator block). */
    for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
    {
        /* Load values from matrix A (interleaved) and matrix B (transposed).
         * Thanks to the reshape, a0 holds one element from each of 4 rows of A
         * and b0 holds 4 consecutive output columns, so a plain vload4 of each
         * feeds a full 4x4 outer-product step. */
        float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
        float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);

        c00 += (float4)a0.s0 * b0;
        c10 += (float4)a0.s1 * b0;
        c20 += (float4)a0.s2 * b0;
        c30 += (float4)a0.s3 * b0;

        /* Second unrolled step: same rank-1 update, next 4 floats of A and B. */
        a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
        b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);

        c00 += (float4)a0.s0 * b0;
        c10 += (float4)a0.s1 * b0;
        c20 += (float4)a0.s2 * b0;
        c30 += (float4)a0.s3 * b0;
    }

    /* Leftover loop: handles the remaining (COLS_B % 8) floats, 4 at a time. */
    for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
    {
        /* Load values from matrix A (interleaved) and matrix B (transposed) */
        float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
        float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);

        c00 += (float4)a0.s0 * b0;
        c10 += (float4)a0.s1 * b0;
        c20 += (float4)a0.s2 * b0;
        c30 += (float4)a0.s3 * b0;
    }

    /* Compute destination address */
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    /* Multiply by the weight of matrix product (compile-time scalar ALPHA). */
    c00 = c00 * (float4)ALPHA;
    c10 = c10 * (float4)ALPHA;
    c20 = c20 * (float4)ALPHA;
    c30 = c30 * (float4)ALPHA;

    /* Store 4x4 block, one row per vstore4. */
    vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));
    vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));
    vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));
    vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));
}
ysh329 commented 6 years ago

gemm_mm_interleaved_transposed_f32_midgard

gemm performance using gemm_mm_interleaved_transposed_f32_midgard strategy https://github.com/ysh329/OpenCL-101/issues/23

other strategies

This issue follows strategy above named gemm_mm_interleaved_transposed_f32_midgard. Besides, ACL has other GEMM or GEVM implementations: