kernel sgemmBlock does not compile

marcino239 commented 9 years ago

I'm trying to use clBLAS on ARM Mali GPU and I'm getting the error below. However if I amend the code by hand, the code compiles ok:

    uC.f8v = C + (coord.y * ldc + coord.x)/8;

I am not sure if this is done purposely, or it is ARM OpenCL compiler that is complaining. If you let me know where this code is generated, I can give you a hand to fix this issue.
Wrong code:

AN INTERNAL KERNEL BUILD ERROR OCCURRED!
device name = Mali-T604
error = -11
memory pattern = Cached global memory based block gemm, computing kernel generator
Subproblem dimensions: dims[0].itemY = 8, dims[0].itemX = 64, dims[0].y = 8, dims[0].x = 64, dims[0].bwidth = 4; ; dims[1].itemY = 1, dims[1].itemX = 8, dims[1].y = 1, dims[1].x = 8, dims[1].bwidth = 4; ; 
Parallelism granularity: pgran->wgDim = 2, pgran->wgSize[0] = 8, pgran->wgSize[1] = 8, pgran->wfSize = 64
Kernel extra flags: 2816
Source:

typedef union GPtr {
    __global float *f;
    __global float2 *f2v;
    __global float4 *f4v;
    __global float8 *f8v;
    __global float16 *f16v;
} GPtr;

typedef union LPtr {
    __local float *f;
    __local float2 *f2v;
    __local float4 *f4v;
    __local float8 *f8v;
    __local float16 *f16v;
} LPtr;

typedef union PPtr {
    float *f;
    float2 *f2v;
    float4 *f4v;
    float8 *f8v;
    float16 *f16v;
} PPtr;

__attribute__((reqd_work_group_size(8, 8, 1)))
void __kernel
sgemmBlock(
    uint M,
    uint N,
    uint K,
    const float alpha,
    const float beta,
    const __global float4 *restrict A,
    const __global float8 *restrict B,
    __global float8 *C,
    uint lda,
    uint ldb,
    uint ldc)
{
    float4 a0;
    float8 b0, b1, b2, b3;
    float8 c0;
    uint4 coord = 0u; /* contains coordB, coordA, k */

    lda /= 4;
    ldb /= 8;
    uint kif;
    uint get_group_id_0;
    uint get_global_id_0;
    A += 1u * (uint)get_global_id(1) * lda;
    get_group_id_0 = (get_group_id(0) + get_group_id(1))% get_num_groups(0);
    get_global_id_0 = get_group_id_0 * get_local_size(0) + get_local_id(0);
    kif = (N % 512 != 0);
    get_global_id_0 = (kif*(uint)get_global_id(0)) + ((1-kif)*get_global_id_0);
    B += get_global_id_0;
    coord.y = 1u * (uint)get_global_id(1);
    coord.x = 8u * (uint)get_global_id_0;
    if ((coord.y >= M) || (coord.x >= N)) {
        return;
    }

    c0 = 0;

    for (uint k1 = 0; k1 < K; k1 += 4) {
        /* -- Tiles multiplier -- */
        b0 = B[0];
        b1 = B[ldb];
        b2 = B[(ldb << 1)];
        b3 = B[mad24(3u, ldb, 0u)];

        a0 = A[0];

        c0 += b0 * a0.s0;
        c0 += b1 * a0.s1;
        c0 += b2 * a0.s2;
        c0 += b3 * a0.s3;

        A += 1;
        B += (ldb << 2);
        /* ---------------------- */
    }

    GPtr uC;

    uC.f = C + (coord.y * ldc + coord.x)/8;

    __global float8 *pC = uC.f8v;

    float8 tempC0;

    tempC0 = c0 * alpha + 0;
    pC[0] = tempC0;
}

--------------------------------------------------------

Build log:

<source>:86:10: error: assigning to 'float *' from incompatible type 'float8 *'
    uC.f = C + (coord.y * ldc + coord.x)/8;
         ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

TimmyLiu commented 9 years ago

Hi, thanks for testing clBLAS on ARM GPU. The problem you saw is looking like a issue between clBLAS and ARM OpenCL compiler. I am pretty sure the same code runs fine with AMD, Intel and nVidia's OpenCL compiler. I think the code generation of sgemmBlock() is done by calling blockGen() in /src/library/blas/gens/gemm.c . I am thinking one way of fixing would be eliminating the use of GPtr. A line looks like "__global float{1|2|4|8|16} = C + (coord.y * ldc + coord.x)/8; " should be sufficient. Of course we need to be careful because C could be float{1|2|4|8|16} in different cases.

marcino239 commented 9 years ago

Hi - thanks for the hint. I looked through the code and have noticed that amending the blas_kgen.c file gets my code further:

diff --git a/src/library/blas/gens/blas_kgen.c b/src/library/blas/gens/blas_kgen.c
index 595fe10..81a5fec 100644
--- a/src/library/blas/gens/blas_kgen.c
+++ b/src/library/blas/gens/blas_kgen.c
@@ -1173,8 +1173,9 @@ updateResultGen(
         if ( vecLen > 1 ) {

             sprintf(tmp,
-                "uC.%s = %s + (%s * %s + %s)/%d;\n",
+                "uC.%s%dv = %s + (%s * %s + %s)/%d;\n",
                 vfield,
+                vecLen,
                 uvars.result,
                 uvars.startCol,
                 uvars.ld,
@@ -1197,8 +1198,9 @@ updateResultGen(
         if ( vecLen > 1 ) {

             sprintf(tmp,
-                "uC.%s = %s + (%s * %s + %s)/%d;\n",
+                "uC.%s%dv = %s + (%s * %s + %s)/%d;\n",
                 vfield,
+                vecLen,
                 uvars.result,
                 uvars.startRow,
                 uvars.ld,

I think the original code might have triggered a compilation error in ARM OpenCL compiler. If you think the fix won't impact other implementations (I don't have AMD GPU unfortunately) then let me know and I will create a pull request.

marcino239 commented 9 years ago

One more thing. Looks like now the blas code stops on this line:

<source>:92:51: error: Cannot downconvert and widen scalar type 'unsigned long' to vector type 'uint4'
            const uint4 bk = ((uint4)(0, 1, 2, 3) + (( get_group_id(0)*0 + k ) >> 1)) % vKB;
                              ~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

I would appreciate the hint

AN INTERNAL KERNEL BUILD ERROR OCCURRED!
device name = Mali-T604
error = -11
memory pattern = Cached global memory based subgroup gemm, computing kernel generator
Subproblem dimensions: dims[0].itemY = 16, dims[0].itemX = 8, dims[0].y = 16, dims[0].x = 8, dims[0].bwidth = 64; ; dims[1].itemY = 4, dims[1].itemX = 4, dims[1].y = 4, dims[1].x = 4, dims[1].bwidth = 8; ; 
Parallelism granularity: pgran->wgDim = 2, pgran->wgSize[0] = 8, pgran->wgSize[1] = 8, pgran->wfSize = 64
Kernel extra flags: 536887044
Source:

#define A_BLK_H 4
#define B_BLK_H 4
#define SUBG_ITEMS 8
#define SUBG_A 4
#define SUBG_B 2

#define SUBG_B 2
#define K_VLEN_A 2
#define K_VLEN_B 2

typedef union GPtr {
    __global float *f;
    __global float2 *f2v;
    __global float4 *f4v;
    __global float8 *f8v;
    __global float16 *f16v;
} GPtr;

typedef union LPtr {
    __local float *f;
    __local float2 *f2v;
    __local float4 *f4v;
    __local float8 *f8v;
    __local float16 *f16v;
} LPtr;

typedef union PPtr {
    float *f;
    float2 *f2v;
    float4 *f4v;
    float8 *f8v;
    float16 *f16v;
} PPtr;

__attribute__((reqd_work_group_size(8, 8, 1)))
void __kernel
sgemmSubgroup(
    uint M,
    uint N,
    uint K,
    const float alpha,
    const float beta,
    const __global float2 *restrict A,
    const __global float2 *restrict B,
    __global float4 *C,
    uint lda,
    uint ldb,
    uint ldc)
{
    uint Ktail = K % 64;
    uint Kbase = K - Ktail;
    float8 a0, a1, a2, a3;
    float8 b0, b1, b2, b3;
    float4 c0, c1, c2, c3;
    lda /= K_VLEN_A;
    ldb /= K_VLEN_B;

    int2 itemId;
    itemId.x = get_local_id(0);
    itemId.y = get_local_id(1);

    int coordY = A_BLK_H*( get_group_id(1)*SUBG_A + get_local_id(1)/SUBG_B );
    int coordX = B_BLK_H*( get_group_id(0)*SUBG_B + get_local_id(1)%SUBG_B );

    uint skipTileMul = 0;
    //M block tail
    if( coordY >= M ) {
        skipTileMul = 1;
    }
    //N block tail
    if( coordX >= N ) {
        skipTileMul = 1;
    }

    A += lda*coordY;
    B += ldb*coordX;

    c0 = 0;
    c1 = 0;
    c2 = 0;
    c3 = 0;

    const uint vKB = (Kbase >> 1);

    if( !skipTileMul ) {
        for(int k = 8*get_local_id(0); k < Kbase; k += 8*SUBG_ITEMS) {
            /* -- Tiles multiplier -- */
            float8 sum;
            const uint4 bk = ((uint4)(0, 1, 2, 3) + (( get_group_id(0)*0 + k ) >> 1)) % vKB;

TimmyLiu commented 9 years ago

Hi I ran into some test fails in c/zgemm with your code. You can see them by running the google test with command "./test-short --gtest_filter=gemm". Can you take a look and see if you can fix it? I will take a look for the bug in sgemmsubgroup later.

vejja commented 9 years ago

I have a very similar issue on Mac OS X 10.10 when trying to perform SGEMM. I am able to say that the crash only occurs when using the transposition feature of the function. i.e. This crashes :

error = clblasSgemm(clblasRowMajor, clblasNoTrans, clblasTrans, X.nb_rows, Y.nb_rows, X.nb_columns,
                        a, X.gpu_buffer, 0, X.nb_columns,
                        Y.gpu_buffer, 0, Y.nb_columns,
                        b, Z.gpu_buffer, 0, Z.nb_columns,
                        1, &this->command_queue, 0, NULL, &event);

I ran several tests and

the crash only occurs if you try to transpose a matrix by setting the clblasTrans flag
the crash only occurs when the transposed matrix is the second parameter (never if it is the first parameter)
the crash only occurs if the second matrix has more than 8 columns
the crash also occurs if you try to outsmart the function by setting the clblasColumnMajor mode and clBlasTrans on the first parameter, as below :

error = clblasSgemm(clblasColumnMajor, clblasTrans, clblasNoTrans, X.nb_columns, Y.nb_columns, X.nb_rows,
                        a, X.gpu_buffer, 0, X.nb_rows,
                        Y.gpu_buffer, 0, Y.nb_rows,
                        b, Z.gpu_buffer, 0, Z.nb_rows,
                        1, &this->command_queue, 0, NULL, &event);

For those running into the same issue as me and looking for a quick fix. You can get around it by first transposing the matrix, then running SGEMM with clBlasNoTrans, then transposing again the matrix. Hardly an efficient or satisfactory solution (given that BLAS has no in-place transposition function), but I guess it is good to know that this is not a truly blocking issue.

Cheers

========================================================

AN INTERNAL KERNEL BUILD ERROR OCCURRED!
device name = HD Graphics 5000
error = -11
memory pattern = Cached global memory based subgroup gemm, computing kernel generator
Subproblem dimensions: dims[0].itemY = 16, dims[0].itemX = 8, dims[0].y = 16, dims[0].x = 8, dims[0].bwidth = 64; ; dims[1].itemY = 4, dims[1].itemX = 4, dims[1].y = 4, dims[1].x = 4, dims[1].bwidth = 8; ; 
Parallelism granularity: pgran->wgDim = 2, pgran->wgSize[0] = 8, pgran->wgSize[1] = 8, pgran->wfSize = 64
Kernel extra flags: 805338884
Source:

TRUNCATED FOR CLARITY

--------------------------------------------------------

Build log:

<program source>:97:51: error: Cannot downconvert and widen scalar type 'unsigned long' to vector type '__uint4'
            const uint4 bk = ((uint4)(0, 1, 2, 3) + (( get_group_id(0)*0 + k ) >> 1)) % vKB;
                              ~~~~~~~             ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
<program source>:270:20: warning: unused variable 'sum'
            float8 sum;
                   ^
<program source>:592:29: warning: unused variable 'i'
                        int i, j;
                            ^
<program source>:592:32: warning: unused variable 'j'
                        int i, j;
                               ^
<program source>:593:30: warning: unused variable 'res'
                        PPtr res;
                             ^
<program source>:68:16: warning: comparison of integers of different signs: 'int' and 'uint' (aka 'unsigned int')
    if( coordY >= M ) {
        ~~~~~~ ^  ~
<program source>:72:16: warning: comparison of integers of different signs: 'int' and 'uint' (aka 'unsigned int')
    if( coordX >= N ) {
        ~~~~~~ ^  ~
<program source>:77:20: warning: comparison of integers of different signs: 'int' and 'uint' (aka 'unsigned int')
    if (coordX + 4 > N) {
        ~~~~~~~~~~ ^ ~
<program source>:94:42: warning: comparison of integers of different signs: 'int' and 'uint' (aka 'unsigned int')
        for(int k = 8*get_local_id(0); k < Kbase; k += 8*SUBG_ITEMS) {
                                       ~ ^ ~~~~~
<program source>:511:21: warning: comparison of integers of different signs: 'int' and 'uint' (aka 'unsigned int')
    if ((coordX + 4 == N) && (N % 4)) {
         ~~~~~~~~~~ ^  ~
<program source>:527:23: warning: comparison of integers of different signs: 'int' and 'uint' (aka 'unsigned int')
        if( (itemId.y >= mstep)&&(itemId.y < (mstep+8)) ) {
             ~~~~~~~~ ^  ~~~~~
<program source>:527:44: warning: comparison of integers of different signs: 'int' and 'unsigned int'
        if( (itemId.y >= mstep)&&(itemId.y < (mstep+8)) ) {
                                  ~~~~~~~~ ^  ~~~~~~~
<program source>:543:23: warning: comparison of integers of different signs: 'int' and 'uint' (aka 'unsigned int')
        if( (itemId.y >= mstep)&&(itemId.y < (mstep+8)) ) {
             ~~~~~~~~ ^  ~~~~~
<program source>:543:44: warning: comparison of integers of different signs: 'int' and 'unsigned int'
        if( (itemId.y >= mstep)&&(itemId.y < (mstep+8)) ) {
                                  ~~~~~~~~ ^  ~~~~~~~
<program source>:557:29: warning: comparison of integers of different signs: 'int' and 'uint' (aka 'unsigned int')
                if ((coordY < M) && (coordX < N)) {
                     ~~~~~~ ^ ~
<program source>:557:45: warning: comparison of integers of different signs: 'int' and 'uint' (aka 'unsigned int')
                if ((coordY < M) && (coordX < N)) {
                                     ~~~~~~ ^ ~

========================================================

libc++abi.dylib: terminating with uncaught exception 
(lldb)

kknox commented 9 years ago

I believe that this issue is also closed when #62 closed. The two issues appeared to be related on different platforms

galassoj commented 9 years ago

Hi, I don't think this is really fixed. I am compiling on ARM and got the same error. I implemented marcino239's fix to src/library/blas/gens/blas_kgen.c (11/2/14 post) and it fixes it. Can that fix be put in the repo?

nstiurca commented 8 years ago

@galassoj +1 exactly same situation here. I tried with both v2.6 and master.

clMathLibraries / clBLAS

kernel sgemmBlock does not compile #53