clMathLibraries / clBLAS

a software library containing BLAS functions written in OpenCL
Apache License 2.0
843 stars 237 forks source link

test-short failure on gfx1010 (RX5700 XT) #351

Open robinchrist opened 4 years ago

robinchrist commented 4 years ago
========================================================

AN INTERNAL KERNEL BUILD ERROR OCCURRED!
device name = gfx1010
error = -11
memory pattern = Cached global memory based trmm, computing kernel generator
Subproblem dimensions: dims[0].itemY = 32, dims[0].itemX = 8, dims[0].y = 32, dims[0].x = 8, dims[0].bwidth = 32; ; dims[1].itemY = 4, dims[1].itemX = 4, dims[1].y = 4, dims[1].x = 4, dims[1].bwidth = 8; ; 
Parallelism granularity: pgran->wgDim = 1, pgran->wgSize[0] = 64, pgran->wgSize[1] = 1, pgran->wfSize = 64
Kernel extra flags: 942700465
Source:

typedef union GPtr {
    __global float *f;
    __global float2 *f2v;
    __global float4 *f4v;
    __global float8 *f8v;
    __global float16 *f16v;
} GPtr;

typedef union LPtr {
    __local float *f;
    __local float2 *f2v;
    __local float4 *f4v;
    __local float8 *f8v;
    __local float16 *f16v;
} LPtr;

typedef union PPtr {
    float *f;
    float2 *f2v;
    float4 *f4v;
    float8 *f8v;
    float16 *f16v;
} PPtr;

__attribute__((reqd_work_group_size(64, 1, 1)))
void __kernel
strmmSubgroup(
    uint M,
    uint N,
    float alpha,
    const __global float *restrict A,
    uint lda,
    const __global float *restrict B,
    __global float *C,
    uint ldb)
{
    float8 a0, a1, a2, a3;
    float8 b0, b1, b2, b3;
    float4 c0, c1, c2, c3;
    uint currM, currN;
    uint4 coord = 0; /* contains coordB, coordA, k */
    const int lid = get_local_id(0);
    const int gid = get_global_id(0) / 64;

    int2 itemId;
    int2 subgCoord;
    itemId.x = get_local_id(0)%4;
    itemId.y = get_local_id(0)/4;
    subgCoord.x = itemId.y/8;
    subgCoord.y = itemId.y%8;

    currN = gid * 8;
    currM = (M - 1) / 32 * 32;

    GPtr Ag = {A};
    GPtr Bg = {B};
    coord.x = currN + subgCoord.x*4;
    for (uint m0 = 0; m0 < M; m0 += 32) {
        uint kBegin = 0;
        coord.z = kBegin;
        coord.y = currM + subgCoord.y*4;
        c0 = 0;
        c1 = 0;
        c2 = 0;
        c3 = 0;

        if ((coord.x < N) && (coord.y < M)) {
            if (coord.y + 4 > M) {
                coord.y -= 4 - M % 4;
            }
            if (coord.x + 4 > N) {
                coord.x -= 4 - N % 4;
            }

            uint k0;
            uint kMax;
            kMax = currM - currM%8;
            for( k0 = 0; k0 < kMax; k0 += 32 ) {
                coord.z=(k0+itemId.x*8+64*gid)%kMax;
                /* -- Tiles multiplier -- */
                const uint4 bx = {mad24(coord.x % N, ldb, 0u), mad24((coord.x + 1) % N, ldb, 0u), mad24((coord.x + 2) % N, ldb, 0u),
                                mad24((coord.x + 3) % N, ldb, 0u)};
                const uint8 bk = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + coord.z) % M;

                b0.s0 = (Bg).f[bx.s0 + bk.s0];
                b0.s1 = (Bg).f[bx.s0 + bk.s1];
                b0.s2 = (Bg).f[bx.s0 + bk.s2];
                b0.s3 = (Bg).f[bx.s0 + bk.s3];
                b0.s4 = (Bg).f[bx.s0 + bk.s4];
                b0.s5 = (Bg).f[bx.s0 + bk.s5];
                b0.s6 = (Bg).f[bx.s0 + bk.s6];
                b0.s7 = (Bg).f[bx.s0 + bk.s7];
                b1.s0 = (Bg).f[bx.s1 + bk.s0];
                b1.s1 = (Bg).f[bx.s1 + bk.s1];
                b1.s2 = (Bg).f[bx.s1 + bk.s2];
                b1.s3 = (Bg).f[bx.s1 + bk.s3];
                b1.s4 = (Bg).f[bx.s1 + bk.s4];
                b1.s5 = (Bg).f[bx.s1 + bk.s5];
                b1.s6 = (Bg).f[bx.s1 + bk.s6];
                b1.s7 = (Bg).f[bx.s1 + bk.s7];
                b2.s0 = (Bg).f[bx.s2 + bk.s0];
                b2.s1 = (Bg).f[bx.s2 + bk.s1];
                b2.s2 = (Bg).f[bx.s2 + bk.s2];
                b2.s3 = (Bg).f[bx.s2 + bk.s3];
                b2.s4 = (Bg).f[bx.s2 + bk.s4];
                b2.s5 = (Bg).f[bx.s2 + bk.s5];
                b2.s6 = (Bg).f[bx.s2 + bk.s6];
                b2.s7 = (Bg).f[bx.s2 + bk.s7];
                b3.s0 = (Bg).f[bx.s3 + bk.s0];
                b3.s1 = (Bg).f[bx.s3 + bk.s1];
                b3.s2 = (Bg).f[bx.s3 + bk.s2];
                b3.s3 = (Bg).f[bx.s3 + bk.s3];
                b3.s4 = (Bg).f[bx.s3 + bk.s4];
                b3.s5 = (Bg).f[bx.s3 + bk.s5];
                b3.s6 = (Bg).f[bx.s3 + bk.s6];
                b3.s7 = (Bg).f[bx.s3 + bk.s7];

                const uint4 ay = {mad24(coord.y % M, lda, 0u), mad24((coord.y + 1) % M, lda, 0u), mad24((coord.y + 2) % M, lda, 0u),
                                mad24((coord.y + 3) % M, lda, 0u)};
                const uint8 ak = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + coord.z) % M;

                a0.s0 = (Ag).f[ay.s0 + ak.s0];
                a0.s1 = (Ag).f[ay.s0 + ak.s1];
                a0.s2 = (Ag).f[ay.s0 + ak.s2];
                a0.s3 = (Ag).f[ay.s0 + ak.s3];
                a0.s4 = (Ag).f[ay.s0 + ak.s4];
                a0.s5 = (Ag).f[ay.s0 + ak.s5];
                a0.s6 = (Ag).f[ay.s0 + ak.s6];
                a0.s7 = (Ag).f[ay.s0 + ak.s7];
                a1.s0 = (Ag).f[ay.s1 + ak.s0];
                a1.s1 = (Ag).f[ay.s1 + ak.s1];
                a1.s2 = (Ag).f[ay.s1 + ak.s2];
                a1.s3 = (Ag).f[ay.s1 + ak.s3];
                a1.s4 = (Ag).f[ay.s1 + ak.s4];
                a1.s5 = (Ag).f[ay.s1 + ak.s5];
                a1.s6 = (Ag).f[ay.s1 + ak.s6];
                a1.s7 = (Ag).f[ay.s1 + ak.s7];
                a2.s0 = (Ag).f[ay.s2 + ak.s0];
                a2.s1 = (Ag).f[ay.s2 + ak.s1];
                a2.s2 = (Ag).f[ay.s2 + ak.s2];
                a2.s3 = (Ag).f[ay.s2 + ak.s3];
                a2.s4 = (Ag).f[ay.s2 + ak.s4];
                a2.s5 = (Ag).f[ay.s2 + ak.s5];
                a2.s6 = (Ag).f[ay.s2 + ak.s6];
                a2.s7 = (Ag).f[ay.s2 + ak.s7];
                a3.s0 = (Ag).f[ay.s3 + ak.s0];
                a3.s1 = (Ag).f[ay.s3 + ak.s1];
                a3.s2 = (Ag).f[ay.s3 + ak.s2];
                a3.s3 = (Ag).f[ay.s3 + ak.s3];
                a3.s4 = (Ag).f[ay.s3 + ak.s4];
                a3.s5 = (Ag).f[ay.s3 + ak.s5];
                a3.s6 = (Ag).f[ay.s3 + ak.s6];
                a3.s7 = (Ag).f[ay.s3 + ak.s7];

                c0.s0 = mad(a0.s0, b0.s0, c0.s0);
                c0.s0 = mad(a0.s1, b0.s1, c0.s0);
                c0.s0 = mad(a0.s2, b0.s2, c0.s0);
                c0.s0 = mad(a0.s3, b0.s3, c0.s0);
                c0.s0 = mad(a0.s4, b0.s4, c0.s0);
                c0.s0 = mad(a0.s5, b0.s5, c0.s0);
                c0.s0 = mad(a0.s6, b0.s6, c0.s0);
                c0.s0 = mad(a0.s7, b0.s7, c0.s0);
                c1.s0 = mad(a0.s0, b1.s0, c1.s0);
                c1.s0 = mad(a0.s1, b1.s1, c1.s0);
                c1.s0 = mad(a0.s2, b1.s2, c1.s0);
                c1.s0 = mad(a0.s3, b1.s3, c1.s0);
                c1.s0 = mad(a0.s4, b1.s4, c1.s0);
                c1.s0 = mad(a0.s5, b1.s5, c1.s0);
                c1.s0 = mad(a0.s6, b1.s6, c1.s0);
                c1.s0 = mad(a0.s7, b1.s7, c1.s0);
                c2.s0 = mad(a0.s0, b2.s0, c2.s0);
                c2.s0 = mad(a0.s1, b2.s1, c2.s0);
                c2.s0 = mad(a0.s2, b2.s2, c2.s0);
                c2.s0 = mad(a0.s3, b2.s3, c2.s0);
                c2.s0 = mad(a0.s4, b2.s4, c2.s0);
                c2.s0 = mad(a0.s5, b2.s5, c2.s0);
                c2.s0 = mad(a0.s6, b2.s6, c2.s0);
                c2.s0 = mad(a0.s7, b2.s7, c2.s0);
                c3.s0 = mad(a0.s0, b3.s0, c3.s0);
                c3.s0 = mad(a0.s1, b3.s1, c3.s0);
                c3.s0 = mad(a0.s2, b3.s2, c3.s0);
                c3.s0 = mad(a0.s3, b3.s3, c3.s0);
                c3.s0 = mad(a0.s4, b3.s4, c3.s0);
                c3.s0 = mad(a0.s5, b3.s5, c3.s0);
                c3.s0 = mad(a0.s6, b3.s6, c3.s0);
                c3.s0 = mad(a0.s7, b3.s7, c3.s0);

                c0.s1 = mad(a1.s0, b0.s0, c0.s1);
                c0.s1 = mad(a1.s1, b0.s1, c0.s1);
                c0.s1 = mad(a1.s2, b0.s2, c0.s1);
                c0.s1 = mad(a1.s3, b0.s3, c0.s1);
                c0.s1 = mad(a1.s4, b0.s4, c0.s1);
                c0.s1 = mad(a1.s5, b0.s5, c0.s1);
                c0.s1 = mad(a1.s6, b0.s6, c0.s1);
                c0.s1 = mad(a1.s7, b0.s7, c0.s1);
                c1.s1 = mad(a1.s0, b1.s0, c1.s1);
                c1.s1 = mad(a1.s1, b1.s1, c1.s1);
                c1.s1 = mad(a1.s2, b1.s2, c1.s1);
                c1.s1 = mad(a1.s3, b1.s3, c1.s1);
                c1.s1 = mad(a1.s4, b1.s4, c1.s1);
                c1.s1 = mad(a1.s5, b1.s5, c1.s1);
                c1.s1 = mad(a1.s6, b1.s6, c1.s1);
                c1.s1 = mad(a1.s7, b1.s7, c1.s1);
                c2.s1 = mad(a1.s0, b2.s0, c2.s1);
                c2.s1 = mad(a1.s1, b2.s1, c2.s1);
                c2.s1 = mad(a1.s2, b2.s2, c2.s1);
                c2.s1 = mad(a1.s3, b2.s3, c2.s1);
                c2.s1 = mad(a1.s4, b2.s4, c2.s1);
                c2.s1 = mad(a1.s5, b2.s5, c2.s1);
                c2.s1 = mad(a1.s6, b2.s6, c2.s1);
                c2.s1 = mad(a1.s7, b2.s7, c2.s1);
                c3.s1 = mad(a1.s0, b3.s0, c3.s1);
                c3.s1 = mad(a1.s1, b3.s1, c3.s1);
                c3.s1 = mad(a1.s2, b3.s2, c3.s1);
                c3.s1 = mad(a1.s3, b3.s3, c3.s1);
                c3.s1 = mad(a1.s4, b3.s4, c3.s1);
                c3.s1 = mad(a1.s5, b3.s5, c3.s1);
                c3.s1 = mad(a1.s6, b3.s6, c3.s1);
                c3.s1 = mad(a1.s7, b3.s7, c3.s1);

                c0.s2 = mad(a2.s0, b0.s0, c0.s2);
                c0.s2 = mad(a2.s1, b0.s1, c0.s2);
                c0.s2 = mad(a2.s2, b0.s2, c0.s2);
                c0.s2 = mad(a2.s3, b0.s3, c0.s2);
                c0.s2 = mad(a2.s4, b0.s4, c0.s2);
                c0.s2 = mad(a2.s5, b0.s5, c0.s2);
                c0.s2 = mad(a2.s6, b0.s6, c0.s2);
                c0.s2 = mad(a2.s7, b0.s7, c0.s2);
                c1.s2 = mad(a2.s0, b1.s0, c1.s2);
                c1.s2 = mad(a2.s1, b1.s1, c1.s2);
                c1.s2 = mad(a2.s2, b1.s2, c1.s2);
                c1.s2 = mad(a2.s3, b1.s3, c1.s2);
                c1.s2 = mad(a2.s4, b1.s4, c1.s2);
                c1.s2 = mad(a2.s5, b1.s5, c1.s2);
                c1.s2 = mad(a2.s6, b1.s6, c1.s2);
                c1.s2 = mad(a2.s7, b1.s7, c1.s2);
                c2.s2 = mad(a2.s0, b2.s0, c2.s2);
                c2.s2 = mad(a2.s1, b2.s1, c2.s2);
                c2.s2 = mad(a2.s2, b2.s2, c2.s2);
                c2.s2 = mad(a2.s3, b2.s3, c2.s2);
                c2.s2 = mad(a2.s4, b2.s4, c2.s2);
                c2.s2 = mad(a2.s5, b2.s5, c2.s2);
                c2.s2 = mad(a2.s6, b2.s6, c2.s2);
                c2.s2 = mad(a2.s7, b2.s7, c2.s2);
                c3.s2 = mad(a2.s0, b3.s0, c3.s2);
                c3.s2 = mad(a2.s1, b3.s1, c3.s2);
                c3.s2 = mad(a2.s2, b3.s2, c3.s2);
                c3.s2 = mad(a2.s3, b3.s3, c3.s2);
                c3.s2 = mad(a2.s4, b3.s4, c3.s2);
                c3.s2 = mad(a2.s5, b3.s5, c3.s2);
                c3.s2 = mad(a2.s6, b3.s6, c3.s2);
                c3.s2 = mad(a2.s7, b3.s7, c3.s2);

                c0.s3 = mad(a3.s0, b0.s0, c0.s3);
                c0.s3 = mad(a3.s1, b0.s1, c0.s3);
                c0.s3 = mad(a3.s2, b0.s2, c0.s3);
                c0.s3 = mad(a3.s3, b0.s3, c0.s3);
                c0.s3 = mad(a3.s4, b0.s4, c0.s3);
                c0.s3 = mad(a3.s5, b0.s5, c0.s3);
                c0.s3 = mad(a3.s6, b0.s6, c0.s3);
                c0.s3 = mad(a3.s7, b0.s7, c0.s3);
                c1.s3 = mad(a3.s0, b1.s0, c1.s3);
                c1.s3 = mad(a3.s1, b1.s1, c1.s3);
                c1.s3 = mad(a3.s2, b1.s2, c1.s3);
                c1.s3 = mad(a3.s3, b1.s3, c1.s3);
                c1.s3 = mad(a3.s4, b1.s4, c1.s3);
                c1.s3 = mad(a3.s5, b1.s5, c1.s3);
                c1.s3 = mad(a3.s6, b1.s6, c1.s3);
                c1.s3 = mad(a3.s7, b1.s7, c1.s3);
                c2.s3 = mad(a3.s0, b2.s0, c2.s3);
                c2.s3 = mad(a3.s1, b2.s1, c2.s3);
                c2.s3 = mad(a3.s2, b2.s2, c2.s3);
                c2.s3 = mad(a3.s3, b2.s3, c2.s3);
                c2.s3 = mad(a3.s4, b2.s4, c2.s3);
                c2.s3 = mad(a3.s5, b2.s5, c2.s3);
                c2.s3 = mad(a3.s6, b2.s6, c2.s3);
                c2.s3 = mad(a3.s7, b2.s7, c2.s3);
                c3.s3 = mad(a3.s0, b3.s0, c3.s3);
                c3.s3 = mad(a3.s1, b3.s1, c3.s3);
                c3.s3 = mad(a3.s2, b3.s2, c3.s3);
                c3.s3 = mad(a3.s3, b3.s3, c3.s3);
                c3.s3 = mad(a3.s4, b3.s4, c3.s3);
                c3.s3 = mad(a3.s5, b3.s5, c3.s3);
                c3.s3 = mad(a3.s6, b3.s6, c3.s3);
                c3.s3 = mad(a3.s7, b3.s7, c3.s3);
                /* ---------------------- */
            }
            if( itemId.x == 0 ) {
                for( k0 = kMax; (k0 < currM+32)&&(k0 < M); k0 += 1 ) {
                    coord.z=k0;
                    /* -- Tiles multiplier -- */
                    const uint bk = coord.z % M;

                    b0.s0 = (Bg).f[mad24(coord.x % N, ldb, bk)];
                    b1.s0 = (Bg).f[mad24((coord.x + 1) % N, ldb, bk)];
                    b2.s0 = (Bg).f[mad24((coord.x + 2) % N, ldb, bk)];
                    b3.s0 = (Bg).f[mad24((coord.x + 3) % N, ldb, bk)];

                    b0.s0 = (coord.z < M) ? b0.s0 : 0;
                    b1.s0 = (coord.z < M) ? b1.s0 : 0;
                    b2.s0 = (coord.z < M) ? b2.s0 : 0;
                    b3.s0 = (coord.z < M) ? b3.s0 : 0;

                    const uint ak = coord.z % M;

                    a0.s0 = (Ag).f[mad24(coord.y % M, lda, ak)];
                    a1.s0 = (Ag).f[mad24((coord.y + 1) % M, lda, ak)];
                    a2.s0 = (Ag).f[mad24((coord.y + 2) % M, lda, ak)];
                    a3.s0 = (Ag).f[mad24((coord.y + 3) % M, lda, ak)];

                    a0.s0 = (coord.z < M) ? a0.s0 : 0;
                    a1.s0 = (coord.z < M) ? a1.s0 : 0;
                    a2.s0 = (coord.z < M) ? a2.s0 : 0;
                    a3.s0 = (coord.z < M) ? a3.s0 : 0;
                    // post fetch A
                    {
                        uint zy = coord.y;
                        a0.s0 = zy < coord.z ? 0 : a0.s0;
                        a0.s0 = zy == coord.z ? 1 : a0.s0;
                        zy++;
                        a1.s0 = zy < coord.z ? 0 : a1.s0;
                        a1.s0 = zy == coord.z ? 1 : a1.s0;
                        zy++;
                        a2.s0 = zy < coord.z ? 0 : a2.s0;
                        a2.s0 = zy == coord.z ? 1 : a2.s0;
                        zy++;
                        a3.s0 = zy < coord.z ? 0 : a3.s0;
                        a3.s0 = zy == coord.z ? 1 : a3.s0;
                    }

                    c0.s0 = mad(a0.s0, b0.s0, c0.s0);
                    c1.s0 = mad(a0.s0, b1.s0, c1.s0);
                    c2.s0 = mad(a0.s0, b2.s0, c2.s0);
                    c3.s0 = mad(a0.s0, b3.s0, c3.s0);

                    c0.s1 = mad(a1.s0, b0.s0, c0.s1);
                    c1.s1 = mad(a1.s0, b1.s0, c1.s1);
                    c2.s1 = mad(a1.s0, b2.s0, c2.s1);
                    c3.s1 = mad(a1.s0, b3.s0, c3.s1);

                    c0.s2 = mad(a2.s0, b0.s0, c0.s2);
                    c1.s2 = mad(a2.s0, b1.s0, c1.s2);
                    c2.s2 = mad(a2.s0, b2.s0, c2.s2);
                    c3.s2 = mad(a2.s0, b3.s0, c3.s2);

                    c0.s3 = mad(a3.s0, b0.s0, c0.s3);
                    c1.s3 = mad(a3.s0, b1.s0, c1.s3);
                    c2.s3 = mad(a3.s0, b2.s0, c2.s3);
                    c3.s3 = mad(a3.s0, b3.s0, c3.s3);
                    /* ---------------------- */
                }
            }
        }
        barrier(CLK_GLOBAL_MEM_FENCE);
        if ((coord.y + 4 == M) && (M % 4)) {
            coord.y += 4 - M % 4;
        }
        if ((coord.x + 4 == N) && (N % 4)) {
            coord.x += 4 - N % 4;
        }

        //-----MergeUpdateResult

        // veclenC scratch[SUBG_ITEMS*MSTEP_SUBG*vecNumC]
        __local float4 ascratch[4*16*4];
        __local float4 *scratch = ascratch;

        //LDS block has the same vectorization as C matrix block
        //VNUM_C*((get_local_id(1)%MSTEP_SUBG)*SUBG_ITEMS +get_local_id(0) );
        scratch += 4*((itemId.y%16)*4 +itemId.x );

        for( uint mstep = 0; mstep < 16; mstep += 16 ) {

            if( (itemId.y >= mstep)&&(itemId.y < (mstep+16)) ) {

                scratch[0] = c0;
                scratch[1] = c1;
                scratch[2] = c2;
                scratch[3] = c3;

                c0 = 0;
                c1 = 0;
                c2 = 0;
                c3 = 0;

            }

            barrier(CLK_LOCAL_MEM_FENCE);

            if( (itemId.y >= mstep)&&(itemId.y < (mstep+16)) ) {
                if ( 0 == itemId.x ) {

                    for(uint k = 0; k < 4 * 4; k += 4) {

                        c0 += scratch[0];
                        c1 += scratch[1];
                        c2 += scratch[2];
                        c3 += scratch[3];

                        //Adding the LDS block size in vectors
                        scratch += 4;
                    }

                    if ((coord.y < M) && (coord.x < N)) {
                        uint y = min(4u, M - (uint)coord.y);
                        uint x = min(4u, N - (uint)coord.x);
                        if ((y == 4) && (x == 4)) {
                            GPtr uC;

                            uC.f = C + coord.x * ldb + coord.y;

                            __global float *pC = uC.f;

                            float4 tempC0, tempC1, tempC2, tempC3;

                            tempC0 = mad(c0, alpha, 0);
                            tempC1 = mad(c1, alpha, 0);
                            tempC2 = mad(c2, alpha, 0);
                            tempC3 = mad(c3, alpha, 0);
                            pC[0] = tempC0.s0;
                            pC[1] = tempC0.s1;
                            pC[2] = tempC0.s2;
                            pC[3] = tempC0.s3;
                            pC[ldb] = tempC1.s0;
                            pC[ldb + 1] = tempC1.s1;
                            pC[ldb + 2] = tempC1.s2;
                            pC[ldb + 3] = tempC1.s3;
                            pC[(ldb << 1)] = tempC2.s0;
                            pC[mad24(2u, ldb, 1u)] = tempC2.s1;
                            pC[mad24(2u, ldb, 2u)] = tempC2.s2;
                            pC[mad24(2u, ldb, 3u)] = tempC2.s3;
                            pC[mad24(3u, ldb, 0u)] = tempC3.s0;
                            pC[mad24(3u, ldb, 1u)] = tempC3.s1;
                            pC[mad24(3u, ldb, 2u)] = tempC3.s2;
                            pC[mad24(3u, ldb, 3u)] = tempC3.s3;
                        }
                        else  {
                            GPtr uC;
                            int i, j;
                            PPtr res;

                            uC.f = C + coord.x * ldb + coord.y;

                            uC.f += (x-1) * ldb;

                            if (x)  {
                                switch (y) {
                                    case 4:
                                    uC.f[(y+0) % 4] = c3.s0 * alpha;
                                    case 3:
                                    uC.f[(y+1) % 4] = c3.s1 * alpha;
                                    case 2:
                                    uC.f[(y+2) % 4] = c3.s2 * alpha;
                                    case 1:
                                    uC.f[(y+3) % 4] = c3.s3 * alpha;
                                }
                                uC.f -= ldb;
                                x--;
                            }
                            if (x)  {
                                switch (y) {
                                    case 4:
                                    uC.f[(y+0) % 4] = c2.s0 * alpha;
                                    case 3:
                                    uC.f[(y+1) % 4] = c2.s1 * alpha;
                                    case 2:
                                    uC.f[(y+2) % 4] = c2.s2 * alpha;
                                    case 1:
                                    uC.f[(y+3) % 4] = c2.s3 * alpha;
                                }
                                uC.f -= ldb;
                                x--;
                            }
                            if (x)  {
                                switch (y) {
                                    case 4:
                                    uC.f[(y+0) % 4] = c1.s0 * alpha;
                                    case 3:
                                    uC.f[(y+1) % 4] = c1.s1 * alpha;
                                    case 2:
                                    uC.f[(y+2) % 4] = c1.s2 * alpha;
                                    case 1:
                                    uC.f[(y+3) % 4] = c1.s3 * alpha;
                                }
                                uC.f -= ldb;
                                x--;
                            }
                            if (x)  {
                                switch (y) {
                                    case 4:
                                    uC.f[(y+0) % 4] = c0.s0 * alpha;
                                    case 3:
                                    uC.f[(y+1) % 4] = c0.s1 * alpha;
                                    case 2:
                                    uC.f[(y+2) % 4] = c0.s2 * alpha;
                                    case 1:
                                    uC.f[(y+3) % 4] = c0.s3 * alpha;
                                }
                                uC.f -= ldb;
                                x--;
                            }
                        }
                    }

                }
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        currM -= 32;
    }
}

--------------------------------------------------------

Build log:

/tmp/comgr-a1a18b/input/CompileCLSource:56:16: warning: initializing '__global float *' with an expression of type 'const __global float *restrict' discards qualifiers
    GPtr Ag = {A};
               ^
/tmp/comgr-a1a18b/input/CompileCLSource:57:16: warning: initializing '__global float *' with an expression of type 'const __global float *restrict' discards qualifiers
    GPtr Bg = {B};
               ^
/tmp/comgr-a1a18b/input/CompileCLSource:366:24: error: variables in the local address space can only be declared in the outermost scope of a kernel function
        __local float4 ascratch[4*16*4];
                       ^
2 warnings and 1 error generated.
Error: Failed to compile opencl source (from CL to LLVM IR).

========================================================

Segmentation fault (core dumped)

Additionally, several tests fail, e.g.

[ RUN      ] ColumnMajor_SmallRange_BigLDA_OffSet/GEMM.zgemm/39
m : 6    n: 63
/home/robin/dev/clBLAS/src/tests/include/matrix.h:472: Failure
The difference between ((ref).s[0]) and ((clresult).s[0]) is 58230133, which exceeds delta, where
((ref).s[0]) evaluates to -270497230451976,
((clresult).s[0]) evaluates to -270497288682109, and
delta evaluates to 0.
clblasColumnMajor, clblasTrans, clblasTrans, M = 128, N = 128, K = 128, offA = 1, offB = 0, offC = 0, lda = 500, ldb = 501, ldc = 502
             seed = 12345, queues = 1, [  FAILED  ] ColumnMajor_SmallRange_BigLDA_OffSet/GEMM.zgemm/39, where GetParam() = (1, 1, 1, 128, 128, 128, 48-byte object <F4-01 00-00 00-00 00-00 F5-01 00-00 00-00 00-00 F6-01 00-00 00-00 00-00 01-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00>, 1) (9 ms)