Status: Open. Issue opened by robinchrist 4 years ago.
======================================================== AN INTERNAL KERNEL BUILD ERROR OCCURRED! device name = gfx1010 error = -11 memory pattern = Cached global memory based trmm, computing kernel generator Subproblem dimensions: dims[0].itemY = 32, dims[0].itemX = 8, dims[0].y = 32, dims[0].x = 8, dims[0].bwidth = 32; ; dims[1].itemY = 4, dims[1].itemX = 4, dims[1].y = 4, dims[1].x = 4, dims[1].bwidth = 8; ; Parallelism granularity: pgran->wgDim = 1, pgran->wgSize[0] = 64, pgran->wgSize[1] = 1, pgran->wfSize = 64 Kernel extra flags: 942700465 Source: typedef union GPtr { __global float *f; __global float2 *f2v; __global float4 *f4v; __global float8 *f8v; __global float16 *f16v; } GPtr; typedef union LPtr { __local float *f; __local float2 *f2v; __local float4 *f4v; __local float8 *f8v; __local float16 *f16v; } LPtr; typedef union PPtr { float *f; float2 *f2v; float4 *f4v; float8 *f8v; float16 *f16v; } PPtr; __attribute__((reqd_work_group_size(64, 1, 1))) void __kernel strmmSubgroup( uint M, uint N, float alpha, const __global float *restrict A, uint lda, const __global float *restrict B, __global float *C, uint ldb) { float8 a0, a1, a2, a3; float8 b0, b1, b2, b3; float4 c0, c1, c2, c3; uint currM, currN; uint4 coord = 0; /* contains coordB, coordA, k */ const int lid = get_local_id(0); const int gid = get_global_id(0) / 64; int2 itemId; int2 subgCoord; itemId.x = get_local_id(0)%4; itemId.y = get_local_id(0)/4; subgCoord.x = itemId.y/8; subgCoord.y = itemId.y%8; currN = gid * 8; currM = (M - 1) / 32 * 32; GPtr Ag = {A}; GPtr Bg = {B}; coord.x = currN + subgCoord.x*4; for (uint m0 = 0; m0 < M; m0 += 32) { uint kBegin = 0; coord.z = kBegin; coord.y = currM + subgCoord.y*4; c0 = 0; c1 = 0; c2 = 0; c3 = 0; if ((coord.x < N) && (coord.y < M)) { if (coord.y + 4 > M) { coord.y -= 4 - M % 4; } if (coord.x + 4 > N) { coord.x -= 4 - N % 4; } uint k0; uint kMax; kMax = currM - currM%8; for( k0 = 0; k0 < kMax; k0 += 32 ) { coord.z=(k0+itemId.x*8+64*gid)%kMax; /* -- Tiles 
multiplier -- */ const uint4 bx = {mad24(coord.x % N, ldb, 0u), mad24((coord.x + 1) % N, ldb, 0u), mad24((coord.x + 2) % N, ldb, 0u), mad24((coord.x + 3) % N, ldb, 0u)}; const uint8 bk = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + coord.z) % M; b0.s0 = (Bg).f[bx.s0 + bk.s0]; b0.s1 = (Bg).f[bx.s0 + bk.s1]; b0.s2 = (Bg).f[bx.s0 + bk.s2]; b0.s3 = (Bg).f[bx.s0 + bk.s3]; b0.s4 = (Bg).f[bx.s0 + bk.s4]; b0.s5 = (Bg).f[bx.s0 + bk.s5]; b0.s6 = (Bg).f[bx.s0 + bk.s6]; b0.s7 = (Bg).f[bx.s0 + bk.s7]; b1.s0 = (Bg).f[bx.s1 + bk.s0]; b1.s1 = (Bg).f[bx.s1 + bk.s1]; b1.s2 = (Bg).f[bx.s1 + bk.s2]; b1.s3 = (Bg).f[bx.s1 + bk.s3]; b1.s4 = (Bg).f[bx.s1 + bk.s4]; b1.s5 = (Bg).f[bx.s1 + bk.s5]; b1.s6 = (Bg).f[bx.s1 + bk.s6]; b1.s7 = (Bg).f[bx.s1 + bk.s7]; b2.s0 = (Bg).f[bx.s2 + bk.s0]; b2.s1 = (Bg).f[bx.s2 + bk.s1]; b2.s2 = (Bg).f[bx.s2 + bk.s2]; b2.s3 = (Bg).f[bx.s2 + bk.s3]; b2.s4 = (Bg).f[bx.s2 + bk.s4]; b2.s5 = (Bg).f[bx.s2 + bk.s5]; b2.s6 = (Bg).f[bx.s2 + bk.s6]; b2.s7 = (Bg).f[bx.s2 + bk.s7]; b3.s0 = (Bg).f[bx.s3 + bk.s0]; b3.s1 = (Bg).f[bx.s3 + bk.s1]; b3.s2 = (Bg).f[bx.s3 + bk.s2]; b3.s3 = (Bg).f[bx.s3 + bk.s3]; b3.s4 = (Bg).f[bx.s3 + bk.s4]; b3.s5 = (Bg).f[bx.s3 + bk.s5]; b3.s6 = (Bg).f[bx.s3 + bk.s6]; b3.s7 = (Bg).f[bx.s3 + bk.s7]; const uint4 ay = {mad24(coord.y % M, lda, 0u), mad24((coord.y + 1) % M, lda, 0u), mad24((coord.y + 2) % M, lda, 0u), mad24((coord.y + 3) % M, lda, 0u)}; const uint8 ak = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + coord.z) % M; a0.s0 = (Ag).f[ay.s0 + ak.s0]; a0.s1 = (Ag).f[ay.s0 + ak.s1]; a0.s2 = (Ag).f[ay.s0 + ak.s2]; a0.s3 = (Ag).f[ay.s0 + ak.s3]; a0.s4 = (Ag).f[ay.s0 + ak.s4]; a0.s5 = (Ag).f[ay.s0 + ak.s5]; a0.s6 = (Ag).f[ay.s0 + ak.s6]; a0.s7 = (Ag).f[ay.s0 + ak.s7]; a1.s0 = (Ag).f[ay.s1 + ak.s0]; a1.s1 = (Ag).f[ay.s1 + ak.s1]; a1.s2 = (Ag).f[ay.s1 + ak.s2]; a1.s3 = (Ag).f[ay.s1 + ak.s3]; a1.s4 = (Ag).f[ay.s1 + ak.s4]; a1.s5 = (Ag).f[ay.s1 + ak.s5]; a1.s6 = (Ag).f[ay.s1 + ak.s6]; a1.s7 = (Ag).f[ay.s1 + ak.s7]; a2.s0 = (Ag).f[ay.s2 + ak.s0]; a2.s1 = (Ag).f[ay.s2 + 
ak.s1]; a2.s2 = (Ag).f[ay.s2 + ak.s2]; a2.s3 = (Ag).f[ay.s2 + ak.s3]; a2.s4 = (Ag).f[ay.s2 + ak.s4]; a2.s5 = (Ag).f[ay.s2 + ak.s5]; a2.s6 = (Ag).f[ay.s2 + ak.s6]; a2.s7 = (Ag).f[ay.s2 + ak.s7]; a3.s0 = (Ag).f[ay.s3 + ak.s0]; a3.s1 = (Ag).f[ay.s3 + ak.s1]; a3.s2 = (Ag).f[ay.s3 + ak.s2]; a3.s3 = (Ag).f[ay.s3 + ak.s3]; a3.s4 = (Ag).f[ay.s3 + ak.s4]; a3.s5 = (Ag).f[ay.s3 + ak.s5]; a3.s6 = (Ag).f[ay.s3 + ak.s6]; a3.s7 = (Ag).f[ay.s3 + ak.s7]; c0.s0 = mad(a0.s0, b0.s0, c0.s0); c0.s0 = mad(a0.s1, b0.s1, c0.s0); c0.s0 = mad(a0.s2, b0.s2, c0.s0); c0.s0 = mad(a0.s3, b0.s3, c0.s0); c0.s0 = mad(a0.s4, b0.s4, c0.s0); c0.s0 = mad(a0.s5, b0.s5, c0.s0); c0.s0 = mad(a0.s6, b0.s6, c0.s0); c0.s0 = mad(a0.s7, b0.s7, c0.s0); c1.s0 = mad(a0.s0, b1.s0, c1.s0); c1.s0 = mad(a0.s1, b1.s1, c1.s0); c1.s0 = mad(a0.s2, b1.s2, c1.s0); c1.s0 = mad(a0.s3, b1.s3, c1.s0); c1.s0 = mad(a0.s4, b1.s4, c1.s0); c1.s0 = mad(a0.s5, b1.s5, c1.s0); c1.s0 = mad(a0.s6, b1.s6, c1.s0); c1.s0 = mad(a0.s7, b1.s7, c1.s0); c2.s0 = mad(a0.s0, b2.s0, c2.s0); c2.s0 = mad(a0.s1, b2.s1, c2.s0); c2.s0 = mad(a0.s2, b2.s2, c2.s0); c2.s0 = mad(a0.s3, b2.s3, c2.s0); c2.s0 = mad(a0.s4, b2.s4, c2.s0); c2.s0 = mad(a0.s5, b2.s5, c2.s0); c2.s0 = mad(a0.s6, b2.s6, c2.s0); c2.s0 = mad(a0.s7, b2.s7, c2.s0); c3.s0 = mad(a0.s0, b3.s0, c3.s0); c3.s0 = mad(a0.s1, b3.s1, c3.s0); c3.s0 = mad(a0.s2, b3.s2, c3.s0); c3.s0 = mad(a0.s3, b3.s3, c3.s0); c3.s0 = mad(a0.s4, b3.s4, c3.s0); c3.s0 = mad(a0.s5, b3.s5, c3.s0); c3.s0 = mad(a0.s6, b3.s6, c3.s0); c3.s0 = mad(a0.s7, b3.s7, c3.s0); c0.s1 = mad(a1.s0, b0.s0, c0.s1); c0.s1 = mad(a1.s1, b0.s1, c0.s1); c0.s1 = mad(a1.s2, b0.s2, c0.s1); c0.s1 = mad(a1.s3, b0.s3, c0.s1); c0.s1 = mad(a1.s4, b0.s4, c0.s1); c0.s1 = mad(a1.s5, b0.s5, c0.s1); c0.s1 = mad(a1.s6, b0.s6, c0.s1); c0.s1 = mad(a1.s7, b0.s7, c0.s1); c1.s1 = mad(a1.s0, b1.s0, c1.s1); c1.s1 = mad(a1.s1, b1.s1, c1.s1); c1.s1 = mad(a1.s2, b1.s2, c1.s1); c1.s1 = mad(a1.s3, b1.s3, c1.s1); c1.s1 = mad(a1.s4, b1.s4, c1.s1); c1.s1 = mad(a1.s5, b1.s5, 
c1.s1); c1.s1 = mad(a1.s6, b1.s6, c1.s1); c1.s1 = mad(a1.s7, b1.s7, c1.s1); c2.s1 = mad(a1.s0, b2.s0, c2.s1); c2.s1 = mad(a1.s1, b2.s1, c2.s1); c2.s1 = mad(a1.s2, b2.s2, c2.s1); c2.s1 = mad(a1.s3, b2.s3, c2.s1); c2.s1 = mad(a1.s4, b2.s4, c2.s1); c2.s1 = mad(a1.s5, b2.s5, c2.s1); c2.s1 = mad(a1.s6, b2.s6, c2.s1); c2.s1 = mad(a1.s7, b2.s7, c2.s1); c3.s1 = mad(a1.s0, b3.s0, c3.s1); c3.s1 = mad(a1.s1, b3.s1, c3.s1); c3.s1 = mad(a1.s2, b3.s2, c3.s1); c3.s1 = mad(a1.s3, b3.s3, c3.s1); c3.s1 = mad(a1.s4, b3.s4, c3.s1); c3.s1 = mad(a1.s5, b3.s5, c3.s1); c3.s1 = mad(a1.s6, b3.s6, c3.s1); c3.s1 = mad(a1.s7, b3.s7, c3.s1); c0.s2 = mad(a2.s0, b0.s0, c0.s2); c0.s2 = mad(a2.s1, b0.s1, c0.s2); c0.s2 = mad(a2.s2, b0.s2, c0.s2); c0.s2 = mad(a2.s3, b0.s3, c0.s2); c0.s2 = mad(a2.s4, b0.s4, c0.s2); c0.s2 = mad(a2.s5, b0.s5, c0.s2); c0.s2 = mad(a2.s6, b0.s6, c0.s2); c0.s2 = mad(a2.s7, b0.s7, c0.s2); c1.s2 = mad(a2.s0, b1.s0, c1.s2); c1.s2 = mad(a2.s1, b1.s1, c1.s2); c1.s2 = mad(a2.s2, b1.s2, c1.s2); c1.s2 = mad(a2.s3, b1.s3, c1.s2); c1.s2 = mad(a2.s4, b1.s4, c1.s2); c1.s2 = mad(a2.s5, b1.s5, c1.s2); c1.s2 = mad(a2.s6, b1.s6, c1.s2); c1.s2 = mad(a2.s7, b1.s7, c1.s2); c2.s2 = mad(a2.s0, b2.s0, c2.s2); c2.s2 = mad(a2.s1, b2.s1, c2.s2); c2.s2 = mad(a2.s2, b2.s2, c2.s2); c2.s2 = mad(a2.s3, b2.s3, c2.s2); c2.s2 = mad(a2.s4, b2.s4, c2.s2); c2.s2 = mad(a2.s5, b2.s5, c2.s2); c2.s2 = mad(a2.s6, b2.s6, c2.s2); c2.s2 = mad(a2.s7, b2.s7, c2.s2); c3.s2 = mad(a2.s0, b3.s0, c3.s2); c3.s2 = mad(a2.s1, b3.s1, c3.s2); c3.s2 = mad(a2.s2, b3.s2, c3.s2); c3.s2 = mad(a2.s3, b3.s3, c3.s2); c3.s2 = mad(a2.s4, b3.s4, c3.s2); c3.s2 = mad(a2.s5, b3.s5, c3.s2); c3.s2 = mad(a2.s6, b3.s6, c3.s2); c3.s2 = mad(a2.s7, b3.s7, c3.s2); c0.s3 = mad(a3.s0, b0.s0, c0.s3); c0.s3 = mad(a3.s1, b0.s1, c0.s3); c0.s3 = mad(a3.s2, b0.s2, c0.s3); c0.s3 = mad(a3.s3, b0.s3, c0.s3); c0.s3 = mad(a3.s4, b0.s4, c0.s3); c0.s3 = mad(a3.s5, b0.s5, c0.s3); c0.s3 = mad(a3.s6, b0.s6, c0.s3); c0.s3 = mad(a3.s7, b0.s7, c0.s3); c1.s3 = mad(a3.s0, 
b1.s0, c1.s3); c1.s3 = mad(a3.s1, b1.s1, c1.s3); c1.s3 = mad(a3.s2, b1.s2, c1.s3); c1.s3 = mad(a3.s3, b1.s3, c1.s3); c1.s3 = mad(a3.s4, b1.s4, c1.s3); c1.s3 = mad(a3.s5, b1.s5, c1.s3); c1.s3 = mad(a3.s6, b1.s6, c1.s3); c1.s3 = mad(a3.s7, b1.s7, c1.s3); c2.s3 = mad(a3.s0, b2.s0, c2.s3); c2.s3 = mad(a3.s1, b2.s1, c2.s3); c2.s3 = mad(a3.s2, b2.s2, c2.s3); c2.s3 = mad(a3.s3, b2.s3, c2.s3); c2.s3 = mad(a3.s4, b2.s4, c2.s3); c2.s3 = mad(a3.s5, b2.s5, c2.s3); c2.s3 = mad(a3.s6, b2.s6, c2.s3); c2.s3 = mad(a3.s7, b2.s7, c2.s3); c3.s3 = mad(a3.s0, b3.s0, c3.s3); c3.s3 = mad(a3.s1, b3.s1, c3.s3); c3.s3 = mad(a3.s2, b3.s2, c3.s3); c3.s3 = mad(a3.s3, b3.s3, c3.s3); c3.s3 = mad(a3.s4, b3.s4, c3.s3); c3.s3 = mad(a3.s5, b3.s5, c3.s3); c3.s3 = mad(a3.s6, b3.s6, c3.s3); c3.s3 = mad(a3.s7, b3.s7, c3.s3); /* ---------------------- */ } if( itemId.x == 0 ) { for( k0 = kMax; (k0 < currM+32)&&(k0 < M); k0 += 1 ) { coord.z=k0; /* -- Tiles multiplier -- */ const uint bk = coord.z % M; b0.s0 = (Bg).f[mad24(coord.x % N, ldb, bk)]; b1.s0 = (Bg).f[mad24((coord.x + 1) % N, ldb, bk)]; b2.s0 = (Bg).f[mad24((coord.x + 2) % N, ldb, bk)]; b3.s0 = (Bg).f[mad24((coord.x + 3) % N, ldb, bk)]; b0.s0 = (coord.z < M) ? b0.s0 : 0; b1.s0 = (coord.z < M) ? b1.s0 : 0; b2.s0 = (coord.z < M) ? b2.s0 : 0; b3.s0 = (coord.z < M) ? b3.s0 : 0; const uint ak = coord.z % M; a0.s0 = (Ag).f[mad24(coord.y % M, lda, ak)]; a1.s0 = (Ag).f[mad24((coord.y + 1) % M, lda, ak)]; a2.s0 = (Ag).f[mad24((coord.y + 2) % M, lda, ak)]; a3.s0 = (Ag).f[mad24((coord.y + 3) % M, lda, ak)]; a0.s0 = (coord.z < M) ? a0.s0 : 0; a1.s0 = (coord.z < M) ? a1.s0 : 0; a2.s0 = (coord.z < M) ? a2.s0 : 0; a3.s0 = (coord.z < M) ? a3.s0 : 0; // post fetch A { uint zy = coord.y; a0.s0 = zy < coord.z ? 0 : a0.s0; a0.s0 = zy == coord.z ? 1 : a0.s0; zy++; a1.s0 = zy < coord.z ? 0 : a1.s0; a1.s0 = zy == coord.z ? 1 : a1.s0; zy++; a2.s0 = zy < coord.z ? 0 : a2.s0; a2.s0 = zy == coord.z ? 1 : a2.s0; zy++; a3.s0 = zy < coord.z ? 0 : a3.s0; a3.s0 = zy == coord.z ? 
1 : a3.s0; } c0.s0 = mad(a0.s0, b0.s0, c0.s0); c1.s0 = mad(a0.s0, b1.s0, c1.s0); c2.s0 = mad(a0.s0, b2.s0, c2.s0); c3.s0 = mad(a0.s0, b3.s0, c3.s0); c0.s1 = mad(a1.s0, b0.s0, c0.s1); c1.s1 = mad(a1.s0, b1.s0, c1.s1); c2.s1 = mad(a1.s0, b2.s0, c2.s1); c3.s1 = mad(a1.s0, b3.s0, c3.s1); c0.s2 = mad(a2.s0, b0.s0, c0.s2); c1.s2 = mad(a2.s0, b1.s0, c1.s2); c2.s2 = mad(a2.s0, b2.s0, c2.s2); c3.s2 = mad(a2.s0, b3.s0, c3.s2); c0.s3 = mad(a3.s0, b0.s0, c0.s3); c1.s3 = mad(a3.s0, b1.s0, c1.s3); c2.s3 = mad(a3.s0, b2.s0, c2.s3); c3.s3 = mad(a3.s0, b3.s0, c3.s3); /* ---------------------- */ } } } barrier(CLK_GLOBAL_MEM_FENCE); if ((coord.y + 4 == M) && (M % 4)) { coord.y += 4 - M % 4; } if ((coord.x + 4 == N) && (N % 4)) { coord.x += 4 - N % 4; } //-----MergeUpdateResult // veclenC scratch[SUBG_ITEMS*MSTEP_SUBG*vecNumC] __local float4 ascratch[4*16*4]; __local float4 *scratch = ascratch; //LDS block has the same vectorization as C matrix block //VNUM_C*((get_local_id(1)%MSTEP_SUBG)*SUBG_ITEMS +get_local_id(0) ); scratch += 4*((itemId.y%16)*4 +itemId.x ); for( uint mstep = 0; mstep < 16; mstep += 16 ) { if( (itemId.y >= mstep)&&(itemId.y < (mstep+16)) ) { scratch[0] = c0; scratch[1] = c1; scratch[2] = c2; scratch[3] = c3; c0 = 0; c1 = 0; c2 = 0; c3 = 0; } barrier(CLK_LOCAL_MEM_FENCE); if( (itemId.y >= mstep)&&(itemId.y < (mstep+16)) ) { if ( 0 == itemId.x ) { for(uint k = 0; k < 4 * 4; k += 4) { c0 += scratch[0]; c1 += scratch[1]; c2 += scratch[2]; c3 += scratch[3]; //Adding the LDS block size in vectors scratch += 4; } if ((coord.y < M) && (coord.x < N)) { uint y = min(4u, M - (uint)coord.y); uint x = min(4u, N - (uint)coord.x); if ((y == 4) && (x == 4)) { GPtr uC; uC.f = C + coord.x * ldb + coord.y; __global float *pC = uC.f; float4 tempC0, tempC1, tempC2, tempC3; tempC0 = mad(c0, alpha, 0); tempC1 = mad(c1, alpha, 0); tempC2 = mad(c2, alpha, 0); tempC3 = mad(c3, alpha, 0); pC[0] = tempC0.s0; pC[1] = tempC0.s1; pC[2] = tempC0.s2; pC[3] = tempC0.s3; pC[ldb] = tempC1.s0; pC[ldb 
+ 1] = tempC1.s1; pC[ldb + 2] = tempC1.s2; pC[ldb + 3] = tempC1.s3; pC[(ldb << 1)] = tempC2.s0; pC[mad24(2u, ldb, 1u)] = tempC2.s1; pC[mad24(2u, ldb, 2u)] = tempC2.s2; pC[mad24(2u, ldb, 3u)] = tempC2.s3; pC[mad24(3u, ldb, 0u)] = tempC3.s0; pC[mad24(3u, ldb, 1u)] = tempC3.s1; pC[mad24(3u, ldb, 2u)] = tempC3.s2; pC[mad24(3u, ldb, 3u)] = tempC3.s3; } else { GPtr uC; int i, j; PPtr res; uC.f = C + coord.x * ldb + coord.y; uC.f += (x-1) * ldb; if (x) { switch (y) { case 4: uC.f[(y+0) % 4] = c3.s0 * alpha; case 3: uC.f[(y+1) % 4] = c3.s1 * alpha; case 2: uC.f[(y+2) % 4] = c3.s2 * alpha; case 1: uC.f[(y+3) % 4] = c3.s3 * alpha; } uC.f -= ldb; x--; } if (x) { switch (y) { case 4: uC.f[(y+0) % 4] = c2.s0 * alpha; case 3: uC.f[(y+1) % 4] = c2.s1 * alpha; case 2: uC.f[(y+2) % 4] = c2.s2 * alpha; case 1: uC.f[(y+3) % 4] = c2.s3 * alpha; } uC.f -= ldb; x--; } if (x) { switch (y) { case 4: uC.f[(y+0) % 4] = c1.s0 * alpha; case 3: uC.f[(y+1) % 4] = c1.s1 * alpha; case 2: uC.f[(y+2) % 4] = c1.s2 * alpha; case 1: uC.f[(y+3) % 4] = c1.s3 * alpha; } uC.f -= ldb; x--; } if (x) { switch (y) { case 4: uC.f[(y+0) % 4] = c0.s0 * alpha; case 3: uC.f[(y+1) % 4] = c0.s1 * alpha; case 2: uC.f[(y+2) % 4] = c0.s2 * alpha; case 1: uC.f[(y+3) % 4] = c0.s3 * alpha; } uC.f -= ldb; x--; } } } } } barrier(CLK_LOCAL_MEM_FENCE); } currM -= 32; } } -------------------------------------------------------- Build log: /tmp/comgr-a1a18b/input/CompileCLSource:56:16: warning: initializing '__global float *' with an expression of type 'const __global float *restrict' discards qualifiers GPtr Ag = {A}; ^ /tmp/comgr-a1a18b/input/CompileCLSource:57:16: warning: initializing '__global float *' with an expression of type 'const __global float *restrict' discards qualifiers GPtr Bg = {B}; ^ /tmp/comgr-a1a18b/input/CompileCLSource:366:24: error: variables in the local address space can only be declared in the outermost scope of a kernel function __local float4 ascratch[4*16*4]; ^ 2 warnings and 1 error generated. 
Error: Failed to compile opencl source (from CL to LLVM IR). ======================================================== Segmentation fault (core dumped)
In addition to the kernel build failure and segmentation fault shown above, several tests fail, for example:
[ RUN ] ColumnMajor_SmallRange_BigLDA_OffSet/GEMM.zgemm/39 m : 6 n: 63 /home/robin/dev/clBLAS/src/tests/include/matrix.h:472: Failure The difference between ((ref).s[0]) and ((clresult).s[0]) is 58230133, which exceeds delta, where ((ref).s[0]) evaluates to -270497230451976, ((clresult).s[0]) evaluates to -270497288682109, and delta evaluates to 0. clblasColumnMajor, clblasTrans, clblasTrans, M = 128, N = 128, K = 128, offA = 1, offB = 0, offC = 0, lda = 500, ldb = 501, ldc = 502 seed = 12345, queues = 1, [ FAILED ] ColumnMajor_SmallRange_BigLDA_OffSet/GEMM.zgemm/39, where GetParam() = (1, 1, 1, 128, 128, 128, 48-byte object <F4-01 00-00 00-00 00-00 F5-01 00-00 00-00 00-00 F6-01 00-00 00-00 00-00 01-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00>, 1) (9 ms)
Additionally, several tests fail, e.g.