clMathLibraries / clBLAS

a software library containing BLAS functions written in OpenCL
Apache License 2.0
839 stars 240 forks source link

blas 3 matrix multiplication kernel build error when called from ArrayFire #59

Closed Kerkil closed 9 years ago

Kerkil commented 9 years ago

I am indirectly calling the clBLAS gemm function from ArrayFire. I found that ArrayFire's matmul function crashes when it calls clBLAS's gemmS function. So, I enabled the build error log in clBLAS and noticed that there is a build error in clBLAS.

When I run the matrix multiplication example in the clBLAS repo directly, everything works just fine.

What could be the issue here? (I am posting this as an ArrayFire issue as well.)


device name = HD Graphics 4000
error = 0
memory pattern = Cached global memory based block gemm, computing kernel generator
Subproblem dimensions: dims[0].itemY = 8, dims[0].itemX = 8, dims[0].y = 8, dims[0].x = 8, dims[0].bwidth = 1; ; dims[1].itemY = 1, dims[1].itemX = 1, dims[1].y = 1, dims[1].x = 1, dims[1].bwidth = 1; ; 
Parallelism granularity: pgran->wgDim = 2, pgran->wgSize[0] = 8, pgran->wgSize[1] = 8, pgran->wfSize = 64
Kernel extra flags: 31508

/*
 * Union of pointers into the __global address space, one member per float
 * vector width (scalar, 2, 4, 8, 16). All members share the same storage,
 * so one address can be read back through any of the element types.
 * The kernel text in this dump only uses the scalar member `f`.
 */
typedef union GPtr {
    __global float *f;       /* scalar float view */
    __global float2 *f2v;    /* float2 view */
    __global float4 *f4v;    /* float4 view */
    __global float8 *f8v;    /* float8 view */
    __global float16 *f16v;  /* float16 view */
} GPtr;

/*
 * Union of pointers into the __local address space, one member per float
 * vector width (scalar, 2, 4, 8, 16). All members share the same storage.
 * Not referenced by the kernel text shown in this dump.
 */
typedef union LPtr {
    __local float *f;       /* scalar float view */
    __local float2 *f2v;    /* float2 view */
    __local float4 *f4v;    /* float4 view */
    __local float8 *f8v;    /* float8 view */
    __local float16 *f16v;  /* float16 view */
} LPtr;

/*
 * Union of float pointers declared without an address-space qualifier, one
 * member per float vector width (scalar, 2, 4, 8, 16). All members share the
 * same storage.
 * NOTE(review): presumably intended for private memory, going by the name and
 * the parallel GPtr/LPtr unions; confirm against the OpenCL version in use.
 * Not referenced by the kernel text shown in this dump.
 */
typedef union PPtr {
    float *f;       /* scalar float view */
    float2 *f2v;    /* float2 view */
    float4 *f4v;    /* float4 view */
    float8 *f8v;    /* float8 view */
    float16 *f16v;  /* float16 view */
} PPtr;

/*
 * Generated single-precision matrix-multiply kernel, as dumped by the build
 * log. Each work-item accumulates one dot product of length K, reading A with
 * stride lda and B with stride ldb, and stores c0 * alpha to one element of C.
 *
 * NOTE(review): the text below looks incomplete as pasted. The kernel name
 * and opening parenthesis after `void __kernel`, the opening brace of the
 * body, the body and closing brace of the bounds-check `if`, and the closing
 * braces of the `for` loop and of the kernel are all absent. They were
 * presumably lost when the page was captured; the code is kept exactly as
 * posted rather than guessed at.
 *
 * Parameters:
 *   M, N       - upper bounds checked against coord.y and coord.x.
 *   K          - trip count of the accumulation loop.
 *   alpha      - scale applied to the accumulated sum before the store.
 *   beta       - declared but not read in the lines shown.
 *   A, B       - read-only input buffers in __global memory.
 *   C          - output buffer in __global memory.
 *   lda, ldb   - per-iteration pointer strides for A and B.
 *   ldc        - stride multiplying coord.x when addressing C.
 */
/* Work-group size fixed at 8x8x1. */
__attribute__((reqd_work_group_size(8, 8, 1)))
void __kernel
    uint M,
    uint N,
    uint K,
    const float alpha,
    const float beta,
    const __global float *restrict A,
    const __global float *restrict B,
    __global float *C,
    uint lda,
    uint ldb,
    uint ldc)
    float a0;
    float b0;
    float c0;
    uint4 coord = 0u; /* contains coordB, coordA, k */

    uint kif;
    uint get_group_id_1;
    uint get_global_id_1;
    /* Offset A by this work-item's global id along dimension 0. */
    A += (uint)get_global_id(0);
    /* Remapped group id for dimension 1: shifted by the dimension-0 group id, modulo the group count. */
    get_group_id_1 = (get_group_id(0) + get_group_id(1))% get_num_groups(1);
    /* Global id along dimension 1 rebuilt from the remapped group id. */
    get_global_id_1 = get_group_id_1 * get_local_size(1) + get_local_id(1);
    /* kif is 1 when N is not a multiple of 512, otherwise 0. */
    kif = (N % 512 != 0);
    /* Branch-free select: plain get_global_id(1) when kif == 1, remapped id when kif == 0. */
    get_global_id_1 = (kif*(uint)get_global_id(1)) + ((1-kif)*get_global_id_1);
    /* Offset B by the selected dimension-1 id. */
    B += get_global_id_1;
    coord.y = 1u * (uint)get_global_id(0);
    coord.x = 1u * (uint)get_global_id_1;
    /* Bounds check against M and N; NOTE(review): its body (presumably an early return) is missing from the paste. */
    if ((coord.y >= M) || (coord.x >= N)) {

    c0 = 0;

    /* Accumulate a0 * b0 over K steps, advancing A by lda and B by ldb each step. */
    for (uint k1 = 0; k1 < K; k1 += 1) {
        /* -- Tiles multiplier -- */
        b0 = B[0];

        a0 = A[0];

        c0 += a0 * b0;

        A += lda;
        B += ldb;
        /* ---------------------- */

    GPtr uC;

    /* Address of the output element: coord.x scaled by ldc, plus coord.y. */
    uC.f = C + coord.x * ldc + coord.y;

    __global float *pC = uC.f;

    float tempC0;

    /* Scale by alpha; the added term is the literal 0 and beta is not used here. */
    tempC0 = c0 * alpha + 0;
    pC[0] = tempC0;


Build log:


The program has unexpectedly finished.
kerkilchoi commented 9 years ago

Never mind. It seems this was my mistake: I noticed that the error value is 0.