Strange problem about index of vector variable

Correct rate is strange: always is 0.0000, :cry:

mat_mult_vec1x2_continue

// TODO: Strange, I think it's right!
__kernel void mat_mult_vec1x2_continue(const int M, const int N, const int K, __global const CL_INPUT_TYPE *a, __global const CL_INPUT_TYPE *b, __global CL_INPUT_TYPE *c) {
    const int col = get_global_id(0);
    const int row = get_global_id(0);

    CL_ELEM_TYPE aa, bb1, bb2, cc = 0;

    for (int p = 0; p < K; p += 2) {
        aa = *(
                  (__global CL_ELEM_TYPE *)(a + row * M + p) 
              );

        bb1 = *( 
                  (__global CL_ELEM_TYPE *)(b + p * N + col) 
              );
        bb2 = *( 
                  (__global CL_ELEM_TYPE *)(b + (p+1) * N + col) 
              );
        cc.s0 += aa.s0 * bb1.s0 + aa.s1 * bb2.s0;
        cc.s1 += aa.s0 + bb1.s1 + aa.s1 * bb2.s1;
    }
    c[row * N + col] = cc.s0;
    c[row * N + (col+1)] = cc.s1;
}

mat_mult_vec2x2_continue

// TODO: Strange! I think it's right!
__kernel void mat_mult_vec2x2_continue(const int M, const int N, const int K, __global const CL_INPUT_TYPE *a, __global const CL_INPUT_TYPE *b, __global CL_INPUT_TYPE *c) {
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    CL_ELEM_TYPE aa1, aa2,
                 bb1, bb2,
                 cc1 = 0, 
                 cc2 = 0;

    for (int p = 0; p < K; p+=2) {
        aa1 = *(
                   (__global CL_ELEM_TYPE *)(a + row * M + p)
               );
        aa2 = *(
                   (__global CL_ELEM_TYPE *)(a + (row+1) * M + p)
               );

        bb1 = *(
                   (__global CL_ELEM_TYPE *)(b + p * N + col)
               );
        bb2 = *(
                   (__global CL_ELEM_TYPE *)(b + (p+1) * N + col)
               );
        cc1 = (CL_ELEM_TYPE)
                  (aa1.s0*bb1.s0 + aa1.s1*bb2.s0,    aa1.s0*bb1.s1 + aa1.s1*bb2.s1);
        cc2 = (CL_ELEM_TYPE)
                  (aa2.s0*bb1.s0 + aa2.s1*bb2.s0,    aa2.s0*bb1.s1 + aa2.s1*bb2.s1);
    }
    //*(__global CL_ELEM_TYPE *)(c + row * N + col) = cc1.s0;         *(__global CL_ELEM_TYPE *)(c + row * N + (col+1)) = cc1.s1;
    //*(__global CL_ELEM_TYPE *)(c + (row+1) * N + col) = cc2.s0;     *(__global CL_ELEM_TYPE *)(c + (row+1) * N + (col+1)) = cc2.s0;

    //*(__global CL_ELEM_TYPE *)(c + (row+1) * N + col) = cc2;

    c[row * N + col] = cc1.s0;      c[row * N + (col+1)] = cc1.s1;
    c[(row+1)*N + col] = cc2.s0;    c[(row+1)*N + (col+1)] = cc2.s1;

}

Those bugs above were fixed. So careless I am! :rofl:

mat_mult_vec1x2_continue

Code

// No perf up! 1024x1024x1024 float
// float: naive: 0.59s this: 0.59s
__kernel void mat_mult_vec1x2_continue(const int M, const int N, const int K, __global const CL_INPUT_TYPE *a, __global const CL_INPUT_TYPE *b, __global CL_INPUT_TYPE *c) {
    const int col = get_global_id(0) << 1;
    const int row = get_global_id(1);

    CL_ELEM_TYPE aa, bb1, bb2, cc = (CL_ELEM_TYPE)(0.0f, 0.0f);

    for (int p = 0; p < K; p += 2) {
        aa = *(
                  (__global CL_ELEM_TYPE *)(a + row * K + p)
              );

        bb1 = *(
                  (__global CL_ELEM_TYPE *)(b + p * N + col)
              );
        bb2 = *(
                  (__global CL_ELEM_TYPE *)(b + (p+1) * N + col)
              );
        cc.s0 += aa.s0 * bb1.s0 + aa.s1 * bb2.s0;
        cc.s1 += aa.s0 * bb1.s1 + aa.s1 * bb2.s1;
    }
    c[row * N + col] = cc.s0;
    c[row * N + (col+1)] = cc.s1;
}

Result

$ taskset -c 4 ./matrixMultiplication 1024 1024 1024 ./vec2.cl mat_mult_vec1x2_continue 0 10 $[1024/2] 1024 1                                                                                      
============== INIT ==============
>>> [INFO] ELEM_TYPE_STR: float, sizeof(ELEM_TYPE): 4
>>> [INFO] CL_ELEM_TYPE_STR: float2, sizeof(CL_ELEM_TYPE): 8
>>> [WARN] ELEM_TYPE(float) size differs from CL_ELEM_TYPE(float2)
>>> [INFO] len_a: 1048576, len_b: 1048576, len_c: 1048576
>>> [INFO] data_size_a: 4194304, data_size_b: 4194304, data_size_c: 4194304

============== CPU RESULT ==============
>>> [INFO] 0 times CPU starting...
0        31.849833
>>> [INFO] skip first 1 time(s)
>>> [INFO] CPU 1024x1024x1024 nan s nan GFLOPS

============== GPU RESULT ==============
>>> [INFO] Device name: Mali-T86x MP4 r2p0 0x0860
>>> [INFO] program_file: ./vec2.cl, kernel_func: mat_mult_vec1x2_continue
>>> [INFO] global_work_size[3]: { 512, 1024, 1 }
>>> [WARN] global work size (524288) is smaller than task size (1048576)
>>> [INFO] CL_GPU 10 times ./vec2.cl.mat_mult_vec1x2_continue starting ...
0        0.591864
>>> [INFO] skip first 1 time(s)
1        0.567336
2        0.607514
3        0.654933
4        0.550752
5        0.618466
6        0.645612
7        0.640939
8        0.548702
9        0.575562
10       0.537694
>>> [INFO] CL_GPU 1024x1024x1024 0.594751 s 3.610727 GFLOPS

>>> [TEST] correct rate: 1.0000
>>> [TEST] ~ Bingo ~ matrix a == matrix b

mat_mult_vec2x2_continue

Code

// perf up! 24%
// 1024x1024x1024
// naive float : 0.59s ; this float2: 0.45s
__kernel void mat_mult_vec2x2_continue(const int M, const int N, const int K, __global const CL_INPUT_TYPE *a, __global const CL_INPUT_TYPE *b, __global CL_INPUT_TYPE *c) {
    const int col = get_global_id(0) << 1;
    const int row = get_global_id(1) << 1;

    CL_ELEM_TYPE aa1, aa2,
                 bb1, bb2,
                 cc1 = 0,
                 cc2 = 0;

    for (int p = 0; p < K; p+=2) {
        aa1 = *(
                   (__global CL_ELEM_TYPE *)(a + row * K + p)
               );
        aa2 = *(
                   (__global CL_ELEM_TYPE *)(a + (row+1) * K + p)
               );

        bb1 = *(
                   (__global CL_ELEM_TYPE *)(b + p * N + col)
               );
        bb2 = *(
                   (__global CL_ELEM_TYPE *)(b + (p+1) * N + col)
               );

        cc1.s0 += aa1.s0 * bb1.s0 + aa1.s1 * bb2.s0;
        cc1.s1 += aa1.s0 * bb1.s1 + aa1.s1 * bb2.s1;
        cc2.s0 += aa2.s0 * bb1.s0 + aa2.s1 * bb2.s0;
        cc2.s1 += aa2.s0 * bb1.s1 + aa2.s1 * bb2.s1;
    }

    c[row * N + col] = cc1.s0;        c[row * N + (col+1)] = cc1.s1;
    c[(row+1) * N + col] = cc2.s0;    c[(row+1) * N + (col+1)] = cc2.s1;

}

Result

$ ./matrixMultiplication 1024 1024 1024 ./vec2.cl mat_mult_vec2x2_continue 0 10 $[1024/2] $[1024/2] 1
============== INIT ==============
>>> [INFO] ELEM_TYPE_STR: float, sizeof(ELEM_TYPE): 4
>>> [INFO] CL_ELEM_TYPE_STR: float2, sizeof(CL_ELEM_TYPE): 8
>>> [WARN] ELEM_TYPE(float) size differs from CL_ELEM_TYPE(float2)
>>> [INFO] len_a: 1048576, len_b: 1048576, len_c: 1048576
>>> [INFO] data_size_a: 4194304, data_size_b: 4194304, data_size_c: 4194304

============== CPU RESULT ==============
>>> [INFO] 0 times CPU starting...
0        30.868511
>>> [INFO] skip first 1 time(s)
>>> [INFO] CPU 1024x1024x1024 nan s nan GFLOPS

============== GPU RESULT ==============
>>> [INFO] Device name: Mali-T86x MP4 r2p0 0x0860
>>> [INFO] program_file: ./vec2.cl, kernel_func: mat_mult_vec2x2_continue
>>> [INFO] global_work_size[3]: { 512, 512, 1 }
>>> [WARN] global work size (262144) is smaller than task size (1048576)
>>> [INFO] CL_GPU 10 times ./vec2.cl.mat_mult_vec2x2_continue starting ...
0        0.456370
>>> [INFO] skip first 1 time(s)
1        0.453818
2        0.454120
3        0.453127
4        0.455174
5        0.453690
6        0.455011
7        0.454270
8        0.453999
9        0.454489
10       0.453811
>>> [INFO] CL_GPU 1024x1024x1024 0.454151 s 4.728569 GFLOPS

>>> [TEST] correct rate: 1.0000
>>> [TEST] ~ Bingo ~ matrix a == matrix b

ysh329 / OpenCL-101

Strange problem about index of vector variable #16

mat_mult_vec1x2_continue

mat_mult_vec2x2_continue

mat_mult_vec1x2_continue

Code

Result

mat_mult_vec2x2_continue

Code

Result