Closed ysh329 closed 7 years ago
Those bugs above were fixed. So careless I am! :rofl:
// No perf up! 1024x1024x1024 float
// float: naive: 0.59s this: 0.59s
__kernel void mat_mult_vec1x2_continue(const int M, const int N, const int K, __global const CL_INPUT_TYPE *a, __global const CL_INPUT_TYPE *b, __global CL_INPUT_TYPE *c) {
const int col = get_global_id(0) << 1;
const int row = get_global_id(1);
CL_ELEM_TYPE aa, bb1, bb2, cc = (CL_ELEM_TYPE)(0.0f, 0.0f);
for (int p = 0; p < K; p += 2) {
aa = *(
(__global CL_ELEM_TYPE *)(a + row * K + p)
);
bb1 = *(
(__global CL_ELEM_TYPE *)(b + p * N + col)
);
bb2 = *(
(__global CL_ELEM_TYPE *)(b + (p+1) * N + col)
);
cc.s0 += aa.s0 * bb1.s0 + aa.s1 * bb2.s0;
cc.s1 += aa.s0 * bb1.s1 + aa.s1 * bb2.s1;
}
c[row * N + col] = cc.s0;
c[row * N + (col+1)] = cc.s1;
}
$ taskset -c 4 ./matrixMultiplication 1024 1024 1024 ./vec2.cl mat_mult_vec1x2_continue 0 10 $[1024/2] 1024 1
============== INIT ==============
>>> [INFO] ELEM_TYPE_STR: float, sizeof(ELEM_TYPE): 4
>>> [INFO] CL_ELEM_TYPE_STR: float2, sizeof(CL_ELEM_TYPE): 8
>>> [WARN] ELEM_TYPE(float) size differs from CL_ELEM_TYPE(float2)
>>> [INFO] len_a: 1048576, len_b: 1048576, len_c: 1048576
>>> [INFO] data_size_a: 4194304, data_size_b: 4194304, data_size_c: 4194304
============== CPU RESULT ==============
>>> [INFO] 0 times CPU starting...
0 31.849833
>>> [INFO] skip first 1 time(s)
>>> [INFO] CPU 1024x1024x1024 nan s nan GFLOPS
============== GPU RESULT ==============
>>> [INFO] Device name: Mali-T86x MP4 r2p0 0x0860
>>> [INFO] program_file: ./vec2.cl, kernel_func: mat_mult_vec1x2_continue
>>> [INFO] global_work_size[3]: { 512, 1024, 1 }
>>> [WARN] global work size (524288) is smaller than task size (1048576)
>>> [INFO] CL_GPU 10 times ./vec2.cl.mat_mult_vec1x2_continue starting ...
0 0.591864
>>> [INFO] skip first 1 time(s)
1 0.567336
2 0.607514
3 0.654933
4 0.550752
5 0.618466
6 0.645612
7 0.640939
8 0.548702
9 0.575562
10 0.537694
>>> [INFO] CL_GPU 1024x1024x1024 0.594751 s 3.610727 GFLOPS
>>> [TEST] correct rate: 1.0000
>>> [TEST] ~ Bingo ~ matrix a == matrix b
// perf up! 24%
// 1024x1024x1024
// naive float : 0.59s ; this float2: 0.45s
__kernel void mat_mult_vec2x2_continue(const int M, const int N, const int K, __global const CL_INPUT_TYPE *a, __global const CL_INPUT_TYPE *b, __global CL_INPUT_TYPE *c) {
const int col = get_global_id(0) << 1;
const int row = get_global_id(1) << 1;
CL_ELEM_TYPE aa1, aa2,
bb1, bb2,
cc1 = 0,
cc2 = 0;
for (int p = 0; p < K; p+=2) {
aa1 = *(
(__global CL_ELEM_TYPE *)(a + row * K + p)
);
aa2 = *(
(__global CL_ELEM_TYPE *)(a + (row+1) * K + p)
);
bb1 = *(
(__global CL_ELEM_TYPE *)(b + p * N + col)
);
bb2 = *(
(__global CL_ELEM_TYPE *)(b + (p+1) * N + col)
);
cc1.s0 += aa1.s0 * bb1.s0 + aa1.s1 * bb2.s0;
cc1.s1 += aa1.s0 * bb1.s1 + aa1.s1 * bb2.s1;
cc2.s0 += aa2.s0 * bb1.s0 + aa2.s1 * bb2.s0;
cc2.s1 += aa2.s0 * bb1.s1 + aa2.s1 * bb2.s1;
}
c[row * N + col] = cc1.s0; c[row * N + (col+1)] = cc1.s1;
c[(row+1) * N + col] = cc2.s0; c[(row+1) * N + (col+1)] = cc2.s1;
}
$ ./matrixMultiplication 1024 1024 1024 ./vec2.cl mat_mult_vec2x2_continue 0 10 $[1024/2] $[1024/2] 1
============== INIT ==============
>>> [INFO] ELEM_TYPE_STR: float, sizeof(ELEM_TYPE): 4
>>> [INFO] CL_ELEM_TYPE_STR: float2, sizeof(CL_ELEM_TYPE): 8
>>> [WARN] ELEM_TYPE(float) size differs from CL_ELEM_TYPE(float2)
>>> [INFO] len_a: 1048576, len_b: 1048576, len_c: 1048576
>>> [INFO] data_size_a: 4194304, data_size_b: 4194304, data_size_c: 4194304
============== CPU RESULT ==============
>>> [INFO] 0 times CPU starting...
0 30.868511
>>> [INFO] skip first 1 time(s)
>>> [INFO] CPU 1024x1024x1024 nan s nan GFLOPS
============== GPU RESULT ==============
>>> [INFO] Device name: Mali-T86x MP4 r2p0 0x0860
>>> [INFO] program_file: ./vec2.cl, kernel_func: mat_mult_vec2x2_continue
>>> [INFO] global_work_size[3]: { 512, 512, 1 }
>>> [WARN] global work size (262144) is smaller than task size (1048576)
>>> [INFO] CL_GPU 10 times ./vec2.cl.mat_mult_vec2x2_continue starting ...
0 0.456370
>>> [INFO] skip first 1 time(s)
1 0.453818
2 0.454120
3 0.453127
4 0.455174
5 0.453690
6 0.455011
7 0.454270
8 0.453999
9 0.454489
10 0.453811
>>> [INFO] CL_GPU 1024x1024x1024 0.454151 s 4.728569 GFLOPS
>>> [TEST] correct rate: 1.0000
>>> [TEST] ~ Bingo ~ matrix a == matrix b
Correct rate is strange: always is 0.0000, :cry:
mat_mult_vec1x2_continue
mat_mult_vec2x2_continue