Closed ysh329 closed 6 years ago
temporarily closed.
Because the matrices are stored in row-major order, we can load vector-typed variables to reduce the number of buffer reads, as shown below:
// perf up! 24%
// 1024x1024x1024 0.458464 s 4.684079 GFLOPS
// naive float : 0.59s ; this float2: 0.45s
/*
 * Matrix multiply C = A * B in which each work-item produces one 2x2 tile of C.
 *
 * All three matrices are indexed row-major: A is M x K, B is K x N and
 * C is M x N. Work-item (gx, gy) owns rows 2*gy, 2*gy+1 and columns
 * 2*gx, 2*gx+1 of C. Each loop iteration fetches a 2x2 tile of A and a 2x2
 * tile of B with four two-element vector loads and accumulates their product.
 *
 * Work-items whose tile does not lie completely inside C return without
 * touching memory, so the global work size may be rounded up past
 * (N/2, M/2).
 *
 * Preconditions (not verified by the kernel):
 *   - M, N and K are even. The K loop consumes two elements per step, and an
 *     odd M or N leaves the last row/column of C unwritten.
 *   - a, b and c are aligned for CL_ELEM_TYPE; the loads reinterpret pairs of
 *     adjacent CL_INPUT_TYPE elements as one CL_ELEM_TYPE.
 *   - NOTE(review): CL_ELEM_TYPE is assumed to be the two-component vector of
 *     CL_INPUT_TYPE (e.g. float2 for float); confirm against the build
 *     options that define both macros.
 */
__kernel void mat_mult_vec2x2_continue(const int M, const int N, const int K, __global const CL_INPUT_TYPE *a, __global const CL_INPUT_TYPE *b, __global CL_INPUT_TYPE *c) {
    /* Top-left corner of the 2x2 output tile owned by this work-item. */
    const int col = get_global_id(0) << 1;
    const int row = get_global_id(1) << 1;

    /* Skip tiles that are not fully inside C; without this guard such
     * work-items would read and write out of bounds. */
    if (row + 1 >= M || col + 1 >= N) {
        return;
    }

    /* Loop-invariant bases: the two rows of A, and the column offset into B. */
    __global const CL_INPUT_TYPE *a_row0 = a + row * K;
    __global const CL_INPUT_TYPE *a_row1 = a + (row + 1) * K;
    __global const CL_INPUT_TYPE *b_col = b + col;

    /* cc1 accumulates output row `row`, cc2 accumulates output row `row + 1`. */
    CL_ELEM_TYPE cc1 = 0,
                 cc2 = 0;

    for (int p = 0; p < K; p += 2) {
        /* A[row][p..p+1] and A[row+1][p..p+1]. */
        const CL_ELEM_TYPE aa1 = *((__global const CL_ELEM_TYPE *)(a_row0 + p));
        const CL_ELEM_TYPE aa2 = *((__global const CL_ELEM_TYPE *)(a_row1 + p));
        /* B[p][col..col+1] and B[p+1][col..col+1]. */
        const CL_ELEM_TYPE bb1 = *((__global const CL_ELEM_TYPE *)(b_col + p * N));
        const CL_ELEM_TYPE bb2 = *((__global const CL_ELEM_TYPE *)(b_col + (p + 1) * N));

        /* 2x2 by 2x2 tile product, accumulated into the output tile. */
        cc1.s0 += aa1.s0 * bb1.s0 + aa1.s1 * bb2.s0;
        cc1.s1 += aa1.s0 * bb1.s1 + aa1.s1 * bb2.s1;
        cc2.s0 += aa2.s0 * bb1.s0 + aa2.s1 * bb2.s0;
        cc2.s1 += aa2.s0 * bb1.s1 + aa2.s1 * bb2.s1;
    }

    /* Store the finished tile, one scalar per element. */
    c[row * N + col] = cc1.s0;
    c[row * N + (col + 1)] = cc1.s1;
    c[(row + 1) * N + col] = cc2.s0;
    c[(row + 1) * N + (col + 1)] = cc2.s1;
}