ddemidov / vexcl

VexCL is a C++ vector expression template library for OpenCL/CUDA/OpenMP
http://vexcl.readthedocs.org
MIT License
702 stars 82 forks source link

Not working sorting by key with longer integers and OpenGL #215

Open ghost opened 8 years ago

ghost commented 8 years ago

Hello. I trying add vexcl sorting with OpenGL buffers, that also uses int64. https://github.com/acterhd/magnum-tracer/blob/master/include/tobject.hpp

I tried, but not working sorting for opengl buffers.

Although I kind of respect all the laws of bits and bytes. And again, I try with the structures but not c ++ compiled with them at all. So replaced with long2.

I thinking and means that long == int64_t. Also, I'm sure that SSBO and opencl buffer are equivalent.

ddemidov commented 8 years ago

I am sorry, I have little experience with OpenCL to OpenGL interaction. Also, I don't really understand what you are saying here. Could you provide a minimal, single-file example demonstrating the problem that I could compile and test (on linux)?

I thinking and means that long == int64_t.

sizeof(cl_long) == sizeof(int64_t), but on Windows, in host code, sizeof(long) == sizeof(int32_t). Could that be a problem?

ghost commented 8 years ago

Error and code.

``` bool comp ( int a1, long a2, int b1, long b2 ) { return (a1 == b1) ? (a2 < b2) : (a1 < b1); } int4 find_mergesort_frame ( int coop, int block, int nv ) { int start = ~(coop - 1) & block; int size = nv * (coop>> 1); int4 frame; frame.x = nv * start; frame.y = nv * start + size; frame.z = size; return frame; } int4 find_mergesort_interval ( int4 frame, int coop, int block, int nv, int count, int mp0, int mp1 ) { int diag = nv * block - frame.x; int4 interval; interval.x = frame.x + mp0; interval.y = min(count, frame.x + mp1); interval.z = min(count, frame.y + diag - mp0); interval.w = min(count, frame.y + diag + nv - mp1); if(coop - 1 == ((coop - 1) & block)) { interval.y = min(count, frame.x + frame.z); interval.w = min(count, frame.y + frame.z); } return interval; } int4 compute_merge_range ( int a_count, int b_count, int block, int coop, int nv, global const int * mp_global ) { int mp0 = mp_global[block]; int mp1 = mp_global[block + 1]; int gid = nv * block; int4 range; if(coop) { int4 frame = find_mergesort_frame(coop, block, nv); range = find_mergesort_interval(frame, coop, block, nv, a_count, mp0, mp1); } else { range.x = mp0; range.y = mp1; range.z = gid - range.x; range.w = min(a_count + b_count, gid + nv) - range.y; } return range; } void global_to_regstr_pred_256_11_long2 ( int count, global const long2 * data, int tid, long2 * reg ) { int index; index = 0 + tid; if (index < count) reg[0] = data[index]; index = 256 + tid; if (index < count) reg[1] = data[index]; index = 512 + tid; if (index < count) reg[2] = data[index]; index = 768 + tid; if (index < count) reg[3] = data[index]; index = 1024 + tid; if (index < count) reg[4] = data[index]; index = 1280 + tid; if (index < count) reg[5] = data[index]; index = 1536 + tid; if (index < count) reg[6] = data[index]; index = 1792 + tid; if (index < count) reg[7] = data[index]; index = 2048 + tid; if (index < count) reg[8] = data[index]; index = 2304 + tid; if (index < count) reg[9] = data[index]; index = 2560 + tid; if (index < count) reg[10] = data[index]; } void global_to_regstr_256_11_long2 ( int count, global const long2 * data, int tid, long2 * reg ) { if (count >= 2816) { reg[0] = data[0 + tid]; reg[1] = data[256 + tid]; reg[2] = data[512 + tid]; reg[3] = data[768 + tid]; reg[4] = data[1024 + tid]; reg[5] = data[1280 + tid]; reg[6] = data[1536 + tid]; reg[7] = data[1792 + tid]; reg[8] = data[2048 + tid]; reg[9] = data[2304 + tid]; reg[10] = data[2560 + tid]; } else global_to_regstr_pred_256_11_long2(count, data, tid, reg); } void regstr_to_global_256_11_long2 ( int count, const long2 * reg, int tid, global long2 * dest ) { int index; index = 0 + tid; if (index < count) dest[index] = reg[0]; index = 256 + tid; if (index < count) dest[index] = reg[1]; index = 512 + tid; if (index < count) dest[index] = reg[2]; index = 768 + tid; if (index < count) dest[index] = reg[3]; index = 1024 + tid; if (index < count) dest[index] = reg[4]; index = 1280 + tid; if (index < count) dest[index] = reg[5]; index = 1536 + tid; if (index < count) dest[index] = reg[6]; index = 1792 + tid; if (index < count) dest[index] = reg[7]; index = 2048 + tid; if (index < count) dest[index] = reg[8]; index = 2304 + tid; if (index < count) dest[index] = reg[9]; index = 2560 + tid; if (index < count) dest[index] = reg[10]; barrier(CLK_LOCAL_MEM_FENCE); } void shared_to_regstr_256_11_long2 ( local const long2 * data, int tid, long2 * reg ) { reg[0] = data[0 + tid]; reg[1] = data[256 + tid]; reg[2] = data[512 + tid]; reg[3] = data[768 + tid]; reg[4] = data[1024 + tid]; reg[5] = data[1280 + tid]; reg[6] = data[1536 + tid]; reg[7] = data[1792 + tid]; reg[8] = data[2048 + tid]; reg[9] = data[2304 + tid]; reg[10] = data[2560 + tid]; barrier(CLK_LOCAL_MEM_FENCE); } void regstr_to_shared_256_11_long2 ( const long2 * reg, int tid, local long2 * dest ) { dest[0 + tid] = reg[0]; dest[256 + tid] = reg[1]; dest[512 + tid] = reg[2]; dest[768 + tid] = reg[3]; dest[1024 + tid] = reg[4]; dest[1280 + tid] = reg[5]; dest[1536 + tid] = reg[6]; dest[1792 + tid] = reg[7]; dest[2048 + tid] = reg[8]; dest[2304 + tid] = reg[9]; dest[2560 + tid] = reg[10]; barrier(CLK_LOCAL_MEM_FENCE); } void global_to_shared_256_11_long2 ( int count, global const long2 * source, int tid, local long2 * dest ) { long2 reg[11]; global_to_regstr_256_11_long2(count, source, tid, reg); regstr_to_shared_256_11_long2(reg, tid, dest); } void shared_to_global_256_11_long2 ( int count, local const long2 * source, int tid, global long2 * dest ) { int index; index = 0 + tid; if (index < count) dest[index] = source[index]; index = 256 + tid; if (index < count) dest[index] = source[index]; index = 512 + tid; if (index < count) dest[index] = source[index]; index = 768 + tid; if (index < count) dest[index] = source[index]; index = 1024 + tid; if (index < count) dest[index] = source[index]; index = 1280 + tid; if (index < count) dest[index] = source[index]; index = 1536 + tid; if (index < count) dest[index] = source[index]; index = 1792 + tid; if (index < count) dest[index] = source[index]; index = 2048 + tid; if (index < count) dest[index] = source[index]; index = 2304 + tid; if (index < count) dest[index] = source[index]; index = 2560 + tid; if (index < count) dest[index] = source[index]; barrier(CLK_LOCAL_MEM_FENCE); } void shared_to_thread_11_long2 ( local const long2 * data, int tid, long2 * reg ) { reg[0] = data[11 * tid + 0]; reg[1] = data[11 * tid + 1]; reg[2] = data[11 * tid + 2]; reg[3] = data[11 * tid + 3]; reg[4] = data[11 * tid + 4]; reg[5] = data[11 * tid + 5]; reg[6] = data[11 * tid + 6]; reg[7] = data[11 * tid + 7]; reg[8] = data[11 * tid + 8]; reg[9] = data[11 * tid + 9]; reg[10] = data[11 * tid + 10]; barrier(CLK_LOCAL_MEM_FENCE); } void thread_to_shared_11_long2 ( const long2 * reg, int tid, local long2 * dest ) { dest[11 * tid + 0] = reg[0]; dest[11 * tid + 1] = reg[1]; dest[11 * tid + 2] = reg[2]; dest[11 * tid + 3] = reg[3]; dest[11 * tid + 4] = reg[4]; dest[11 * tid + 5] = reg[5]; dest[11 * tid + 6] = reg[6]; dest[11 * tid + 7] = reg[7]; dest[11 * tid + 8] = reg[8]; dest[11 * tid + 9] = reg[9]; dest[11 * tid + 10] = reg[10]; barrier(CLK_LOCAL_MEM_FENCE); } void global_to_regstr_pred_256_11_long ( int count, global const long * data, int tid, long * reg ) { int index; index = 0 + tid; if (index < count) reg[0] = data[index]; index = 256 + tid; if (index < count) reg[1] = data[index]; index = 512 + tid; if (index < count) reg[2] = data[index]; index = 768 + tid; if (index < count) reg[3] = data[index]; index = 1024 + tid; if (index < count) reg[4] = data[index]; index = 1280 + tid; if (index < count) reg[5] = data[index]; index = 1536 + tid; if (index < count) reg[6] = data[index]; index = 1792 + tid; if (index < count) reg[7] = data[index]; index = 2048 + tid; if (index < count) reg[8] = data[index]; index = 2304 + tid; if (index < count) reg[9] = data[index]; index = 2560 + tid; if (index < count) reg[10] = data[index]; } void global_to_regstr_256_11_long ( int count, global const long * data, int tid, long * reg ) { if (count >= 2816) { reg[0] = data[0 + tid]; reg[1] = data[256 + tid]; reg[2] = data[512 + tid]; reg[3] = data[768 + tid]; reg[4] = data[1024 + tid]; reg[5] = data[1280 + tid]; reg[6] = data[1536 + tid]; reg[7] = data[1792 + tid]; reg[8] = data[2048 + tid]; reg[9] = data[2304 + tid]; reg[10] = data[2560 + tid]; } else global_to_regstr_pred_256_11_long(count, data, tid, reg); } void regstr_to_global_256_11_long ( int count, const long * reg, int tid, global long * dest ) { int index; index = 0 + tid; if (index < count) dest[index] = reg[0]; index = 256 + tid; if (index < count) dest[index] = reg[1]; index = 512 + tid; if (index < count) dest[index] = reg[2]; index = 768 + tid; if (index < count) dest[index] = reg[3]; index = 1024 + tid; if (index < count) dest[index] = reg[4]; index = 1280 + tid; if (index < count) dest[index] = reg[5]; index = 1536 + tid; if (index < count) dest[index] = reg[6]; index = 1792 + tid; if (index < count) dest[index] = reg[7]; index = 2048 + tid; if (index < count) dest[index] = reg[8]; index = 2304 + tid; if (index < count) dest[index] = reg[9]; index = 2560 + tid; if (index < count) dest[index] = reg[10]; barrier(CLK_LOCAL_MEM_FENCE); } void shared_to_regstr_256_11_long ( local const long * data, int tid, long * reg ) { reg[0] = data[0 + tid]; reg[1] = data[256 + tid]; reg[2] = data[512 + tid]; reg[3] = data[768 + tid]; reg[4] = data[1024 + tid]; reg[5] = data[1280 + tid]; reg[6] = data[1536 + tid]; reg[7] = data[1792 + tid]; reg[8] = data[2048 + tid]; reg[9] = data[2304 + tid]; reg[10] = data[2560 + tid]; barrier(CLK_LOCAL_MEM_FENCE); } void regstr_to_shared_256_11_long ( const long * reg, int tid, local long * dest ) { dest[0 + tid] = reg[0]; dest[256 + tid] = reg[1]; dest[512 + tid] = reg[2]; dest[768 + tid] = reg[3]; dest[1024 + tid] = reg[4]; dest[1280 + tid] = reg[5]; dest[1536 + tid] = reg[6]; dest[1792 + tid] = reg[7]; dest[2048 + tid] = reg[8]; dest[2304 + tid] = reg[9]; dest[2560 + tid] = reg[10]; barrier(CLK_LOCAL_MEM_FENCE); } void global_to_shared_256_11_long ( int count, global const long * source, int tid, local long * dest ) { long reg[11]; global_to_regstr_256_11_long(count, source, tid, reg); regstr_to_shared_256_11_long(reg, tid, dest); } void shared_to_global_256_11_long ( int count, local const long * source, int tid, global long * dest ) { int index; index = 0 + tid; if (index < count) dest[index] = source[index]; index = 256 + tid; if (index < count) dest[index] = source[index]; index = 512 + tid; if (index < count) dest[index] = source[index]; index = 768 + tid; if (index < count) dest[index] = source[index]; index = 1024 + tid; if (index < count) dest[index] = source[index]; index = 1280 + tid; if (index < count) dest[index] = source[index]; index = 1536 + tid; if (index < count) dest[index] = source[index]; index = 1792 + tid; if (index < count) dest[index] = source[index]; index = 2048 + tid; if (index < count) dest[index] = source[index]; index = 2304 + tid; if (index < count) dest[index] = source[index]; index = 2560 + tid; if (index < count) dest[index] = source[index]; barrier(CLK_LOCAL_MEM_FENCE); } void shared_to_thread_11_long ( local const long * data, int tid, long * reg ) { reg[0] = data[11 * tid + 0]; reg[1] = data[11 * tid + 1]; reg[2] = data[11 * tid + 2]; reg[3] = data[11 * tid + 3]; reg[4] = data[11 * tid + 4]; reg[5] = data[11 * tid + 5]; reg[6] = data[11 * tid + 6]; reg[7] = data[11 * tid + 7]; reg[8] = data[11 * tid + 8]; reg[9] = data[11 * tid + 9]; reg[10] = data[11 * tid + 10]; barrier(CLK_LOCAL_MEM_FENCE); } void thread_to_shared_11_long ( const long * reg, int tid, local long * dest ) { dest[11 * tid + 0] = reg[0]; dest[11 * tid + 1] = reg[1]; dest[11 * tid + 2] = reg[2]; dest[11 * tid + 3] = reg[3]; dest[11 * tid + 4] = reg[4]; dest[11 * tid + 5] = reg[5]; dest[11 * tid + 6] = reg[6]; dest[11 * tid + 7] = reg[7]; dest[11 * tid + 8] = reg[8]; dest[11 * tid + 9] = reg[9]; dest[11 * tid + 10] = reg[10]; barrier(CLK_LOCAL_MEM_FENCE); } void global_to_regstr_pred_256_11_int ( int count, global const int * data, int tid, int * reg ) { int index; index = 0 + tid; if (index < count) reg[0] = data[index]; index = 256 + tid; if (index < count) reg[1] = data[index]; index = 512 + tid; if (index < count) reg[2] = data[index]; index = 768 + tid; if (index < count) reg[3] = data[index]; index = 1024 + tid; if (index < count) reg[4] = data[index]; index = 1280 + tid; if (index < count) reg[5] = data[index]; index = 1536 + tid; if (index < count) reg[6] = data[index]; index = 1792 + tid; if (index < count) reg[7] = data[index]; index = 2048 + tid; if (index < count) reg[8] = data[index]; index = 2304 + tid; if (index < count) reg[9] = data[index]; index = 2560 + tid; if (index < count) reg[10] = data[index]; } void global_to_regstr_256_11_int ( int count, global const int * data, int tid, int * reg ) { if (count >= 2816) { reg[0] = data[0 + tid]; reg[1] = data[256 + tid]; reg[2] = data[512 + tid]; reg[3] = data[768 + tid]; reg[4] = data[1024 + tid]; reg[5] = data[1280 + tid]; reg[6] = data[1536 + tid]; reg[7] = data[1792 + tid]; reg[8] = data[2048 + tid]; reg[9] = data[2304 + tid]; reg[10] = data[2560 + tid]; } else global_to_regstr_pred_256_11_int(count, data, tid, reg); } void regstr_to_global_256_11_int ( int count, const int * reg, int tid, global int * dest ) { int index; index = 0 + tid; if (index < count) dest[index] = reg[0]; index = 256 + tid; if (index < count) dest[index] = reg[1]; index = 512 + tid; if (index < count) dest[index] = reg[2]; index = 768 + tid; if (index < count) dest[index] = reg[3]; index = 1024 + tid; if (index < count) dest[index] = reg[4]; index = 1280 + tid; if (index < count) dest[index] = reg[5]; index = 1536 + tid; if (index < count) dest[index] = reg[6]; index = 1792 + tid; if (index < count) dest[index] = reg[7]; index = 2048 + tid; if (index < count) dest[index] = reg[8]; index = 2304 + tid; if (index < count) dest[index] = reg[9]; index = 2560 + tid; if (index < count) dest[index] = reg[10]; barrier(CLK_LOCAL_MEM_FENCE); } void shared_to_regstr_256_11_int ( local const int * data, int tid, int * reg ) { reg[0] = data[0 + tid]; reg[1] = data[256 + tid]; reg[2] = data[512 + tid]; reg[3] = data[768 + tid]; reg[4] = data[1024 + tid]; reg[5] = data[1280 + tid]; reg[6] = data[1536 + tid]; reg[7] = data[1792 + tid]; reg[8] = data[2048 + tid]; reg[9] = data[2304 + tid]; reg[10] = data[2560 + tid]; barrier(CLK_LOCAL_MEM_FENCE); } void regstr_to_shared_256_11_int ( const int * reg, int tid, local int * dest ) { dest[0 + tid] = reg[0]; dest[256 + tid] = reg[1]; dest[512 + tid] = reg[2]; dest[768 + tid] = reg[3]; dest[1024 + tid] = reg[4]; dest[1280 + tid] = reg[5]; dest[1536 + tid] = reg[6]; dest[1792 + tid] = reg[7]; dest[2048 + tid] = reg[8]; dest[2304 + tid] = reg[9]; dest[2560 + tid] = reg[10]; barrier(CLK_LOCAL_MEM_FENCE); } void global_to_shared_256_11_int ( int count, global const int * source, int tid, local int * dest ) { int reg[11]; global_to_regstr_256_11_int(count, source, tid, reg); regstr_to_shared_256_11_int(reg, tid, dest); } void shared_to_global_256_11_int ( int count, local const int * source, int tid, global int * dest ) { int index; index = 0 + tid; if (index < count) dest[index] = source[index]; index = 256 + tid; if (index < count) dest[index] = source[index]; index = 512 + tid; if (index < count) dest[index] = source[index]; index = 768 + tid; if (index < count) dest[index] = source[index]; index = 1024 + tid; if (index < count) dest[index] = source[index]; index = 1280 + tid; if (index < count) dest[index] = source[index]; index = 1536 + tid; if (index < count) dest[index] = source[index]; index = 1792 + tid; if (index < count) dest[index] = source[index]; index = 2048 + tid; if (index < count) dest[index] = source[index]; index = 2304 + tid; if (index < count) dest[index] = source[index]; index = 2560 + tid; if (index < count) dest[index] = source[index]; barrier(CLK_LOCAL_MEM_FENCE); } void shared_to_thread_11_int ( local const int * data, int tid, int * reg ) { reg[0] = data[11 * tid + 0]; reg[1] = data[11 * tid + 1]; reg[2] = data[11 * tid + 2]; reg[3] = data[11 * tid + 3]; reg[4] = data[11 * tid + 4]; reg[5] = data[11 * tid + 5]; reg[6] = data[11 * tid + 6]; reg[7] = data[11 * tid + 7]; reg[8] = data[11 * tid + 8]; reg[9] = data[11 * tid + 9]; reg[10] = data[11 * tid + 10]; barrier(CLK_LOCAL_MEM_FENCE); } void thread_to_shared_11_int ( const int * reg, int tid, local int * dest ) { dest[11 * tid + 0] = reg[0]; dest[11 * tid + 1] = reg[1]; dest[11 * tid + 2] = reg[2]; dest[11 * tid + 3] = reg[3]; dest[11 * tid + 4] = reg[4]; dest[11 * tid + 5] = reg[5]; dest[11 * tid + 6] = reg[6]; dest[11 * tid + 7] = reg[7]; dest[11 * tid + 8] = reg[8]; dest[11 * tid + 9] = reg[9]; dest[11 * tid + 10] = reg[10]; barrier(CLK_LOCAL_MEM_FENCE); } void serial_merge_11_long_long ( int a_begin, int a_end, int b_begin, int b_end, int * indices, local const long * keys_shared0, local const long * keys_shared1, long * results0, long * results1 ) { long a_key0 = keys_shared0[a_begin]; long b_key0 = keys_shared0[b_begin]; long a_key1 = keys_shared1[a_begin]; long b_key1 = keys_shared1[b_begin]; bool p; p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); results0[0] = p ? a_key0 : b_key0; results1[0] = p ? a_key1 : b_key1; indices[0] = p ? a_begin : b_begin; if(p) { ++a_begin; a_key0 = keys_shared0[a_begin]; a_key1 = keys_shared1[a_begin]; } else { ++b_begin; b_key0 = keys_shared0[b_begin]; b_key1 = keys_shared1[b_begin]; } p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); results0[1] = p ? a_key0 : b_key0; results1[1] = p ? a_key1 : b_key1; indices[1] = p ? a_begin : b_begin; if(p) { ++a_begin; a_key0 = keys_shared0[a_begin]; a_key1 = keys_shared1[a_begin]; } else { ++b_begin; b_key0 = keys_shared0[b_begin]; b_key1 = keys_shared1[b_begin]; } p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); results0[2] = p ? a_key0 : b_key0; results1[2] = p ? a_key1 : b_key1; indices[2] = p ? a_begin : b_begin; if(p) { ++a_begin; a_key0 = keys_shared0[a_begin]; a_key1 = keys_shared1[a_begin]; } else { ++b_begin; b_key0 = keys_shared0[b_begin]; b_key1 = keys_shared1[b_begin]; } p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); results0[3] = p ? a_key0 : b_key0; results1[3] = p ? a_key1 : b_key1; indices[3] = p ? a_begin : b_begin; if(p) { ++a_begin; a_key0 = keys_shared0[a_begin]; a_key1 = keys_shared1[a_begin]; } else { ++b_begin; b_key0 = keys_shared0[b_begin]; b_key1 = keys_shared1[b_begin]; } p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); results0[4] = p ? a_key0 : b_key0; results1[4] = p ? a_key1 : b_key1; indices[4] = p ? a_begin : b_begin; if(p) { ++a_begin; a_key0 = keys_shared0[a_begin]; a_key1 = keys_shared1[a_begin]; } else { ++b_begin; b_key0 = keys_shared0[b_begin]; b_key1 = keys_shared1[b_begin]; } p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); results0[5] = p ? a_key0 : b_key0; results1[5] = p ? a_key1 : b_key1; indices[5] = p ? a_begin : b_begin; if(p) { ++a_begin; a_key0 = keys_shared0[a_begin]; a_key1 = keys_shared1[a_begin]; } else { ++b_begin; b_key0 = keys_shared0[b_begin]; b_key1 = keys_shared1[b_begin]; } p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); results0[6] = p ? a_key0 : b_key0; results1[6] = p ? a_key1 : b_key1; indices[6] = p ? a_begin : b_begin; if(p) { ++a_begin; a_key0 = keys_shared0[a_begin]; a_key1 = keys_shared1[a_begin]; } else { ++b_begin; b_key0 = keys_shared0[b_begin]; b_key1 = keys_shared1[b_begin]; } p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); results0[7] = p ? a_key0 : b_key0; results1[7] = p ? a_key1 : b_key1; indices[7] = p ? a_begin : b_begin; if(p) { ++a_begin; a_key0 = keys_shared0[a_begin]; a_key1 = keys_shared1[a_begin]; } else { ++b_begin; b_key0 = keys_shared0[b_begin]; b_key1 = keys_shared1[b_begin]; } p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); results0[8] = p ? a_key0 : b_key0; results1[8] = p ? a_key1 : b_key1; indices[8] = p ? a_begin : b_begin; if(p) { ++a_begin; a_key0 = keys_shared0[a_begin]; a_key1 = keys_shared1[a_begin]; } else { ++b_begin; b_key0 = keys_shared0[b_begin]; b_key1 = keys_shared1[b_begin]; } p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); results0[9] = p ? a_key0 : b_key0; results1[9] = p ? a_key1 : b_key1; indices[9] = p ? a_begin : b_begin; if(p) { ++a_begin; a_key0 = keys_shared0[a_begin]; a_key1 = keys_shared1[a_begin]; } else { ++b_begin; b_key0 = keys_shared0[b_begin]; b_key1 = keys_shared1[b_begin]; } p = (b_begin >= b_end) || ((a_begin < a_end) && !comp(b_key0, b_key1, a_key0, a_key1)); results0[10] = p ? a_key0 : b_key0; results1[10] = p ? a_key1 : b_key1; indices[10] = p ? a_begin : b_begin; if(p) { ++a_begin; a_key0 = keys_shared0[a_begin]; a_key1 = keys_shared1[a_begin]; } else { ++b_begin; b_key0 = keys_shared0[b_begin]; b_key1 = keys_shared1[b_begin]; } barrier(CLK_LOCAL_MEM_FENCE); } int merge_path_long_long ( int a_count, int b_count, int diag, local const long * a0, local const long * a1, local const long * b0, local const long * b1 ) { int begin = max(0, diag - b_count); int end = min(diag, a_count); while (begin < end) { int mid = (begin + end) >> 1; if ( !comp(b0[diag - 1 - mid], b1[diag - 1 - mid], a0[mid], a1[mid]) ) begin = mid + 1; else end = mid; } return begin; } void load2_to_regstr_256_11_11_long ( global const long * a_global, int a_count, global const long * b_global, int b_count, int tid, long * reg ) { b_global -= a_count; int total = a_count + b_count; int index; if (total >= 2816) { index = 0 + tid; if (index < a_count) reg[0] = a_global[index]; else reg[0] = b_global[index]; index = 256 + tid; if (index < a_count) reg[1] = a_global[index]; else reg[1] = b_global[index]; index = 512 + tid; if (index < a_count) reg[2] = a_global[index]; else reg[2] = b_global[index]; index = 768 + tid; if (index < a_count) reg[3] = a_global[index]; else reg[3] = b_global[index]; index = 1024 + tid; if (index < a_count) reg[4] = a_global[index]; else reg[4] = b_global[index]; index = 1280 + tid; if (index < a_count) reg[5] = a_global[index]; else reg[5] = b_global[index]; index = 1536 + tid; if (index < a_count) reg[6] = a_global[index]; else reg[6] = b_global[index]; index = 1792 + tid; if (index < a_count) reg[7] = a_global[index]; else reg[7] = b_global[index]; index = 2048 + tid; if (index < a_count) reg[8] = a_global[index]; else reg[8] = b_global[index]; index = 2304 + tid; if (index < a_count) reg[9] = a_global[index]; else reg[9] = b_global[index]; index = 2560 + tid; if (index < a_count) reg[10] = a_global[index]; else reg[10] = b_global[index]; } else { index = 0 + tid; if (index < a_count) reg[0] = a_global[index]; else if (index < total) reg[0] = b_global[index]; index = 256 + tid; if (index < a_count) reg[1] = a_global[index]; else if (index < total) reg[1] = b_global[index]; index = 512 + tid; if (index < a_count) reg[2] = a_global[index]; else if (index < total) reg[2] = b_global[index]; index = 768 + tid; if (index < a_count) reg[3] = a_global[index]; else if (index < total) reg[3] = b_global[index]; index = 1024 + tid; if (index < a_count) reg[4] = a_global[index]; else if (index < total) reg[4] = b_global[index]; index = 1280 + tid; if (index < a_count) reg[5] = a_global[index]; else if (index < total) reg[5] = b_global[index]; index = 1536 + tid; if (index < a_count) reg[6] = a_global[index]; else if (index < total) reg[6] = b_global[index]; index = 1792 + tid; if (index < a_count) reg[7] = a_global[index]; else if (index < total) reg[7] = b_global[index]; index = 2048 + tid; if (index < a_count) reg[8] = a_global[index]; else if (index < total) reg[8] = b_global[index]; index = 2304 + tid; if (index < a_count) reg[9] = a_global[index]; else if (index < total) reg[9] = b_global[index]; index = 2560 + tid; if (index < a_count) reg[10] = a_global[index]; else if (index < total) reg[10] = b_global[index]; } } void load2_to_shared_256_11_11_long ( global const long * a_global, int a_count, global const long * b_global, int b_count, int tid, local long * shared ) { long reg[11]; load2_to_regstr_256_11_11_long(a_global, a_count, b_global, b_count, tid, reg); regstr_to_shared_256_11_long(reg, tid, shared); } void merge_keys_indices_256_11_long_long ( int a_count, int b_count, int4 range, int tid, int * indices, global const long * a_global0, global const long * a_global1, global const long * b_global0, global const long * b_global1, local long * keys_shared0, local long * keys_shared1, long * results0, long * results1 ) { int a0 = range.x; int a1 = range.y; int b0 = range.z; int b1 = range.w; a_count = a1 - a0; b_count = b1 - b0; load2_to_shared_256_11_11_long(a_global0 + a0, a_count, b_global0 + b0, b_count, tid, keys_shared0); load2_to_shared_256_11_11_long(a_global1 + a0, a_count, b_global1 + b0, b_count, tid, keys_shared1); int diag = 11 * tid; int mp = merge_path_long_long(a_count, b_count, diag, keys_shared0, keys_shared1, keys_shared0 + a_count, keys_shared1 + a_count); int a0tid = mp; int a1tid = a_count; int b0tid = a_count + diag - mp; int b1tid = a_count + b_count; serial_merge_11_long_long(a0tid, a1tid, b0tid, b1tid, indices, keys_shared0, keys_shared1, results0, results1); } void transfer_merge_values_regstr_256_11_long2 ( int count, int b_start, const int * indices, int tid, global const long2 * a_global0, global const long2 * b_global0, long2 * reg0 ) { b_global0 -= b_start; if(count >= 2816) { if (indices[0] < b_start) { reg0[0] = a_global0[indices[0]]; } else { reg0[0] = b_global0[indices[0]]; } if (indices[1] < b_start) { reg0[1] = a_global0[indices[1]]; } else { reg0[1] = b_global0[indices[1]]; } if (indices[2] < b_start) { reg0[2] = a_global0[indices[2]]; } else { reg0[2] = b_global0[indices[2]]; } if (indices[3] < b_start) { reg0[3] = a_global0[indices[3]]; } else { reg0[3] = b_global0[indices[3]]; } if (indices[4] < b_start) { reg0[4] = a_global0[indices[4]]; } else { reg0[4] = b_global0[indices[4]]; } if (indices[5] < b_start) { reg0[5] = a_global0[indices[5]]; } else { reg0[5] = b_global0[indices[5]]; } if (indices[6] < b_start) { reg0[6] = a_global0[indices[6]]; } else { reg0[6] = b_global0[indices[6]]; } if (indices[7] < b_start) { reg0[7] = a_global0[indices[7]]; } else { reg0[7] = b_global0[indices[7]]; } if (indices[8] < b_start) { reg0[8] = a_global0[indices[8]]; } else { reg0[8] = b_global0[indices[8]]; } if (indices[9] < b_start) { reg0[9] = a_global0[indices[9]]; } else { reg0[9] = b_global0[indices[9]]; } if (indices[10] < b_start) { reg0[10] = a_global0[indices[10]]; } else { reg0[10] = b_global0[indices[10]]; } } else { int index; index = 0 + tid; if(index < count) { if (indices[0] < b_start) { reg0[0] = a_global0[indices[0]]; } else { reg0[0] = b_global0[indices[0]]; } } index = 256 + tid; if(index < count) { if (indices[1] < b_start) { reg0[1] = a_global0[indices[1]]; } else { reg0[1] = b_global0[indices[1]]; } } index = 512 + tid; if(index < count) { if (indices[2] < b_start) { reg0[2] = a_global0[indices[2]]; } else { reg0[2] = b_global0[indices[2]]; } } index = 768 + tid; if(index < count) { if (indices[3] < b_start) { reg0[3] = a_global0[indices[3]]; } else { reg0[3] = b_global0[indices[3]]; } } index = 1024 + tid; if(index < count) { if (indices[4] < b_start) { reg0[4] = a_global0[indices[4]]; } else { reg0[4] = b_global0[indices[4]]; } } index = 1280 + tid; if(index < count) { if (indices[5] < b_start) { reg0[5] = a_global0[indices[5]]; } else { reg0[5] = b_global0[indices[5]]; } } index = 1536 + tid; if(index < count) { if (indices[6] < b_start) { reg0[6] = a_global0[indices[6]]; } else { reg0[6] = b_global0[indices[6]]; } } index = 1792 + tid; if(index < count) { if (indices[7] < b_start) { reg0[7] = a_global0[indices[7]]; } else { reg0[7] = b_global0[indices[7]]; } } index = 2048 + tid; if(index < count) { if (indices[8] < b_start) { reg0[8] = a_global0[indices[8]]; } else { reg0[8] = b_global0[indices[8]]; } } index = 2304 + tid; if(index < count) { if (indices[9] < b_start) { reg0[9] = a_global0[indices[9]]; } else { reg0[9] = b_global0[indices[9]]; } } index = 2560 + tid; if(index < count) { if (indices[10] < b_start) { reg0[10] = a_global0[indices[10]]; } else { reg0[10] = b_global0[indices[10]]; } } } barrier(CLK_LOCAL_MEM_FENCE); } void transfer_merge_values_shared_256_11_long2 ( int count, int b_start, local const int * indices_shared, int tid, global const long2 * a_global0, global const long2 * b_global0, global long2 * dest_global0 ) { int indices[11]; shared_to_regstr_256_11_int(indices_shared, tid, indices); long2 reg0[11]; transfer_merge_values_regstr_256_11_long2(count, b_start, indices, tid, a_global0, b_global0, reg0); regstr_to_global_256_11_long2(count, reg0, tid, dest_global0); } void device_merge_256_11_long_long_long2 ( int a_count, int b_count, global const long * a_keys_global0, global const long * a_keys_global1, global const long * b_keys_global0, global const long * b_keys_global1, global long * keys_global0, global long * keys_global1, local long * keys_shared0, local long * keys_shared1, global const long2 * a_vals_global0, global const long2 * b_vals_global0, global long2 * vals_global0, int tid, int block, int4 range, local int * indices_shared ) { long results0[11]; long results1[11]; int indices[11]; merge_keys_indices_256_11_long_long(a_count, b_count, range, tid, indices, a_keys_global0, a_keys_global1, b_keys_global0, b_keys_global1, keys_shared0, keys_shared1, results0, results1); thread_to_shared_11_long(results0, tid, keys_shared0); thread_to_shared_11_long(results1, tid, keys_shared1); a_count = range.y - range.x; b_count = range.w - range.z; shared_to_global_256_11_long(a_count + b_count, keys_shared0, tid, keys_global0 + 2816 * block); shared_to_global_256_11_long(a_count + b_count, keys_shared1, tid, keys_global1 + 2816 * block); thread_to_shared_11_int(indices, tid, indices_shared); transfer_merge_values_shared_256_11_long2(a_count + b_count, a_count, indices_shared, tid, a_vals_global0 + range.x, b_vals_global0 + range.z, vals_global0 + 2816 * block); } kernel void merge ( int a_count, int b_count, global const long * a_keys_global0, global const long * a_keys_global1, global const long * b_keys_global0, global const long * b_keys_global1, global long * keys_global0, global long * keys_global1, global const long2 * a_vals_global0, global const long2 * b_vals_global0, global long2 * vals_global0, global const int * mp_global, int coop ) { union Shared { struct { long keys0[3072]; long keys1[3072]; }; int indices[2816]; }; local union Shared shared; int tid = get_local_id(0); int block = get_group_id(0); int4 range = compute_merge_range(a_count, b_count, block, coop, 2816, mp_global); device_merge_256_11_long_long_long2(a_count, b_count, a_keys_global0, a_keys_global1, b_keys_global0, b_keys_global1, keys_global0, keys_global1, shared.keys0, shared.keys1, a_vals_global0, b_vals_global0, vals_global0, tid, block, range, shared.indices); } ptxas error : Entry function 'merge' uses too much shared data (0xc008 bytes, 0xc000 max) ```

I'm solved first problem. But I get new error. Also, I had BSOD in Nvidia Pascal (GTX 1070).

Function: sorting

struct less_t {
            typedef bool result_type;
            less_t() {}

            VEX_DUAL_FUNCTOR(result_type, (cl_int, a1)(cl_long, a2)(cl_int, b1)(cl_long, b2),
                return (a1 == b1) ? (a2 < b2) : (a1 < b1);
            )
        } comp;

Also, I used boost compute.

                glFinish();

                boost::compute::opengl_buffer clkeys(c, triangleKeys.id());
                boost::compute::opengl_buffer clids(c, triangleIds.id());
                boost::compute::opengl_buffer clpair(c, trianglePair.id());

                boost::compute::opengl_enqueue_acquire_gl_objects(1, &clkeys.get(), q);
                boost::compute::opengl_enqueue_acquire_gl_objects(1, &clids.get(), q);
                boost::compute::opengl_enqueue_acquire_gl_objects(1, &clpair.get(), q);

                cl::CommandQueue qu(q.get());
                vex::vector<cl_long> clkeysv(qu, cl::BufferGL(clkeys.get()), size_t(nodeCount));
                vex::vector<cl_long> clidsv(qu, cl::BufferGL(clids.get()), size_t(nodeCount));
                vex::vector<cl_long2> clpairv(qu, cl::BufferGL(clpair.get()), size_t(nodeCount));
                vex::sort_by_key(boost::fusion::vector_tie(clidsv, clkeysv), clpairv, comp);

                boost::compute::opengl_enqueue_release_gl_objects(1, &clkeys.get(), q);
                boost::compute::opengl_enqueue_release_gl_objects(1, &clids.get(), q);
                boost::compute::opengl_enqueue_release_gl_objects(1, &clpair.get(), q);

                q.finish();
ddemidov commented 8 years ago

ptxas error : Entry function 'merge' uses too much shared data (0xc008 bytes, 0xc000 max)

You could try to reduce the block size (and the required shared memory size) by changing 256 to 128 here:

https://github.com/ddemidov/vexcl/blob/6578ce2c4d77f01971a6e896f42eb0164de637ed/vexcl/sort.hpp#L1791

Would that work for you? Also, since you are using boost.compute anyway, you could try to switch to their sort algorithm.

ghost commented 8 years ago

I simplified code, but get errors. Error code: CL_MAP_FAILURE

                glFinish();

                cl_mem cl_buf = clCreateFromGLBuffer((*c)(), CL_MEM_READ_WRITE, trianglePair.id(), nullptr);
                clEnqueueAcquireGLObjects((*q)(), 1, &cl_buf, 0, 0, 0);
                vex::vector<cl_int2> p(*q, cl::Buffer(cl_buf), nodeCount);
                vex::sort(p, sortfunc<cl_int2>());
                clEnqueueReleaseGLObjects((*q)(), 1, &cl_buf, 0, 0, 0);
                try {
                    q->finish();
                }
                catch (cl::Error) {

                }

Also

            std::vector<cl::Device> device = vex::backend::device_list(
                vex::Filter::Env       &&
                vex::Filter::GLSharing &&
                vex::Filter::Count(1)
            );
            cl_context_properties ctx_prop[] = {
                CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
                CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(),
                CL_CONTEXT_PLATFORM, (cl_context_properties)device[0].getInfo<CL_DEVICE_PLATFORM>(),
                0
            };
            c = std::make_shared<cl::Context>(device, ctx_prop);
            q = std::make_shared<cl::CommandQueue>(*c, device[0]);
ddemidov commented 8 years ago

I am sorry, I can not reproduce this, so I won't be able to help you unless you give me a minimal compilable test-case demonstrating the problem.

ghost commented 8 years ago

I think rewrite OpenGL API for C++ Now my path tracer disabled any sorting, and preparing to rewrite with next-gen API. I want to resolve any problems with OpenCL, and other API.