Closed MikkoArtik closed 3 years ago
Hi! I can't understand where I am making a mistake. The goal is to fill an array so that each element holds its own unique work-item ID.
I found a magic number: q = 16777216 = 1024 * 1024 * 16 (= 2^24). If the array length does not exceed q, all is good: the check_array function (see below) returns True with these parameters:
arr_size = 1024 * 1024 * 16 grid_size = (1024, 1024, 16) block_size = (32, 8, 4) check_array(...) -> True
BUT...
arr_size = 1024 * 1024 * 20 grid_size = (1024, 1024, 20) block_size = (32, 8, 4) check_array(...) -> False
I took the slice arr[16777216:16777226]
arr[16777216:16777226]
In theory, the result should be arr[16777216:16777226] = [16777216, 16777217, 16777218, 16777219, 16777220, 16777221, 16777222, 16777223, 16777224, 16777225]
arr[16777216:16777226] = [16777216, 16777217, 16777218, 16777219, 16777220, 16777221, 16777222, 16777223, 16777224, 16777225]
But in fact, arr[16777216:16777226] = [16777216, **16777216**, 16777218, **16777220**, 16777220, **16777220**, 16777222, **16777224**, 16777224, **16777224**]
arr[16777216:16777226] = [16777216, **16777216**, 16777218, **16777220**, 16777220, **16777220**, 16777222, **16777224**, 16777224, **16777224**]
Please help me.
The GPU is an Nvidia GeForce GTX 1050 Ti.
OpenCL kernel source:
/* Flatten a 3-D work-group index into a single linear work-group number
 * (dimension 0 varies fastest). */
int get_general_block_id(int3 block_ids) {
    const int groups_x = (int)get_num_groups(0);
    const int groups_y = (int)get_num_groups(1);
    return block_ids.x + block_ids.y * groups_x + block_ids.z * groups_x * groups_y;
}

/* Flatten a 3-D local work-item index into a linear index inside its
 * work-group (dimension 0 varies fastest). */
int get_local_thread_id(int3 thread_ids) {
    const int size_x = (int)get_local_size(0);
    const int size_y = (int)get_local_size(1);
    return thread_ids.x + thread_ids.y * size_x + thread_ids.z * size_x * size_y;
}

/* Combine a linear work-group number and a linear local index into one
 * linear index over all work-items: group number times work-group volume,
 * plus the local index. */
int get_global_thread_id(int gen_block_id, int gen_thread_id) {
    const int size_x = (int)get_local_size(0);
    const int size_y = (int)get_local_size(1);
    const int size_z = (int)get_local_size(2);
    return gen_block_id * size_x * size_y * size_z + gen_thread_id;
}

/* Write each work-item's linear index into array[index] for every index
 * below `length`, and print a trace line for indices 16777210..16777245.
 *
 * The index is an int but the destination element is a float, so the store
 * converts it. A float has a 24-bit significand, which means integers above
 * 2^24 = 16777216 are rounded to the nearest representable value; the traced
 * range straddles that boundary. */
kernel void index_function(global float *array, int length) {
    const int3 group_idx = (int3)((int)get_group_id(0),
                                  (int)get_group_id(1),
                                  (int)get_group_id(2));
    const int3 local_idx = (int3)((int)get_local_id(0),
                                  (int)get_local_id(1),
                                  (int)get_local_id(2));

    const int gb_id = get_general_block_id(group_idx);
    const int lt_id = get_local_thread_id(local_idx);
    const int global_tid = get_global_thread_id(gb_id, lt_id);

    /* Work-items past the end of the buffer do nothing. */
    if (global_tid < length) {
        array[global_tid] = global_tid;

        if (global_tid >= 16777210 && global_tid <= 16777245) {
            printf("block_id=%i thread_id=%i gid=%i val=%f\n",
                   gb_id, lt_id, global_tid, array[global_tid]);
        }
    }
}
Python code:
import numpy as np
import pyopencl as cl


def load_core(filename='core.c') -> str:
    """Return the full text of the OpenCL source file *filename*."""
    with open(filename, 'r') as f:
        return f.read()


def create_context_and_queue():
    """Create an OpenCL context and command queue on the first GPU found.

    GPU devices are collected from every available platform and only the
    first one is used.

    Returns:
        A ``(context, queue)`` tuple.

    Raises:
        RuntimeError: if no platform reports a GPU device.
    """
    # Run only one device!
    gpu_devices = []
    for platform in cl.get_platforms():
        gpu_devices += platform.get_devices(device_type=cl.device_type.GPU)

    if not gpu_devices:
        raise RuntimeError('No OpenCL GPU device found')

    context = cl.Context(devices=[gpu_devices[0]])
    queue = cl.CommandQueue(context)
    return context, queue


def compile_function(context, filename='core.c'):
    """Build the OpenCL program in *filename* and return its ``index_function`` kernel."""
    program = cl.Program(context, load_core(filename)).build()
    return program.index_function


def check_array(arr: np.ndarray, *, chunk_size: int = 1 << 22) -> bool:
    """Return True if every element of the 1-D array *arr* equals its index.

    An element counts as a mismatch when it differs from its index by more
    than 1e-5. An empty array yields True.

    The comparison is done in float64 on purpose. Subtracting a float32
    element from a Python int can be evaluated in float32 (NumPy 2
    promotion rules), which rounds the index itself and hides exactly the
    rounding errors this check is meant to detect.

    Args:
        arr: one-dimensional array to verify.
        chunk_size: number of elements compared per step; bounds the size
            of the temporary float64 arrays.
    """
    total = len(arr)
    for start in range(0, total, chunk_size):
        chunk = arr[start:start + chunk_size]
        expected = np.arange(start, start + len(chunk), dtype=np.float64)
        if np.any(np.abs(chunk.astype(np.float64) - expected) > 1e-5):
            return False
    return True


arr_size = 1000 * 1000 * 1000
# NOTE(review): PyOpenCL takes the first tuple passed to a kernel call as the
# global work size (total work-items per dimension), not the number of
# work-groups. With these values 1000 * 1000 * 60 work-items run, which is
# fewer than arr_size - confirm this is intended.
grid_size = (1000, 1000, 60)
block_size = (10, 10, 10)


def main() -> None:
    """Fill a device buffer with work-item indices and verify it on the host."""
    context, queue = create_context_and_queue()
    cl_function = compile_function(context)

    # Elements the kernel never writes keep the sentinel value -1.
    # float32 matches the kernel's `global float *array` argument. Its 24-bit
    # significand cannot hold every integer above 2**24 = 16777216, so the
    # check is expected to fail for larger arrays for as long as the kernel
    # and this buffer use a float element type.
    arr = np.full(arr_size, -1, dtype=np.float32)

    mf = cl.mem_flags
    a_g = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=arr)

    # The kernel declares `int length`, so pass a signed 32-bit scalar.
    cl_function(queue, grid_size, block_size, a_g, np.int32(arr_size))

    cl.enqueue_copy(queue, arr, a_g)
    print(check_array(arr))


if __name__ == '__main__':
    main()
Command-line output:
block_id=16384 thread_id=22 gid=16777238 val=16777238.000000 block_id=16384 thread_id=23 gid=16777239 val=16777240.000000 block_id=16384 thread_id=24 gid=16777240 val=16777240.000000 block_id=16384 thread_id=25 gid=16777241 val=16777240.000000 block_id=16384 thread_id=26 gid=16777242 val=16777242.000000 block_id=16384 thread_id=27 gid=16777243 val=16777244.000000 block_id=16384 thread_id=28 gid=16777244 val=16777244.000000 block_id=16384 thread_id=29 gid=16777245 val=16777244.000000
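Why is the array element not equal to the value assigned to it?
Update: I changed the array's element type to int, and that fixed it. (A 32-bit float has a 24-bit significand, so it cannot represent every integer above 2^24 = 16777216 exactly.)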
Hi! I can't understand where I am making a mistake. The goal is to fill an array so that each element holds its own unique work-item ID.
I found a magic number: q = 16777216 = 1024 * 1024 * 16 (= 2^24). If the array length does not exceed q, all is good: the check_array function (see below) returns True with these parameters:
BUT...
I took the slice
arr[16777216:16777226]
In theory, the result should be
arr[16777216:16777226] = [16777216, 16777217, 16777218, 16777219, 16777220, 16777221, 16777222, 16777223, 16777224, 16777225]
But in fact
arr[16777216:16777226] = [16777216, **16777216**, 16777218, **16777220**, 16777220, **16777220**, 16777222, **16777224**, 16777224, **16777224**]
Please help me.
The GPU is an Nvidia GeForce GTX 1050 Ti.
OpenCL kernel source:
Python code:
Command-line output:
Why is the array element not equal to the value assigned to it?