Open artemgl opened 1 year ago
Can it be caused by unexpected behaviour of overflow? If array y
initialized by 0uy
all works fine. But for the original code execution goes to else (Some res)
branch (you can check it using printf
function in this branch).
This prints "AA" for me and the code does work properly, but doesn't without printing
let op =
<@
fun x y ->
let mutable res = x * y
if res = 0uy then
printf "AA"
None
else
printf "BB"
(Some res)
@>
Well...
The following simplified version of kernel behaves wrong (array.[1]
is 33) on my Intel HD graphics, but works correct (array.[1]
= 32) on my NVidia GPGPU.
let run =
<@
fun (ndRange: Range1D) (x: ClArray<byte>) (y: ClArray<byte>) (array: ClArray<byte>) ->
let i = ndRange.GlobalID0
let a = x.[i]
let b = y.[i]
let res = a * b
let increase =
if res = 0uy
then None
else Some res
match increase with
| Some _ -> if i = 1 then array.[i] <- 1uy
| _ -> ()
match increase with
Some _ -> if i = 1 then array.[i] <- array.[i] + 4uy
| None -> if i = 1 then array.[i] <- array.[i] + 32uy
@>
Moreover, the following version demonstrates the same behevior.
let run =
<@
fun (ndRange: Range1D) (x: ClArray<byte>) (y: ClArray<byte>) (array: ClArray<byte>) ->
let i = ndRange.GlobalID0
let res = x.[i] * y.[i]
let mutable increase = None
if res = 0uy
then increase <- None
else increase <- Some res
match increase with
| Some _ -> if i = 1 then array.[i] <- 1uy
| _ -> ()
match increase with
Some _ -> array.[i] <- array.[i] + 4uy
| None -> array.[i] <- array.[i] + 32uy
@>
So, local array is not to blame in incorrect behavior. @artemgl What GPU do you use for tests?
And more simplified kernel:
let run =
<@
fun (ndRange: Range1D) (x: ClArray<byte>) (y: ClArray<byte>) (array: ClArray<byte>) ->
let i = ndRange.GlobalID0
let res = x.[i] * y.[i]
let mutable increase = 0
if res = 0uy
then increase <- 0
else increase <- 1
if increase = 1
then if i = 1 then array.[i] <- 1uy
if increase = 0
then array.[i] <- array.[i] + 32uy
else array.[i] <- array.[i] + 4uy
@>
For Intel array.[1]
is 33, for Nvidia --- 32. Manual evaluation of the similar kernel directly with OpenCL C shows the same result.
Finally, I think that it is a sort of undefined behavior on unsigned char overflow. It should not be an undefined behavior formally, so I guess that actually it is a driver (compiler) bug.
OpenCL kernel:
__kernel void brahmaKernel (__global uchar * x, __global uchar * y, __global uchar * array)
{
int i = get_global_id (0) ;
uchar res = (x [i] * y [i]) ;
int increase = 0;
if (res == 0)
{
increase = 0;
}
else
{
increase = 1;
}
if (increase == 1)
{
if (i == 1)
{
array [i] = 1 ;
}
}
if (increase == 0)
{
array [i] = array [i] + 32 ;
}
else
{
array [i] = array [i] + 4 ;
}
}
Host program:
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
int main(void) {
// Create the two input vectors
int i;
const int LIST_SIZE = 32;
unsigned char *A = (unsigned char*)malloc(sizeof(unsigned char)*LIST_SIZE);
unsigned char *B = (unsigned char*)malloc(sizeof(unsigned char)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = 224;//0;//112;
B[i] = 240;
}
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(unsigned char), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(unsigned char), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_READ_WRITE,
LIST_SIZE * sizeof(unsigned char), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(unsigned char), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(unsigned char), B, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
size_t log_size;
char *program_log;
if(ret != CL_SUCCESS) {
// If there's an error whilst building the program, dump the log
clGetProgramBuildInfo(program, &device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
program_log = (char*) malloc(log_size+1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, &device_id, CL_PROGRAM_BUILD_LOG,
log_size+1, program_log, NULL);
printf("\n=== ERROR ===\n\n%s\n=============\n", program_log);
free(program_log);
exit(1);
}
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "brahmaKernel", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 32; // Divide work items into groups of 32
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
unsigned char *C = (unsigned char*)malloc(sizeof(unsigned char)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(unsigned char), C, 0, NULL, NULL);
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("result[%i] = %i\n", i, C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
Platforms (clinfo)
Platform Name Intel(R) OpenCL HD Graphics
Number of devices 1
Device Name Intel(R) UHD Graphics 620 [0x5917]
Device Vendor Intel(R) Corporation
Device Vendor ID 0x8086
Device Version OpenCL 3.0 NEO
Driver Version 22.28.23726.1
Device OpenCL C Version OpenCL C 1.2
And
Platform Name NVIDIA CUDA
Number of devices 1
Device Name NVIDIA GeForce MX150
Device Vendor NVIDIA Corporation
Device Vendor ID 0x10de
Device Version OpenCL 3.0 CUDA
Driver Version 470.141.03
Device OpenCL C Version OpenCL C 1.2
Can't reproduce with AMD and NVIDIA, array.[1] = 32uy
in both cases. I had a similar problem where the code seemed to be running on two contradicting branches. Adding if i < 32
helped because threads with larger id's were working and wrote to the same cells. Since workGroupSize is 32 I don't think this will work, but it can be worth trying. Atomic writings to array and printf may also be usefull to diagnose the problem.
Describe the bug Unexpected behavior when using a local array and
match
expression.To Reproduce
The value
array.[1]
is always33
after starting this code. This may mean that the code is executed in two contradictorymatch
branches.The problem disappears if the line
la.[lid] <- false
is deleted.The problem disappears as well if the line
| _ -> ()
is replaced by| _ -> if i = 1 then array.[i] <- 0uy
in the firstmatch
expression.Expected behavior The value
array.[1]
must be32
.