Closed pvelesko closed 3 weeks ago
@cdai2
We will check this issue.
The test fails when running with USM enabled:
// Run on CPU
run_kernel(platform_cpu, CL_DEVICE_TYPE_CPU, "CPU", true);
In run_kernel
, clEnqueueNDRangeKernel
returns error code -59
:
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
The root cause is that USM allocations may be accessed by kernels indirectly but the kernel is not specified to access any shared USM allocation indirectly. Please refer to https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_unified_shared_memory.html#_using_unified_shared_memory_with_kernels
Solution:
After calling clSetKernelExecInfo
with CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL
, call clSetKernelExecInfo
with CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL
:
cl_bool param = true;
err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, sizeof(cl_bool), ¶m);
CHECK_ERROR(err);
BTW, the error code -59
is CL_INVALID_OPERATION
, not CL_INVALID_WORK_GROUP_SIZE
.
@qichaogu
BTW, the error code -59 is CL_INVALID_OPERATION, not CL_INVALID_WORK_GROUP_SIZE.
Fixed.
Added the extra call as you instructed: no change.
#include <CL/cl.h>
#include <CL/cl_ext_intel.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define CHECK_ERROR(err) \
if (err != CL_SUCCESS) { \
printf("Error: %d at line %d\n", err, __LINE__); \
if (err == -59) { \
printf("Error description: CL_INVALID_OPERATION\n"); \
} \
}
const char* kernelSource = "\n" \
"__kernel void testKernel(__global int* data) {\n" \
" int gid = get_global_id(0);\n" \
" data[gid] = gid;\n" \
"}\n";
// Function pointer types for Intel USM extensions
typedef void* (CL_API_CALL *clSharedMemAllocINTEL_fn)(cl_context, cl_device_id, const cl_mem_properties_intel*, size_t, cl_uint, cl_int*);
typedef void* (CL_API_CALL *clDeviceMemAllocINTEL_fn)(cl_context, cl_device_id, const cl_mem_properties_intel*, size_t, cl_uint, cl_int*);
typedef void* (CL_API_CALL *clHostMemAllocINTEL_fn)(cl_context, const cl_mem_properties_intel*, size_t, cl_uint, cl_int*);
typedef cl_int (CL_API_CALL *clMemFreeINTEL_fn)(cl_context, void*);
typedef cl_int (CL_API_CALL *clSetKernelArgMemPointerINTEL_fn)(cl_kernel, cl_uint, const void*);
// Structure to hold USM function pointers
struct USMFunctions {
clSharedMemAllocINTEL_fn clSharedMemAllocINTEL;
clDeviceMemAllocINTEL_fn clDeviceMemAllocINTEL;
clHostMemAllocINTEL_fn clHostMemAllocINTEL;
clMemFreeINTEL_fn clMemFreeINTEL;
clSetKernelArgMemPointerINTEL_fn clSetKernelArgMemPointerINTEL;
};
// Function to check if the platform supports USM Intel extensions
bool check_usm_support(cl_platform_id platform) {
size_t ext_size;
cl_int err = clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, 0, NULL, &ext_size);
CHECK_ERROR(err);
char* extensions = (char*)malloc(ext_size);
err = clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, ext_size, extensions, NULL);
CHECK_ERROR(err);
bool usm_supported = strstr(extensions, "cl_intel_unified_shared_memory") != NULL;
free(extensions);
if (!usm_supported) {
printf("Error: USM is not supported on this platform.\n");
exit(1);
}
return usm_supported;
}
void run_kernel(cl_platform_id platform, cl_device_type device_type, const char* device_name, bool use_usm) {
cl_int err;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
struct USMFunctions USM;
void* data;
printf("Running on %s using %s\n", device_name, use_usm ? "USM" : "SVM");
// Check if the platform supports USM Intel extensions when USM is requested
if (use_usm && !check_usm_support(platform)) {
printf("USM Intel extensions are not supported on this platform. Falling back to SVM...\n");
use_usm = false;
}
err = clGetDeviceIDs(platform, device_type, 1, &device, NULL);
CHECK_ERROR(err);
// Create context and command queue
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
CHECK_ERROR(err);
queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
CHECK_ERROR(err);
if (use_usm) {
// Get USM function pointers
USM.clSharedMemAllocINTEL = (clSharedMemAllocINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clSharedMemAllocINTEL");
USM.clDeviceMemAllocINTEL = (clDeviceMemAllocINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clDeviceMemAllocINTEL");
USM.clHostMemAllocINTEL = (clHostMemAllocINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clHostMemAllocINTEL");
USM.clMemFreeINTEL = (clMemFreeINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clMemFreeINTEL");
}
// Create and build program
program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
CHECK_ERROR(err);
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
CHECK_ERROR(err);
// Create kernel
kernel = clCreateKernel(program, "testKernel", &err);
CHECK_ERROR(err);
// Allocate memory
size_t dataSize = 512 * sizeof(int);
if (use_usm) {
data = USM.clSharedMemAllocINTEL(context, device, NULL, dataSize, 0, &err);
CHECK_ERROR(err);
} else {
data = clSVMAlloc(context, CL_MEM_READ_WRITE, dataSize, 0);
if (data == NULL) {
printf("SVM allocation failed\n");
exit(1);
}
}
// Set kernel argument
if (use_usm) {
USM.clSetKernelArgMemPointerINTEL = (clSetKernelArgMemPointerINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clSetKernelArgMemPointerINTEL");
err = USM.clSetKernelArgMemPointerINTEL(kernel, 0, data);
CHECK_ERROR(err);
// Set kernel execution info for USM pointers
const void* usm_ptrs[] = {data};
err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(void*), usm_ptrs);
CHECK_ERROR(err);
err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, sizeof(void*), usm_ptrs);
CHECK_ERROR(err);
} else {
err = clSetKernelArgSVMPointer(kernel, 0, data);
}
CHECK_ERROR(err);
// Execute kernel
size_t globalSize = 512; // Match the dataSize
size_t localSize;
err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &localSize, NULL);
CHECK_ERROR(err);
// Adjust globalSize to be a multiple of localSize
globalSize = (globalSize + localSize - 1) / localSize * localSize;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
CHECK_ERROR(err);
// Wait for completion
err = clFinish(queue);
CHECK_ERROR(err);
// Free memory
if (use_usm) {
err = USM.clMemFreeINTEL(context, data);
CHECK_ERROR(err);
} else {
clSVMFree(context, data);
}
// Clean up
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseDevice(device);
}
int main(int argc, char* argv[]) {
cl_int err;
cl_platform_id platform_gpu, platform_cpu;
cl_uint num_platforms;
bool use_usm = false;
// Parse command line argument
if (argc > 1 && strcmp(argv[1], "usm") == 0) {
use_usm = true;
}
// Get platforms
err = clGetPlatformIDs(0, NULL, &num_platforms);
CHECK_ERROR(err);
cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
err = clGetPlatformIDs(num_platforms, platforms, NULL);
CHECK_ERROR(err);
// Find GPU and CPU platforms
for (cl_uint i = 0; i < num_platforms; i++) {
char platform_name[128];
err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
CHECK_ERROR(err);
if (strstr(platform_name, "Intel(R) OpenCL Graphics") != NULL) {
platform_gpu = platforms[i];
printf("Found GPU platform: %s\n", platform_name);
}
if (strstr(platform_name, "Intel(R) OpenCL") != NULL && strstr(platform_name, "Graphics") == NULL) {
platform_cpu = platforms[i];
printf("Found CPU platform: %s\n", platform_name);
}
if (platform_gpu == NULL || platform_cpu == NULL) {
printf("Warning: Not all platforms found. GPU: %p, CPU: %p\n", (void*)platform_gpu, (void*)platform_cpu);
}
}
free(platforms);
// Run on GPU
run_kernel(platform_gpu, CL_DEVICE_TYPE_GPU, "GPU", false);
// Run on CPU
run_kernel(platform_cpu, CL_DEVICE_TYPE_CPU, "CPU", false);
// Run on GPU
run_kernel(platform_gpu, CL_DEVICE_TYPE_GPU, "GPU", true);
// Run on CPU
run_kernel(platform_cpu, CL_DEVICE_TYPE_CPU, "CPU", true);
return 0;
}
@pvelesko Pay attention to the parameter type, cl_bool
, not a pointer you're using. See the code in my last comment.
@qichaogu
I copy pasted your code - it gives me error 30 - CL_INVALID_VALUE
// Set kernel execution info for USM pointers
cl_bool param = true;
err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(cl_bool), ¶m);
CHECK_ERROR(err);
err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, sizeof(cl_bool), ¶m);
CHECK_ERROR(err);
@pvelesko Your original call to clSetKernelExecInfo with CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL
is good. So, do not change it.
The only thing you need is to add a call to clSetKernelExecInfo with CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL
. So, your code should look like this:
const void* usm_ptrs[] = {data};
err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(void*), usm_ptrs);
CHECK_ERROR(err);
cl_bool param = true;
err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, sizeof(cl_bool), ¶m);
CHECK_ERROR(err);
Ah! Thank you very much it's all working now! @qichaogu
@qichaogu
Still seeing issues with a reproducer that more closely matches our actual use case. In this particular example, using CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL
causes an error on ndrange enqueue
#include <CL/cl.h>
#include <CL/cl_ext_intel.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#define CHECK_ERROR(err) \
if (err != CL_SUCCESS) { \
printf("Error: %d at line %d\n", err, __LINE__); \
exit(1); \
}
// Function pointer types for Intel USM extensions
typedef void* (CL_API_CALL *clDeviceMemAllocINTEL_fn)(cl_context, cl_device_id, const cl_mem_properties_intel*, size_t, cl_uint, cl_int*);
typedef cl_int (CL_API_CALL *clMemFreeINTEL_fn)(cl_context, void*);
typedef cl_int (CL_API_CALL *clEnqueueMemcpyINTEL_fn)(cl_command_queue, cl_bool, void*, const void*, size_t, cl_uint, const cl_event*, cl_event*);
typedef cl_int (CL_API_CALL *clSetKernelArg_fn)(cl_kernel, cl_uint, size_t, const void*);
// Function to read file contents
char* read_file(const char* filename, size_t* length);
int main() {
cl_int err;
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
// Get platform and device
err = clGetPlatformIDs(1, &platform, NULL);
CHECK_ERROR(err);
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
CHECK_ERROR(err);
// Create context and command queue
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
CHECK_ERROR(err);
queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
CHECK_ERROR(err);
// Load and create program with SPIR-V IL
size_t il_length;
const unsigned char* il_binary = (const unsigned char*)read_file("firstTouch.spv", &il_length);
if (!il_binary) {
printf("Failed to read SPIR-V IL file\n");
exit(1);
}
program = clCreateProgramWithIL(context, il_binary, il_length, &err);
CHECK_ERROR(err);
free((void*)il_binary);
err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
if (err != CL_SUCCESS) {
size_t log_size;
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char *) malloc(log_size);
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
printf("Error: Failed to build program. Build log:\n%s\n", log);
free(log);
exit(1);
}
// Create kernel
kernel = clCreateKernel(program, "_Z6setOne4Data", &err);
CHECK_ERROR(err);
// Get USM function pointers
clDeviceMemAllocINTEL_fn clDeviceMemAllocINTEL = (clDeviceMemAllocINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clDeviceMemAllocINTEL");
clMemFreeINTEL_fn clMemFreeINTEL = (clMemFreeINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clMemFreeINTEL");
clEnqueueMemcpyINTEL_fn clEnqueueMemcpyINTEL = (clEnqueueMemcpyINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemcpyINTEL");
// Allocate memory using USM
int *A_d = (int*)clDeviceMemAllocINTEL(context, device, NULL, sizeof(int), 0, &err);
CHECK_ERROR(err);
// Create and initialize Data structure
struct Data {
int *A_d;
} data;
data.A_d = A_d;
// Set kernel argument
err = clSetKernelArg(kernel, 0, sizeof(struct Data), &data);
CHECK_ERROR(err);
// Set kernel execution info for USM pointers
void* usm_ptrs[] = {A_d};
err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(void*), usm_ptrs);
CHECK_ERROR(err);
cl_bool param = true;
err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, sizeof(cl_bool), ¶m);
CHECK_ERROR(err);
// Execute kernel
size_t globalSize = 1;
size_t localSize = 1;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
CHECK_ERROR(err);
// Wait for completion
err = clFinish(queue);
CHECK_ERROR(err);
// Copy result back to host
int A_h[1] = {0};
err = clEnqueueMemcpyINTEL(queue, CL_TRUE, A_h, A_d, sizeof(int), 0, NULL, NULL);
CHECK_ERROR(err);
// Check result
bool Failed = A_h[0] != 1;
printf(Failed ? "FAILED\n" : "PASSED\n");
// Clean up
err = clMemFreeINTEL(context, A_d);
CHECK_ERROR(err);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseDevice(device);
return Failed;
}
// Function to read file contents
char* read_file(const char* filename, size_t* length) {
FILE* file = fopen(filename, "rb");
if (!file) {
return NULL;
}
fseek(file, 0, SEEK_END);
*length = ftell(file);
rewind(file);
char* content = (char*)malloc(*length);
if (!content) {
fclose(file);
return NULL;
}
fread(content, 1, *length, file);
fclose(file);
return content;
}
Kernel:
Is this a bug reported against CPU or GPU runtime? This github is for GPU runtime only.
This seems like a CPU runtime issue. Where should I report this issue to? @AdamCetnerowski
Please use these forums: https://community.intel.com/t5/OpenCL-for-CPU/bd-p/opencl
@pvelesko Your second example uses clDeviceMemAllocINTEL
to allocate Unified Shared Memory specific to the cpu device, so you don't need to call clSetKernelExecInfo
with CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL
. Instead, you need to call clSetKernelExecInfo
with CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL
.
So, the solution for your second example is to replace
err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, sizeof(cl_bool), ¶m);
with
err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL, sizeof(cl_bool), ¶m);
Running the same code on GPU does not cause issues.