intel / compute-runtime

Intel® Graphics Compute Runtime for oneAPI Level Zero and OpenCL™ Driver
MIT License
1.12k stars 229 forks source link

[OpenCL] Using USM on Intel CPU Runtime along with `clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(void*), usm_ptrs);` causes errors #748

Closed pvelesko closed 3 weeks ago

pvelesko commented 1 month ago

Running the same code on GPU does not cause issues.

#include <CL/cl.h>
#include <CL/cl_ext_intel.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CHECK_ERROR(err) \
    if (err != CL_SUCCESS) { \
        printf("Error: %d at line %d\n", err, __LINE__); \
        if (err == -38) { \
            printf("Error description: CL_INVALID_MEM_OBJECT\n"); \
        } \
        else if (err == -52) { \
            printf("Error description: CL_INVALID_KERNEL_ARGS\n"); \
        } \
        else if (err == -59) { \
            printf("Error description: CL_INVALID_WORK_GROUP_SIZE\n"); \
        } \
        else if (err == -54) { \
            printf("Error description: CL_INVALID_WORK_ITEM_SIZE\n"); \
        } \
        exit(1); \
    }

const char* kernelSource = "\n" \
"__kernel void testKernel(__global int* data) {\n" \
"    int gid = get_global_id(0);\n" \
"    data[gid] = gid;\n" \
"}\n";

// Function pointer types for Intel USM extensions
typedef void* (CL_API_CALL *clSharedMemAllocINTEL_fn)(cl_context, cl_device_id, const cl_mem_properties_intel*, size_t, cl_uint, cl_int*);
typedef void* (CL_API_CALL *clDeviceMemAllocINTEL_fn)(cl_context, cl_device_id, const cl_mem_properties_intel*, size_t, cl_uint, cl_int*);
typedef void* (CL_API_CALL *clHostMemAllocINTEL_fn)(cl_context, const cl_mem_properties_intel*, size_t, cl_uint, cl_int*);
typedef cl_int (CL_API_CALL *clMemFreeINTEL_fn)(cl_context, void*);
typedef cl_int (CL_API_CALL *clSetKernelArgMemPointerINTEL_fn)(cl_kernel, cl_uint, const void*);

// Structure to hold USM function pointers
struct USMFunctions {
    clSharedMemAllocINTEL_fn clSharedMemAllocINTEL;
    clDeviceMemAllocINTEL_fn clDeviceMemAllocINTEL;
    clHostMemAllocINTEL_fn clHostMemAllocINTEL;
    clMemFreeINTEL_fn clMemFreeINTEL;
    clSetKernelArgMemPointerINTEL_fn clSetKernelArgMemPointerINTEL;
};

// Function to check if the platform supports USM Intel extensions
bool check_usm_support(cl_platform_id platform) {
    size_t ext_size;
    cl_int err = clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, 0, NULL, &ext_size);
    CHECK_ERROR(err);

    char* extensions = (char*)malloc(ext_size);
    err = clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, ext_size, extensions, NULL);
    CHECK_ERROR(err);

    bool usm_supported = strstr(extensions, "cl_intel_unified_shared_memory") != NULL;
    free(extensions);
    if (!usm_supported) {
        printf("Error: USM is not supported on this platform.\n");
        exit(1);
    }
    return usm_supported;
}

void run_kernel(cl_platform_id platform, cl_device_type device_type, const char* device_name, bool use_usm) {
    cl_int err;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    struct USMFunctions USM;
    void* data;

    printf("Running on %s using %s\n", device_name, use_usm ? "USM" : "SVM");

    // Check if the platform supports USM Intel extensions when USM is requested
    if (use_usm && !check_usm_support(platform)) {
        printf("USM Intel extensions are not supported on this platform. Falling back to SVM...\n");
        use_usm = false;
    }

    err = clGetDeviceIDs(platform, device_type, 1, &device, NULL);
    CHECK_ERROR(err);

    // Create context and command queue
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    CHECK_ERROR(err);
    queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
    CHECK_ERROR(err);

    if (use_usm) {
        // Get USM function pointers
        USM.clSharedMemAllocINTEL = (clSharedMemAllocINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clSharedMemAllocINTEL");
        USM.clDeviceMemAllocINTEL = (clDeviceMemAllocINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clDeviceMemAllocINTEL");
        USM.clHostMemAllocINTEL = (clHostMemAllocINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clHostMemAllocINTEL");
        USM.clMemFreeINTEL = (clMemFreeINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clMemFreeINTEL");
    }

    // Create and build program
    program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
    CHECK_ERROR(err);
    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    CHECK_ERROR(err);

    // Create kernel
    kernel = clCreateKernel(program, "testKernel", &err);
    CHECK_ERROR(err);

    // Allocate memory
    size_t dataSize = 512 * sizeof(int);
    if (use_usm) {
        data = USM.clSharedMemAllocINTEL(context, device, NULL, dataSize, 0, &err);
        CHECK_ERROR(err);
    } else {
        data = clSVMAlloc(context, CL_MEM_READ_WRITE, dataSize, 0);
        if (data == NULL) {
            printf("SVM allocation failed\n");
            exit(1);
        }
    }

    // Set kernel argument
    if (use_usm) {
        USM.clSetKernelArgMemPointerINTEL = (clSetKernelArgMemPointerINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clSetKernelArgMemPointerINTEL");
        err = USM.clSetKernelArgMemPointerINTEL(kernel, 0, data);
        CHECK_ERROR(err);

        // Set kernel execution info for USM pointers
        const void* usm_ptrs[] = {data};
        err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(void*), usm_ptrs);
        CHECK_ERROR(err);
    } else {
        err = clSetKernelArgSVMPointer(kernel, 0, data);
    }
    CHECK_ERROR(err);

    // Execute kernel
    size_t globalSize = 512; // Match the dataSize
    size_t localSize;
    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &localSize, NULL);
    CHECK_ERROR(err);
    // Adjust globalSize to be a multiple of localSize
    globalSize = (globalSize + localSize - 1) / localSize * localSize;
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
    CHECK_ERROR(err);

    // Wait for completion
    err = clFinish(queue);
    CHECK_ERROR(err);

    // Free memory
    if (use_usm) {
        err = USM.clMemFreeINTEL(context, data);
        CHECK_ERROR(err);
    } else {
        clSVMFree(context, data);
    }

    // Clean up
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    clReleaseDevice(device);
}

int main(int argc, char* argv[]) {
    cl_int err;
    cl_platform_id platform_gpu, platform_cpu;
    cl_uint num_platforms;
    bool use_usm = false;

    // Parse command line argument
    if (argc > 1 && strcmp(argv[1], "usm") == 0) {
        use_usm = true;
    }

    // Get platforms
    err = clGetPlatformIDs(0, NULL, &num_platforms);
    CHECK_ERROR(err);

    cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
    err = clGetPlatformIDs(num_platforms, platforms, NULL);
    CHECK_ERROR(err);

    // Find GPU and CPU platforms
    for (cl_uint i = 0; i < num_platforms; i++) {
        char platform_name[128];
        err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        CHECK_ERROR(err);
        if (strstr(platform_name, "Intel(R) OpenCL Graphics") != NULL) {
            platform_gpu = platforms[i];
            printf("Found GPU platform: %s\n", platform_name);
        }
        if (strstr(platform_name, "Intel(R) OpenCL") != NULL && strstr(platform_name, "Graphics") == NULL) {
            platform_cpu = platforms[i];
            printf("Found CPU platform: %s\n", platform_name);
        }
        if (platform_gpu == NULL || platform_cpu == NULL) {
            printf("Warning: Not all platforms found. GPU: %p, CPU: %p\n", (void*)platform_gpu, (void*)platform_cpu);
        }
    }
    free(platforms);

    // Run on GPU
    run_kernel(platform_gpu, CL_DEVICE_TYPE_GPU, "GPU", false);

    // Run on CPU
    run_kernel(platform_cpu, CL_DEVICE_TYPE_CPU, "CPU", false);

    // Run on GPU
    run_kernel(platform_gpu, CL_DEVICE_TYPE_GPU, "GPU", true);

    // Run on CPU
    run_kernel(platform_cpu, CL_DEVICE_TYPE_CPU, "CPU", true);

    return 0;
}
JablonskiMateusz commented 1 month ago

@cdai2

cdai2 commented 1 month ago

We will check this issue.

qichaogu commented 1 month ago

The test fails when running with USM enabled:

    // Run on CPU
    run_kernel(platform_cpu, CL_DEVICE_TYPE_CPU, "CPU", true);

In run_kernel, clEnqueueNDRangeKernel returns error code -59:

err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);

The root cause is that USM allocations may be accessed by kernels indirectly but the kernel is not specified to access any shared USM allocation indirectly. Please refer to https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_unified_shared_memory.html#_using_unified_shared_memory_with_kernels

Solution: After calling clSetKernelExecInfo with CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL , call clSetKernelExecInfo with CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL:

        cl_bool param = true;
        err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, sizeof(cl_bool), &param);
        CHECK_ERROR(err);

BTW, the error code -59 is CL_INVALID_OPERATION, not CL_INVALID_WORK_GROUP_SIZE.

pvelesko commented 1 month ago

@qichaogu

BTW, the error code -59 is CL_INVALID_OPERATION, not CL_INVALID_WORK_GROUP_SIZE.

Fixed.

Added the extra call as you instructed: no change.

#include <CL/cl.h>
#include <CL/cl_ext_intel.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CHECK_ERROR(err) \
    if (err != CL_SUCCESS) { \
        printf("Error: %d at line %d\n", err, __LINE__); \
        if (err == -59) { \
            printf("Error description: CL_INVALID_OPERATION\n"); \
        } \
    }

const char* kernelSource = "\n" \
"__kernel void testKernel(__global int* data) {\n" \
"    int gid = get_global_id(0);\n" \
"    data[gid] = gid;\n" \
"}\n";

// Function pointer types for Intel USM extensions
typedef void* (CL_API_CALL *clSharedMemAllocINTEL_fn)(cl_context, cl_device_id, const cl_mem_properties_intel*, size_t, cl_uint, cl_int*);
typedef void* (CL_API_CALL *clDeviceMemAllocINTEL_fn)(cl_context, cl_device_id, const cl_mem_properties_intel*, size_t, cl_uint, cl_int*);
typedef void* (CL_API_CALL *clHostMemAllocINTEL_fn)(cl_context, const cl_mem_properties_intel*, size_t, cl_uint, cl_int*);
typedef cl_int (CL_API_CALL *clMemFreeINTEL_fn)(cl_context, void*);
typedef cl_int (CL_API_CALL *clSetKernelArgMemPointerINTEL_fn)(cl_kernel, cl_uint, const void*);

// Structure to hold USM function pointers
struct USMFunctions {
    clSharedMemAllocINTEL_fn clSharedMemAllocINTEL;
    clDeviceMemAllocINTEL_fn clDeviceMemAllocINTEL;
    clHostMemAllocINTEL_fn clHostMemAllocINTEL;
    clMemFreeINTEL_fn clMemFreeINTEL;
    clSetKernelArgMemPointerINTEL_fn clSetKernelArgMemPointerINTEL;
};

// Function to check if the platform supports USM Intel extensions
bool check_usm_support(cl_platform_id platform) {
    size_t ext_size;
    cl_int err = clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, 0, NULL, &ext_size);
    CHECK_ERROR(err);

    char* extensions = (char*)malloc(ext_size);
    err = clGetPlatformInfo(platform, CL_PLATFORM_EXTENSIONS, ext_size, extensions, NULL);
    CHECK_ERROR(err);

    bool usm_supported = strstr(extensions, "cl_intel_unified_shared_memory") != NULL;
    free(extensions);
    if (!usm_supported) {
        printf("Error: USM is not supported on this platform.\n");
        exit(1);
    }
    return usm_supported;
}

void run_kernel(cl_platform_id platform, cl_device_type device_type, const char* device_name, bool use_usm) {
    cl_int err;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    struct USMFunctions USM;
    void* data;

    printf("Running on %s using %s\n", device_name, use_usm ? "USM" : "SVM");

    // Check if the platform supports USM Intel extensions when USM is requested
    if (use_usm && !check_usm_support(platform)) {
        printf("USM Intel extensions are not supported on this platform. Falling back to SVM...\n");
        use_usm = false;
    }

    err = clGetDeviceIDs(platform, device_type, 1, &device, NULL);
    CHECK_ERROR(err);

    // Create context and command queue
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    CHECK_ERROR(err);
    queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
    CHECK_ERROR(err);

    if (use_usm) {
        // Get USM function pointers
        USM.clSharedMemAllocINTEL = (clSharedMemAllocINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clSharedMemAllocINTEL");
        USM.clDeviceMemAllocINTEL = (clDeviceMemAllocINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clDeviceMemAllocINTEL");
        USM.clHostMemAllocINTEL = (clHostMemAllocINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clHostMemAllocINTEL");
        USM.clMemFreeINTEL = (clMemFreeINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clMemFreeINTEL");
    }

    // Create and build program
    program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
    CHECK_ERROR(err);
    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    CHECK_ERROR(err);

    // Create kernel
    kernel = clCreateKernel(program, "testKernel", &err);
    CHECK_ERROR(err);

    // Allocate memory
    size_t dataSize = 512 * sizeof(int);
    if (use_usm) {
        data = USM.clSharedMemAllocINTEL(context, device, NULL, dataSize, 0, &err);
        CHECK_ERROR(err);
    } else {
        data = clSVMAlloc(context, CL_MEM_READ_WRITE, dataSize, 0);
        if (data == NULL) {
            printf("SVM allocation failed\n");
            exit(1);
        }
    }

    // Set kernel argument
    if (use_usm) {
        USM.clSetKernelArgMemPointerINTEL = (clSetKernelArgMemPointerINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clSetKernelArgMemPointerINTEL");
        err = USM.clSetKernelArgMemPointerINTEL(kernel, 0, data);
        CHECK_ERROR(err);

        // Set kernel execution info for USM pointers
        const void* usm_ptrs[] = {data};
        err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(void*), usm_ptrs);
        CHECK_ERROR(err);
        err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, sizeof(void*), usm_ptrs);
        CHECK_ERROR(err);
    } else {
        err = clSetKernelArgSVMPointer(kernel, 0, data);
    }
    CHECK_ERROR(err);

    // Execute kernel
    size_t globalSize = 512; // Match the dataSize
    size_t localSize;
    err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &localSize, NULL);
    CHECK_ERROR(err);
    // Adjust globalSize to be a multiple of localSize
    globalSize = (globalSize + localSize - 1) / localSize * localSize;
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
    CHECK_ERROR(err);

    // Wait for completion
    err = clFinish(queue);
    CHECK_ERROR(err);

    // Free memory
    if (use_usm) {
        err = USM.clMemFreeINTEL(context, data);
        CHECK_ERROR(err);
    } else {
        clSVMFree(context, data);
    }

    // Clean up
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    clReleaseDevice(device);
}

int main(int argc, char* argv[]) {
    cl_int err;
    cl_platform_id platform_gpu, platform_cpu;
    cl_uint num_platforms;
    bool use_usm = false;

    // Parse command line argument
    if (argc > 1 && strcmp(argv[1], "usm") == 0) {
        use_usm = true;
    }

    // Get platforms
    err = clGetPlatformIDs(0, NULL, &num_platforms);
    CHECK_ERROR(err);

    cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
    err = clGetPlatformIDs(num_platforms, platforms, NULL);
    CHECK_ERROR(err);

    // Find GPU and CPU platforms
    for (cl_uint i = 0; i < num_platforms; i++) {
        char platform_name[128];
        err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL);
        CHECK_ERROR(err);
        if (strstr(platform_name, "Intel(R) OpenCL Graphics") != NULL) {
            platform_gpu = platforms[i];
            printf("Found GPU platform: %s\n", platform_name);
        }
        if (strstr(platform_name, "Intel(R) OpenCL") != NULL && strstr(platform_name, "Graphics") == NULL) {
            platform_cpu = platforms[i];
            printf("Found CPU platform: %s\n", platform_name);
        }
        if (platform_gpu == NULL || platform_cpu == NULL) {
            printf("Warning: Not all platforms found. GPU: %p, CPU: %p\n", (void*)platform_gpu, (void*)platform_cpu);
        }
    }
    free(platforms);

    // Run on GPU
    run_kernel(platform_gpu, CL_DEVICE_TYPE_GPU, "GPU", false);

    // Run on CPU
    run_kernel(platform_cpu, CL_DEVICE_TYPE_CPU, "CPU", false);

    // Run on GPU
    run_kernel(platform_gpu, CL_DEVICE_TYPE_GPU, "GPU", true);

    // Run on CPU
    run_kernel(platform_cpu, CL_DEVICE_TYPE_CPU, "CPU", true);

    return 0;
}
qichaogu commented 1 month ago

@pvelesko Pay attention to the parameter type, cl_bool, not a pointer you're using. See the code in my last comment.

pvelesko commented 1 month ago

@qichaogu I copy pasted your code - it gives me error 30 - CL_INVALID_VALUE

        // Set kernel execution info for USM pointers
        cl_bool param = true;
        err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(cl_bool), &param);
        CHECK_ERROR(err);
        err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, sizeof(cl_bool), &param);
        CHECK_ERROR(err);
qichaogu commented 1 month ago

@pvelesko Your original call to clSetKernelExecInfo with CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL is good. So, do not change it. The only thing you need is to add a call to clSetKernelExecInfo with CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL. So, your code should look like this:

        const void* usm_ptrs[] = {data};
        err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(void*), usm_ptrs);
        CHECK_ERROR(err);
        cl_bool param = true;
        err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, sizeof(cl_bool), &param);
        CHECK_ERROR(err);
pvelesko commented 1 month ago

Ah! Thank you very much it's all working now! @qichaogu

pvelesko commented 1 month ago

@qichaogu

Still seeing issues with a reproducer that more closely matches our actual use case. In this particular example, using CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL causes an error on ndrange enqueue

#include <CL/cl.h>
#include <CL/cl_ext_intel.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

#define CHECK_ERROR(err) \
    if (err != CL_SUCCESS) { \
        printf("Error: %d at line %d\n", err, __LINE__); \
        exit(1); \
    }

// Function pointer types for Intel USM extensions
typedef void* (CL_API_CALL *clDeviceMemAllocINTEL_fn)(cl_context, cl_device_id, const cl_mem_properties_intel*, size_t, cl_uint, cl_int*);
typedef cl_int (CL_API_CALL *clMemFreeINTEL_fn)(cl_context, void*);
typedef cl_int (CL_API_CALL *clEnqueueMemcpyINTEL_fn)(cl_command_queue, cl_bool, void*, const void*, size_t, cl_uint, const cl_event*, cl_event*);
typedef cl_int (CL_API_CALL *clSetKernelArg_fn)(cl_kernel, cl_uint, size_t, const void*);

// Function to read file contents
char* read_file(const char* filename, size_t* length);

int main() {
    cl_int err;
    cl_platform_id platform;
    cl_device_id device;
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;

    // Get platform and device
    err = clGetPlatformIDs(1, &platform, NULL);
    CHECK_ERROR(err);
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
    CHECK_ERROR(err);

    // Create context and command queue
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    CHECK_ERROR(err);
    queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
    CHECK_ERROR(err);

    // Load and create program with SPIR-V IL
    size_t il_length;
    const unsigned char* il_binary = (const unsigned char*)read_file("firstTouch.spv", &il_length);
    if (!il_binary) {
        printf("Failed to read SPIR-V IL file\n");
        exit(1);
    }

    program = clCreateProgramWithIL(context, il_binary, il_length, &err);
    CHECK_ERROR(err);

    free((void*)il_binary);

    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
        size_t log_size;
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        char *log = (char *) malloc(log_size);
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
        printf("Error: Failed to build program. Build log:\n%s\n", log);
        free(log);
        exit(1);
    }

    // Create kernel
    kernel = clCreateKernel(program, "_Z6setOne4Data", &err);
    CHECK_ERROR(err);

    // Get USM function pointers
    clDeviceMemAllocINTEL_fn clDeviceMemAllocINTEL = (clDeviceMemAllocINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clDeviceMemAllocINTEL");
    clMemFreeINTEL_fn clMemFreeINTEL = (clMemFreeINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clMemFreeINTEL");
    clEnqueueMemcpyINTEL_fn clEnqueueMemcpyINTEL = (clEnqueueMemcpyINTEL_fn)clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemcpyINTEL");

    // Allocate memory using USM
    int *A_d = (int*)clDeviceMemAllocINTEL(context, device, NULL, sizeof(int), 0, &err);
    CHECK_ERROR(err);

    // Create and initialize Data structure
    struct Data {
        int *A_d;
    } data;
    data.A_d = A_d;

    // Set kernel argument
    err = clSetKernelArg(kernel, 0, sizeof(struct Data), &data);
    CHECK_ERROR(err);

    // Set kernel execution info for USM pointers
    void* usm_ptrs[] = {A_d};
    err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(void*), usm_ptrs);
    CHECK_ERROR(err);
    cl_bool param = true;
    err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, sizeof(cl_bool), &param);
    CHECK_ERROR(err);

    // Execute kernel
    size_t globalSize = 1;
    size_t localSize = 1;
    err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
    CHECK_ERROR(err);

    // Wait for completion
    err = clFinish(queue);
    CHECK_ERROR(err);

    // Copy result back to host
    int A_h[1] = {0};
    err = clEnqueueMemcpyINTEL(queue, CL_TRUE, A_h, A_d, sizeof(int), 0, NULL, NULL);
    CHECK_ERROR(err);

    // Check result
    bool Failed = A_h[0] != 1;
    printf(Failed ? "FAILED\n" : "PASSED\n");

    // Clean up
    err = clMemFreeINTEL(context, A_d);
    CHECK_ERROR(err);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    clReleaseDevice(device);

    return Failed;
}

// Function to read file contents
char* read_file(const char* filename, size_t* length) {
    FILE* file = fopen(filename, "rb");
    if (!file) {
        return NULL;
    }

    fseek(file, 0, SEEK_END);
    *length = ftell(file);
    rewind(file);

    char* content = (char*)malloc(*length);
    if (!content) {
        fclose(file);
        return NULL;
    }

    fread(content, 1, *length, file);
    fclose(file);

    return content;
}

Kernel:

firstTouch.spv.zip

AdamCetnerowski commented 1 month ago

Is this a bug reported against CPU or GPU runtime? This github is for GPU runtime only.

pvelesko commented 1 month ago

This seems like a CPU runtime issue. Where should I report this issue to? @AdamCetnerowski

AdamCetnerowski commented 1 month ago

Please use these forums: https://community.intel.com/t5/OpenCL-for-CPU/bd-p/opencl

qichaogu commented 3 weeks ago

@pvelesko Your second example uses clDeviceMemAllocINTEL to allocate Unified Shared Memory specific to the cpu device, so you don't need to call clSetKernelExecInfo with CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL. Instead, you need to call clSetKernelExecInfo with CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL.

So, the solution for your second example is to replace

err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL, sizeof(cl_bool), &param);

with

err = clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL, sizeof(cl_bool), &param);