Open ghost opened 5 years ago
One thing I can see is that you should be up to 16 times faster if you actually make use of the native vector width by processing uchar16
instead of uchar
(and then in return run 16 times fewer iterations of the kernel):
/*
 * Adds a scalar brightness offset B to every pixel in A, clamping at 255,
 * processing 16 pixels per work-item via uchar16.
 *
 * Bug fix: unsigned char arithmetic wraps modulo 256 in OpenCL C, so the
 * original test `(A[i]+B) >= (uchar16)255` could never detect overflow
 * (e.g. 250 + 10 yields 4, which compares below 255 and is stored as-is).
 * The built-in add_sat() performs the saturating addition correctly.
 */
__kernel void brightness(__global const uchar16 *A, uchar const B, __global uchar16 *C) {
    int i = get_global_id(0);
    C[i] = add_sat(A[i], (uchar16)(B));
}
This is something that I looked into trying to detect and automatically vectorize such code, but there is no proper implementation yet.
Hello, I have a very similar problem. I bought a Raspberry Pi Zero and started to learn OpenCL. I had never worked with Linux or OpenCL before, so I thought I was doing something wrong on the Linux side. But now I am reading that somebody else has the same problem. I made a very simple kernel which only multiplies two arrays. The kernel is executed 1200000x and I measure the time. The result is about 731.552787 ms, which is very long. When I perform the same operation on the CPU, as you can see in the code, I get a result of 106.329344 ms; that means the CPU is almost 7 times faster than the GPU.
I have built kernel and main by:
vc4c --bin -o kernel.clbin kernel.cl
gcc -o main main.c -l OpenCL
I know that I can use float16, but still the GPU should be 10x faster than the CPU, and it is not. Am I doing something wrong, or is this the standard performance of the VideoCore IV on the Raspberry Pi Zero? Or is there a possibility that I have badly built and installed VC4C, VC4CL and VC4CLStdLib or other software? Thank you for your support.
Kernel:
/* Element-wise product: C[gid] = A[gid] * B[gid], one float per work-item. */
__kernel void multiplication(__global float* A, __global float* B, __global float* C)
{
    const size_t gid = get_global_id(0);
    C[gid] = A[gid] * B[gid];
}
main:
#include <stdio.h>
#include <stdlib.h>
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/cl.h>
#include <time.h>
#define ARRAY_LENGTH 1200000
#define MAX_BINARY_SIZE (0x100000)
int main()
{
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem Amobj = NULL;
cl_mem Bmobj = NULL;
cl_mem Cmobj = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
cl_event ev;
cl_ulong measurement_start, measurement_stop;
struct timespec mes_start, mes_stop;
cl_double duration;
cl_uint i, j;
cl_float *A;
cl_float *B;
cl_float *C;
printf("Open kernel bin\n");
FILE *fp;
char fileName[] = "./kernel.clbin";
size_t binary_size;
unsigned char *binary_buf;
cl_int binary_status;
printf("Load kernel bin\n");
/* Load kernel binary */
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
binary_buf = (unsigned char *)malloc(MAX_BINARY_SIZE);
binary_size = fread(binary_buf, 1, MAX_BINARY_SIZE, fp);
fclose(fp);
printf("Kernel binary length: %d byte \n", binary_size);
A = (cl_float *)malloc(ARRAY_LENGTH*sizeof(cl_float));
B = (cl_float *)malloc(ARRAY_LENGTH*sizeof(cl_float));
C = (cl_float *)malloc(ARRAY_LENGTH*sizeof(cl_float));
/* Initialize input data */
printf("Creating input data\n");
for (i=0; i < ARRAY_LENGTH; i++)
{
A[i] = i;
B[i] = i;
}
/* Get Platform/Device Information */
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
printf("Number of platforms: %d \n", ret_num_platforms);
printf("Number of devices: %d \n", ret_num_devices);
/* Create OpenCL Context */
printf("Create OpenCL Context\n");
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
/* Create command queue */
printf("Create command queue\n");
command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
/* Create Buffer Object */
printf("Create Buffer Object\n");
Amobj = clCreateBuffer(context, CL_MEM_READ_WRITE, ARRAY_LENGTH * sizeof(cl_float), NULL, &ret);
Bmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, ARRAY_LENGTH * sizeof(cl_float), NULL, &ret);
Cmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, ARRAY_LENGTH * sizeof(cl_float), NULL, &ret);
/* Copy input data to the memory buffer */
printf("Copy input data to the memory buffer\n");
ret = clEnqueueWriteBuffer(command_queue, Amobj, CL_TRUE, 0, ARRAY_LENGTH*sizeof(cl_float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, Bmobj, CL_TRUE, 0, ARRAY_LENGTH*sizeof(cl_float), B, 0, NULL, NULL);
/* Create kernel program from bin file*/
printf("Create kernel program from bin file\n");
program = clCreateProgramWithBinary(context, 1, &device_id, (const size_t *)&binary_size, (const unsigned char **)&binary_buf, &binary_status, &ret);
/* Build Kernel Program */
printf("Build Kernel Program\n");
clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
/* Create data parallel OpenCL kernel */
printf("Create data parallel OpenCL kernel\n");
kernel = clCreateKernel(program, "multiplication", &ret);
/* Set OpenCL kernel arguments */
printf("Set OpenCL kernel arguments\n");
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&Amobj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&Bmobj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&Cmobj);
size_t global_item_size = ARRAY_LENGTH;
size_t local_item_size = 12;
/* Execute OpenCL kernel as data parallel */
printf("Execute OpenCL kernel as data parallel\n");
printf("Execute OpenCL kernel as data parallel %d x \n", i);
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, &ev);
clWaitForEvents(1,&ev);
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &measurement_start, NULL);
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &measurement_stop, NULL);
duration = (cl_double)(measurement_stop - measurement_start)*(cl_double)(1e-06);
printf("Execution time %f ms \n", duration);
/* Transfer result to host */
printf("Transfer result to host\n");
ret = clEnqueueReadBuffer(command_queue, Cmobj, CL_TRUE, 0, ARRAY_LENGTH*sizeof(cl_float), C, 0, NULL, NULL);
clock_gettime(CLOCK_REALTIME, &mes_start);
for(i = 0; i < ARRAY_LENGTH; i++)
{
C[i] = A[i] * B[i];
}
clock_gettime(CLOCK_REALTIME, &mes_stop);
duration =((mes_stop.tv_nsec + (cl_double)mes_stop.tv_sec* 1000000000) - (mes_start.tv_nsec + (cl_double)mes_start.tv_sec * 1000000000))/1000000;
printf("Execution time %f ms \n", duration);
/* Display Results (only 10 last elements) */
printf("Display Results\n");
for (i = ARRAY_LENGTH-10; i < ARRAY_LENGTH; i++)
{
printf("%f x %f = %f \n", A[i], B[i], C[i]);
}
/* Finalization */
printf("Finalization\n");
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(Amobj);
ret = clReleaseMemObject(Bmobj);
ret = clReleaseMemObject(Cmobj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
Martin
I know that I can use float16 but still the GPU should be 10x faster than CPU ant it is not.
It probably won't ever be 10x faster than the CPU value unless you actually use the parallelization features of the GPU.
Lets try to do some estimations:
The superior processing power of the GPU (24GFLOPS GPU vs. 1GFLOP CPU theoretical maximum) can only be used when the parallelization features (16-way SIMD vector on 12 QPU processors and preferably using both ALUs) are actually utilized.
Lets assume we use float16
vectors instead, i.e. this kernel code:
/* Element-wise product on 16-wide vectors: each work-item multiplies one float16. */
__kernel void multiplication(__global float16* A, __global float16* B, __global float16* C)
{
    const size_t gid = get_global_id(0);
    C[gid] = A[gid] * B[gid];
}
This gives us following approximation:
You won't reach that theoretical time, because a large part of the original 700 ms will be CPU-side and scheduling overhead, but I assume you should be able to significantly lower the execution time.
Thank you very much for your explanation. Now it makes sense; I didn't know that there is such a big overhead. I read that the VideoCore IV has a maximum processing power of 24 GFLOPS, and I expected some overhead, but I still thought that I would be able to multiply 3 giga-samples per second.
Thank you
The maximum actual performance I ever measured was just above 8GFLOPs running the clpeak floating-point test. So from the calculation you can definitively achieve that.
The problem with your kernel is that you have 3 memory accesses (2 reads and 1 write) per handful of arithmetic calculations. So most of the time will be spent waiting for I/O, since memory access from the VideoCore IV is not that fast.
Now I have modified the kernel so that I removed the memory operations, and the time for 1.2M operations was 28 ms. Memory operations are very expensive. But why does the CPU have faster memory access? It is the same memory. Can I somehow speed up memory access?
Thank you.
Hey, I'm trying to increase the brightness of a greyscale image using the kernel below. My problem is that I want to execute this operation on a 640x480 image at 25 fps; this means it can't take more than roughly 15 ms, but the execution of this kernel takes far too long.
Here are the results I got using OpenCL's profiling events for the kernel below: Frames captured: 100 Average FPS: 9.2 Average time per frame: 109.21 ms Average processing time: 96.21 ms OpenCL clEnqueueWriteBuffer: 1.792 ms <---- writing the input array to the GPU memory OpenCL Kernel execution time: 85.851 ms <---- kernel execution time OpenCL clEnqueueReadBuffer: 1.581 ms <---- reading the GPU output memory into the output array
even more strange is the execution time it took when I changed the line C[i] = (A[i]+B) >= 255 ? 255 : (A[i]+B) to C[i] = A[i];
Frames captured: 160 Average FPS: 5.0 Average time per frame: 199.57 ms Average processing time: 187.72 ms OpenCL clEnqueueWriteBuffer: 1.266 ms
OpenCL Kernel execution time: 177.103 ms
OpenCL clEnqueueReadBuffer: 1.656 ms
the Kernel:
snippets of the cpp file (not including the GPU setup code) :
What am I doing wrong?
Thanks FMaier