doe300 / VC4CL

OpenCL implementation running on the VideoCore IV GPU of the Raspberry Pi models
MIT License
729 stars 81 forks source link

clBuildProgram ERROR #42

Open sowson opened 6 years ago

sowson commented 6 years ago

Hi I am working on RPI support on the fork of DarkNet at https://github.com/sowson/darknet in Makefile I enabled RPI=1 and disabled OPENCV=0 after installation of VC4CL and make the project I am trying to run it but without success. One of OpenCL program did not compile in runtime others are fine.

root@raspberrypi:~/cifar# ../darknet/darknet classifier predict cfg/cifar.data cfg/cifar_small_test.cfg backup/cifar_small.backup data/cifar/test/4882_frog.png Device ID: 0 Device name: VideoCore IV GPU Device vendor: Broadcom Device opencl availability: OpenCL 1.2 VC4CL 0.4 Device opencl used: 0.4 Device double precision: NO Device max group size: 12 Device address bits: 32 opencl_load: could not compile. error: CL_UNKNOWN_ERROR CL_PROGRAM_BUILD_LOG: [W] Thu Aug 9 08:06:40 2018: Register conflict resolver has exceeded its maximum rounds, there might still be errors! [E] Thu Aug 9 08:06:40 2018: Error assigning local to register: %call59.%b.1 [E] Thu Aug 9 08:06:40 2018: (1) /usr/local/lib/libVC4CC.so.1.2 : vc4c::CompilationError::CompilationError(vc4c::CompilationStep, std::cxx11::basic_string<char, std::char_traits, std::allocator > const&)+0xac [0x7695f918] [E] Thu Aug 9 08:06:40 2018: (2) /usr/local/lib/libVC4CC.so.1.2 : vc4c::qpu_asm::GraphColoring::toRegisterMap() const+0x150 [0x76a4d3b8] [E] Thu Aug 9 08:06:40 2018: (3) /usr/local/lib/libVC4CC.so.1.2 : vc4c::qpu_asm::CodeGenerator::generateInstructions(vc4c::Method&)+0x474 [0x76a3c900] [E] Thu Aug 9 08:06:40 2018: (4) /usr/local/lib/libVC4CC.so.1.2 : +0x4983ac [0x769613ac] [E] Thu Aug 9 08:06:40 2018: (5) /usr/local/lib/libVC4CC.so.1.2 : +0x498588 [0x76961588] [E] Thu Aug 9 08:06:40 2018: (6) /usr/local/lib/libVC4CC.so.1.2 : +0x49942c [0x7696242c] [E] Thu Aug 9 08:06:40 2018: (7) /usr/local/lib/libVC4CC.so.1.2 : std::function<void (vc4c::Method const&)>::operator()(vc4c::Method const&) const+0x54 [0x76966214] [E] Thu Aug 9 08:06:40 2018: (8) /usr/local/lib/libVC4CC.so.1.2 : void vc4c::BackgroundWorker::scheduleAll<vc4c::Method, std::vector<vc4c::Method, std::allocator<vc4c::Method> > >(std::vector<vc4c::Method, std::allocator<vc4c::Method> > const&, std::function<void (vc4c::Method const&)> const&, std::cxx11::basic_string<char, std::char_traits, std::allocator >)::{lambda()#1}::operator()() const+0xc8 [0x76964a1c] [E] Thu Aug 9 08:06:40 2018: (9) /usr/local/lib/libVC4CC.so.1.2 : std::_Function_handler<void (), void vc4c::BackgroundWorker::scheduleAll<vc4c::Method, std::vector<vc4c::Method, std::allocator<vc4c::Method> > >(std::vector<vc4c::Method, std::allocator<vc4c::Method> > const&, std::function<void (vc4c::Method const&)> const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator >)::{lambda()#1}>::_M_invoke(std::_Any_data const&)+0x24 [0x769698b8] [E] Thu Aug 9 08:06:40 2018: (10) /usr/local/lib/libVC4CC.so.1.2 : std::function<void ()>::operator()() const+0x40 [0x769508e4] [E] Thu Aug 9 08:06:40 2018: (11) /usr/local/lib/libVC4CC.so.1.2 : +0x486a70 [0x7694fa70] [E] Thu Aug 9 08:06:40 2018: (12) /usr/local/lib/libVC4CC.so.1.2 : +0x48735c [0x7695035c] [E] Thu Aug 9 08:06:40 2018: (13) /usr/local/lib/libVC4CC.so.1.2 : +0x4872c8 [0x769502c8] [E] Thu Aug 9 08:06:40 2018: (14) /usr/local/lib/libVC4CC.so.1.2 : +0x4872a0 [0x769502a0] [E] Thu Aug 9 08:06:40 2018: (15) /usr/lib/arm-linux-gnueabihf/libstdc++.so.6 : +0x9c9dc [0x763df9dc] [E] Thu Aug 9 08:06:40 2018: Background worker threw error: Label/Register Mapping: There are erroneous register-associations! [E] Thu Aug 9 08:06:40 2018: While running worker task: CodeGenerator [E] Thu Aug 9 08:06:40 2018: Compiler threw exception: Label/Register Mapping: There are erroneous register-associations!

CODE: attribute((noinline))` float get_pixel_kernel(global float *image, int w, int h, int x, int y, int c); attribute((noinline)) float4 rgb_to_hsv_kernel(float4 rgb); attribute((noinline)) float4 hsv_to_rgb_kernel(float4 hsv); attribute((noinline)) float bilinear_interpolate_kernel(global float image, int w, int h, float x, float y, int c); attribute((noinline)) float get_pixel_kernel(__global float image, int w, int h, int x, int y, int c) { if(x < 0 || x >= w || y < 0 || y >= h) return 0; return image[x + w(y + ch)]; } attribute((noinline)) float4 rgb_to_hsv_kernel(float4 rgb) { float r = rgb.x; float g = rgb.y; float b = rgb.z; float h, s, v; float max = (r > g) ? ( (r > b) ? r : b) : ( (g > b) ? g : b); float min = (r < g) ? ( (r < b) ? r : b) : ( (g < b) ? g : b); float delta = max - min; v = max; if(max == 0){ s = 0; h = -1; }else{ s = delta/max; if(r == max){ h = (g - b) / delta; } else if (g == max) { h = 2 + (b - r) / delta; } else { h = 4 + (r - g) / delta; } if (h < 0) h += 6; } return (float4) (h, s, v, 0.0); } attribute((noinline)) float4 hsv_to_rgb_kernel(float4 hsv) { float h = hsv.x; float s = hsv.y; float v = hsv.z; float r, g, b; float f, p, q, t; if (s == 0) { r = g = b = v; } else { int index = (int) floor(h); f = h - index; p = v(1-s); q = v(1-sf); t = v(1-s(1-f)); if(index == 0){ r = v; g = t; b = p; } else if(index == 1){ r = q; g = v; b = p; } else if(index == 2){ r = p; g = v; b = t; } else if(index == 3){ r = p; g = q; b = v; } else if(index == 4){ r = t; g = p; b = v; } else { r = v; g = p; b = q; } } r = (r < 0) ? 0 : ((r > 1) ? 1 : r); g = (g < 0) ? 0 : ((g > 1) ? 1 : g); b = (b < 0) ? 0 : ((b > 1) ? 1 : b); return (float4)(r, g, b, 0.0); } attribute((noinline)) float bilinear_interpolate_kernel(__global float image, int w, int h, float x, float y, int c) { int ix = (int) floor(x); int iy = (int) floor(y); float dx = x - ix; float dy = y - iy; float val1 = ((1-dy) (1-dx) get_pixel_kernel(image, w, h, ix, iy, c)); float val2 = (dy (1-dx) get_pixel_kernel(image, w, h, ix, iy+1, c)); float val3 = ((1-dy) dx get_pixel_kernel(image, w, h, ix+1, iy, c)); float val4 = (dy dx get_pixel_kernel(image, w, h, ix+1, iy+1, c)); float val = val1 + val2 + val3 + val4; return val; } kernel void levels_image_kernel(global float image, __global float rand, int batch, int w, int h, int train, float saturation, float exposure, float translate, float scale, float shift) { int size = batch w h; int id = (get_group_id(0) + get_group_id(1)get_num_groups(0)) get_local_size(0) + get_local_id(0); if(id >= size) return; int x = id % w; id /= w; int y = id % h; id /= h; float rshift = rand[0]; float gshift = rand[1]; float bshift = rand[2]; float r0 = rand[8id + 0]; float r1 = rand[8id + 1]; float r2 = rand[8id + 2]; float r3 = rand[8id + 3]; saturation = r0(saturation - 1) + 1; saturation = (r1 > .5) ? 1./saturation : saturation; exposure = r2(exposure - 1) + 1; exposure = (r3 > .5) ? 1./exposure : exposure; size_t offset = id h w 3; image += offset; float r = image[x + w(y + h0)]; float g = image[x + w(y + h1)]; float b = image[x + w(y + h2)]; float4 rgb = (float4)(r,g,b, 0.0); if(train){ float4 hsv = rgb_to_hsv_kernel(rgb); hsv.y = saturation; hsv.z = exposure; rgb = hsv_to_rgb_kernel(hsv); } else { shift = 0; } image[x + w(y + h0)] = rgb.xscale + translate + (rshift - .5)shift; image[x + w(y + h1)] = rgb.yscale + translate + (gshift - .5)shift; image[x + w(y + h2)] = rgb.zscale + translate + (bshift - .5)shift; } __kernel void forward_crop_layer_kernel(__global float input, global float *rand, int size, int c, int h, int w, int crop_height, int crop_width, int train, int flip, float angle, global float output) { int id = (get_group_id(0) + get_group_id(1)get_num_groups(0)) get_local_size(0) + get_local_id(0); if(id >= size) return; float cx = w/2.; float cy = h/2.; int count = id; int j = id % crop_width; id /= crop_width; int i = id % crop_height; id /= crop_height; int k = id % c; id /= c; int b = id; float r4 = rand[8b + 4]; float r5 = rand[8b + 5]; float r6 = rand[8b + 6]; float r7 = rand[8b + 7]; float dw = (w - crop_width)r4; float dh = (h - crop_height)r5; flip = (flip && (r6 > .5)); angle = 2angler7 - angle; if(!train){ dw = (w - crop_width)/2.; dh = (h - crop_height)/2.; flip = 0; angle = 0; } input += whcb; float x = (flip!=0) ? w - dw - j - 1 : j + dw; float y = i + dh; float rx = cos(angle)(x-cx) - sin(angle)(y-cy) + cx; float ry = sin(angle)(x-cx) + cos(angle)(y-cy) + cy; output[count] = bilinear_interpolate_kernel(input, w, h, rx, ry, k); }

terminate called without an active exception Aborted

EDIT: attribute is an __attribute__

Can you help? in CODE section above is the code I am trying to compile/build. Thanks!

doe300 commented 6 years ago

The error message says the compiler is unable to map all locals to registers. This is probably due to the fact that the kernel is to complex for the current implementation of register allocation to handle.

A side note: the attribute noinline is very useless on OpenCL functions, since VC4CL (as well as AMD, see here) inlines all functions, and I'm not sure it is even officially supported.

sowson commented 6 years ago

I will try to make sure that functions have only one return. Maybe that will help?

Instead of: float get_pixel_kernel(__global float *image, int w, int h, int x, int y, int c) { if(x < 0 || x >= w || y < 0 || y >= h) return 0; return image[x + w*(y + c*h)]; }

I will use: float get_pixel_kernel(__global float *image, int w, int h, int x, int y, int c) { return (x < 0 || x >= w || y < 0 || y >= h) ? 0 : image[x + w*(y + c*h)]; }

I am testing this right now, I will update this issue once I will make sure it works.

Thanks a lot!

sowson commented 6 years ago

Unfortunately, it is still the issue. On mac and pc, it works just fine. The question is how to reduce complexity and/or make it compile correctly?

root@raspberrypi:~/cifar# ../darknet/darknet classifier predict cfg/cifar.data cfg/cifar_small_test.cfg ../weights/cifar_small.weights data/cifar/test/4882_frog.png Device ID: 0 Device name: VideoCore IV GPU Device vendor: Broadcom Device opencl availability: OpenCL 1.2 VC4CL 0.4 Device opencl used: 0.4 Device double precision: NO Device max group size: 12 Device address bits: 32 opencl_load: could not compile. error: CL_UNKNOWN_ERROR CL_PROGRAM_BUILD_LOG: [W] Thu Aug 9 09:54:27 2018: Register conflict resolver has exceeded its maximum rounds, there might still be errors! [E] Thu Aug 9 09:54:27 2018: Error assigning local to register: %g.1.i [E] Thu Aug 9 09:54:27 2018: (1) /usr/local/lib/libVC4CC.so.1.2 : vc4c::CompilationError::CompilationError(vc4c::CompilationStep, std::cxx11::basic_string<char, std::char_traits, std::allocator > const&)+0xac [0x768f0918] [E] Thu Aug 9 09:54:27 2018: (2) /usr/local/lib/libVC4CC.so.1.2 : vc4c::qpu_asm::GraphColoring::toRegisterMap() const+0x150 [0x769de3b8] [E] Thu Aug 9 09:54:27 2018: (3) /usr/local/lib/libVC4CC.so.1.2 : vc4c::qpu_asm::CodeGenerator::generateInstructions(vc4c::Method&)+0x474 [0x769cd900] [E] Thu Aug 9 09:54:27 2018: (4) /usr/local/lib/libVC4CC.so.1.2 : +0x4983ac [0x768f23ac] [E] Thu Aug 9 09:54:27 2018: (5) /usr/local/lib/libVC4CC.so.1.2 : +0x498588 [0x768f2588] [E] Thu Aug 9 09:54:27 2018: (6) /usr/local/lib/libVC4CC.so.1.2 : +0x49942c [0x768f342c] [E] Thu Aug 9 09:54:27 2018: (7) /usr/local/lib/libVC4CC.so.1.2 : std::function<void (vc4c::Method const&)>::operator()(vc4c::Method const&) const+0x54 [0x768f7214] [E] Thu Aug 9 09:54:27 2018: (8) /usr/local/lib/libVC4CC.so.1.2 : void vc4c::BackgroundWorker::scheduleAll<vc4c::Method, std::vector<vc4c::Method, std::allocator<vc4c::Method> > >(std::vector<vc4c::Method, std::allocator<vc4c::Method> > const&, std::function<void (vc4c::Method const&)> const&, std::cxx11::basic_string<char, std::char_traits, std::allocator >)::{lambda()#1}::operator()() const+0xc8 [0x768f5a1c] [E] Thu Aug 9 09:54:27 2018: (9) /usr/local/lib/libVC4CC.so.1.2 : std::_Function_handler<void (), void vc4c::BackgroundWorker::scheduleAll<vc4c::Method, std::vector<vc4c::Method, std::allocator<vc4c::Method> > >(std::vector<vc4c::Method, std::allocator<vc4c::Method> > const&, std::function<void (vc4c::Method const&)> const&, std::cxx11::basic_string<char, std::char_traits, std::allocator >)::{lambda()#1}>::_M_invoke(std::_Any_data const&)+0x24 [0x768fa8b8] [E] Thu Aug 9 09:54:27 2018: (10) /usr/local/lib/libVC4CC.so.1.2 : std::function<void ()>::operator()() const+0x40 [0x768e18e4] [E] Thu Aug 9 09:54:27 2018: (11) /usr/local/lib/libVC4CC.so.1.2 : +0x486a70 [0x768e0a70] [E] Thu Aug 9 09:54:27 2018: (12) /usr/local/lib/libVC4CC.so.1.2 : +0x48735c [0x768e135c] [E] Thu Aug 9 09:54:27 2018: (13) /usr/local/lib/libVC4CC.so.1.2 : +0x4872c8 [0x768e12c8] [E] Thu Aug 9 09:54:27 2018: (14) /usr/local/lib/libVC4CC.so.1.2 : +0x4872a0 [0x768e12a0] [E] Thu Aug 9 09:54:27 2018: (15) /usr/lib/arm-linux-gnueabihf/libstdc++.so.6 : +0x9c9dc [0x763709dc] [E] Thu Aug 9 09:54:27 2018: Background worker threw error: Label/Register Mapping: There are erroneous register-associations! [E] Thu Aug 9 09:54:27 2018: While running worker task: CodeGenerator [E] Thu Aug 9 09:54:27 2018: Compiler threw exception: Label/Register Mapping: There are erroneous register-associations! CODE: float get_pixel_kernel(global float image, int w, int h, int x, int y, int c); float4 rgb_to_hsv_kernel(float4 rgb); float4 hsv_to_rgb_kernel(float4 hsv); float bilinear_interpolate_kernel(__global float image, int w, int h, float x, float y, int c); float get_pixel_kernel(global float image, int w, int h, int x, int y, int c) { return (x < 0 || x >= w || y < 0 || y >= h) ? 0 : image[x + w(y + ch)]; } float4 rgb_to_hsv_kernel(float4 rgb) { float r = rgb.x; float g = rgb.y; float b = rgb.z; float h, s, v; float max = (r > g) ? ( (r > b) ? r : b) : ( (g > b) ? g : b); float min = (r < g) ? ( (r < b) ? r : b) : ( (g < b) ? g : b); float delta = max - min; v = max; if(max == 0){ s = 0; h = -1; }else{ s = delta/max; if(r == max){ h = (g - b) / delta; } else if (g == max) { h = 2 + (b - r) / delta; } else { h = 4 + (r - g) / delta; } if (h < 0) h += 6; } return (float4) (h, s, v, 0); } float4 hsv_to_rgb_kernel(float4 hsv) { float h = hsv.x; float s = hsv.y; float v = hsv.z; float r, g, b; float f, p, q, t; if (s == 0) { r = g = b = v; } else { int index = (int) floor(h); f = h - index; p = v(1-s); q = v(1-sf); t = v(1-s(1-f)); if(index == 0){ r = v; g = t; b = p; } else if(index == 1){ r = q; g = v; b = p; } else if(index == 2){ r = p; g = v; b = t; } else if(index == 3){ r = p; g = q; b = v; } else if(index == 4){ r = t; g = p; b = v; } else { r = v; g = p; b = q; } } r = (r < 0) ? 0 : ((r > 1) ? 1 : r); g = (g < 0) ? 0 : ((g > 1) ? 1 : g); b = (b < 0) ? 0 : ((b > 1) ? 1 : b); return (float4)(r, g, b, 0.0); } float bilinear_interpolate_kernel(global float image, int w, int h, float x, float y, int c) { int ix = (int) floor(x); int iy = (int) floor(y); float dx = x - ix; float dy = y - iy; float val1 = ((1-dy) (1-dx) get_pixel_kernel(image, w, h, ix, iy, c)); float val2 = (dy (1-dx) get_pixel_kernel(image, w, h, ix, iy+1, c)); float val3 = ((1-dy) dx get_pixel_kernel(image, w, h, ix+1, iy, c)); float val4 = (dy dx get_pixel_kernel(image, w, h, ix+1, iy+1, c)); float val = val1 + val2 + val3 + val4; return val; } kernel void levels_image_kernel(global float image, global float rand, int batch, int w, int h, int train, float saturation, float exposure, float translate, float scale, float shift) { int size = batch w h; int id = (get_group_id(0) + get_group_id(1)get_num_groups(0)) get_local_size(0) + get_local_id(0); if(id >= size) return; int x = id % w; id /= w; int y = id % h; id /= h; float rshift = rand[0]; float gshift = rand[1]; float bshift = rand[2]; float r0 = rand[8id + 0]; float r1 = rand[8id + 1]; float r2 = rand[8id + 2]; float r3 = rand[8id + 3]; saturation = r0(saturation - 1) + 1; saturation = (r1 > .5) ? 1./saturation : saturation; exposure = r2(exposure - 1) + 1; exposure = (r3 > .5) ? 1./exposure : exposure; size_t offset = id h w 3; image += offset; float r = image[x + w(y + h0)]; float g = image[x + w(y + h1)]; float b = image[x + w(y + h2)]; float4 rgb = (float4)(r,g,b, 0.0); if(train){ float4 hsv = rgb_to_hsv_kernel(rgb); hsv.y = saturation; hsv.z = exposure; rgb = hsv_to_rgb_kernel(hsv); } else { shift = 0; } image[x + w(y + h0)] = rgb.xscale + translate + (rshift - .5)shift; image[x + w(y + h1)] = rgb.yscale + translate + (gshift - .5)shift; image[x + w(y + h2)] = rgb.zscale + translate + (bshift - .5)shift; } __kernel void forward_crop_layer_kernel(global float input, __global float rand, int size, int c, int h, int w, int crop_height, int crop_width, int train, int flip, float angle, __global float output) { int id = (get_group_id(0) + get_group_id(1)get_num_groups(0)) get_local_size(0) + get_local_id(0); if(id >= size) return; float cx = w/2.; float cy = h/2.; int count = id; int j = id % crop_width; id /= crop_width; int i = id % crop_height; id /= crop_height; int k = id % c; id /= c; int b = id; float r4 = rand[8b + 4]; float r5 = rand[8b + 5]; float r6 = rand[8b + 6]; float r7 = rand[8b + 7]; float dw = (w - crop_width)r4; float dh = (h - crop_height)r5; flip = (flip && (r6 > .5)); angle = 2angler7 - angle; if(!train){ dw = (w - crop_width)/2.; dh = (h - crop_height)/2.; flip = 0; angle = 0; } input += whcb; float x = (flip!=0) ? w - dw - j - 1 : j + dw; float y = i + dh; float rx = cos(angle)(x-cx) - sin(angle)(y-cy) + cx; float ry = sin(angle)(x-cx) + cos(angle)(y-cy) + cy; output[count] = bilinear_interpolate_kernel(input, w, h, rx, ry, k); } terminate called without an active exception Aborted

sowson commented 6 years ago

From time to time it works... like 1/10 tries and that is strange to me. It is quite fast but is wrong on mac or pc the detection is different and a frog is correctly classified. Thanks a lot for your work, I will be working on that maybe I found how to reduce the complexity of the code?

root@raspberrypi:~/cifar# ../darknet/darknet classifier predict cfg/cifar.data cfg/cifar_small_test.cfg ../weights/cifar_small.weights data/cifar/test/4882_frog.png Device ID: 0 Device name: VideoCore IV GPU Device vendor: Broadcom Device opencl availability: OpenCL 1.2 VC4CL 0.4 Device opencl used: 0.4 Device double precision: NO Device max group size: 12 Device address bits: 32 layer filters size input output 0 conv 32 3 x 3 / 1 28 x 28 x 3 -> 28 x 28 x 32 0.001 BFLOPs 1 max 2 x 2 / 2 28 x 28 x 32 -> 14 x 14 x 32 2 conv 16 1 x 1 / 1 14 x 14 x 32 -> 14 x 14 x 16 0.000 BFLOPs 3 conv 64 3 x 3 / 1 14 x 14 x 16 -> 14 x 14 x 64 0.004 BFLOPs 4 max 2 x 2 / 2 14 x 14 x 64 -> 7 x 7 x 64 5 conv 32 1 x 1 / 1 7 x 7 x 64 -> 7 x 7 x 32 0.000 BFLOPs 6 conv 128 3 x 3 / 1 7 x 7 x 32 -> 7 x 7 x 128 0.004 BFLOPs 7 conv 64 1 x 1 / 1 7 x 7 x 128 -> 7 x 7 x 64 0.001 BFLOPs 8 conv 10 1 x 1 / 1 7 x 7 x 64 -> 7 x 7 x 10 0.000 BFLOPs 9 avg 7 x 7 x 10 -> 10 10 softmax 10 Loading weights from ../weights/cifar_small.weights...Done! data/cifar/test/4882_frog.png: Predicted in 1.647542 seconds. 16.96%: dog 16.44%: deer

doe300 commented 6 years ago

From time to time it works... like 1/10 tries and that is strange to me

Yeah, the register allocation is not deterministic... :) More precisely, the order locals are mapped to registers is indeterministic. And it seems the complexity of your code is in a gray area, where some orders allow all locals to be assigned and some don't.

It is quite fast but is wrong

That is to be expected, the code generated is not yet necessarily completely correct.

sowson commented 6 years ago

I hope below helps you ;-). Thank you and keep going on this!

Device ID: 0 Device name: VideoCore IV GPU Device vendor: Broadcom Device opencl availability: OpenCL 1.2 VC4CL 0.4 Device opencl used: 0.4 Device double precision: NO Device max group size: 12 Device address bits: 32

TEST CPU: sqrt(2.0000000000000000000000) = 1.4142135381698608398438 PASS log(1.4142135381698608398438) = 0.3465735614299774169922 PASS pow(0.3465735614299774169922, 1.4142135381698608398438) = 0.2234459370374679565430 PASS exp(0.2234459370374679565430) = -1.2503780126571655273438 PASS fabs(-1.2503780126571655273438) = 1.2503780126571655273438 PASS sin(1.2503780126571655273438) = 0.9491037130355834960938 FAIL cos(0.9491037130355834960938) = 0.5824118852615356445312 PASS

TEST GPU: sqrt(2.0000000000000000000000) = 1.4142129421234130859375 FAIL log(1.4142129421234130859375) = 0.6493065357208251953125 FAIL pow(0.6493065357208251953125, 1.4142129421234130859375) = 0.8553451895713806152344 FAIL exp(0.8553451895713806152344) = -2.3432912826538085937500 FAIL fabs(-2.3432912826538085937500) = 2.3432912826538085937500 FAIL sin(2.3432912826538085937500) = 2.8538129329681396484375 FAIL cos(2.8538129329681396484375) = 19651868.0000000000000000000000 FAIL

doe300 commented 6 years ago

sqrt(2.0000000000000000000000) = 1.4142129421234130859375 FAIL

This one looks like sqrt is not exact enough (off by 6 ULP, allowed are 4 ULP). log, pow and exp are not correctly implemented yet.

fabs(-2.3432912826538085937500) = 2.3432912826538085937500 FAIL

This makes no sense, since the result is correct...

sowson commented 6 years ago

Tests are feed-forward one, the output of each is given to the input to the next. N = 1.

__kernel void test_kernel(int N, global float *input, global float output, __global float expected) { int index = (get_group_id(0) + get_group_id(1)get_num_groups(0)) get_local_size(0) + get_local_id(0);

if (index >= N) return;

output[index] = sqrt(input[index]);

index += 1;
input[index] = output[index-1];
output[index] = log(input[index]);

index += 1;
input[index] = output[index-1];
output[index] = pow(input[index], output[index-2]);

index += 1;
input[index] = output[index-1];
output[index] = -exp(input[index]);

index += 1;
input[index] = output[index-1];
output[index] = fabs(input[index]);

index += 1;
input[index] = output[index-1];
output[index] = sin(input[index]);

index += 1;
input[index] = output[index-1];
output[index] = cos(input[index]);

}

Thanks!

pengyuan-zhou commented 6 years ago

Hi I am working on RPI support on the fork of DarkNet at https://github.com/sowson/darknet in Makefile I enabled RPI=1 and disabled OPENCV=0 after installation of VC4CL and make the project I am trying to run it but without success. One of OpenCL program did not compile in runtime others are fine.

root@raspberrypi:~/cifar# ../darknet/darknet classifier predict cfg/cifar.data cfg/cifar_small_test.cfg backup/cifar_small.backup data/cifar/test/4882_frog.png Device ID: 0 Device name: VideoCore IV GPU Device vendor: Broadcom Device opencl availability: OpenCL 1.2 VC4CL 0.4 Device opencl used: 0.4 Device double precision: NO Device max group size: 12 Device address bits: 32 opencl_load: could not compile. error: CL_UNKNOWN_ERROR CL_PROGRAM_BUILD_LOG: [W] Thu Aug 9 08:06:40 2018: Register conflict resolver has exceeded its maximum rounds, there might still be errors! [E] Thu Aug 9 08:06:40 2018: Error assigning local to register: %call59.%b.1 [E] Thu Aug 9 08:06:40 2018: (1) /usr/local/lib/libVC4CC.so.1.2 : vc4c::CompilationError::CompilationError(vc4c::CompilationStep, std::cxx11::basic_string<char, std::char_traits, std::allocator > const&)+0xac [0x7695f918] [E] Thu Aug 9 08:06:40 2018: (2) /usr/local/lib/libVC4CC.so.1.2 : vc4c::qpu_asm::GraphColoring::toRegisterMap() const+0x150 [0x76a4d3b8] [E] Thu Aug 9 08:06:40 2018: (3) /usr/local/lib/libVC4CC.so.1.2 : vc4c::qpu_asm::CodeGenerator::generateInstructions(vc4c::Method&)+0x474 [0x76a3c900] [E] Thu Aug 9 08:06:40 2018: (4) /usr/local/lib/libVC4CC.so.1.2 : +0x4983ac [0x769613ac] [E] Thu Aug 9 08:06:40 2018: (5) /usr/local/lib/libVC4CC.so.1.2 : +0x498588 [0x76961588] [E] Thu Aug 9 08:06:40 2018: (6) /usr/local/lib/libVC4CC.so.1.2 : +0x49942c [0x7696242c] [E] Thu Aug 9 08:06:40 2018: (7) /usr/local/lib/libVC4CC.so.1.2 : std::function<void (vc4c::Method const&)>::operator()(vc4c::Method const&) const+0x54 [0x76966214] [E] Thu Aug 9 08:06:40 2018: (8) /usr/local/lib/libVC4CC.so.1.2 : void vc4c::BackgroundWorker::scheduleAll<vc4c::Method, std::vector<vc4c::Method, std::allocatorvc4c::Method > >(std::vector<vc4c::Method, std::allocatorvc4c::Method > const&, std::function<void (vc4c::Method const&)> const&, std::cxx11::basic_string<char, std::char_traits, std::allocator >)::{lambda()#1}::operator()() const+0xc8 [0x76964a1c] [E] Thu Aug 9 08:06:40 2018: (9) /usr/local/lib/libVC4CC.so.1.2 : std::_Function_handler<void (), void vc4c::BackgroundWorker::scheduleAll<vc4c::Method, std::vector<vc4c::Method, std::allocatorvc4c::Method > >(std::vector<vc4c::Method, std::allocatorvc4c::Method > const&, std::function<void (vc4c::Method const&)> const&, std::__cxx11::basic_string<char, std::char_traits, std::allocator >)::{lambda()#1}>::_M_invoke(std::_Any_data const&)+0x24 [0x769698b8] [E] Thu Aug 9 08:06:40 2018: (10) /usr/local/lib/libVC4CC.so.1.2 : std::function<void ()>::operator()() const+0x40 [0x769508e4] [E] Thu Aug 9 08:06:40 2018: (11) /usr/local/lib/libVC4CC.so.1.2 : +0x486a70 [0x7694fa70] [E] Thu Aug 9 08:06:40 2018: (12) /usr/local/lib/libVC4CC.so.1.2 : +0x48735c [0x7695035c] [E] Thu Aug 9 08:06:40 2018: (13) /usr/local/lib/libVC4CC.so.1.2 : +0x4872c8 [0x769502c8] [E] Thu Aug 9 08:06:40 2018: (14) /usr/local/lib/libVC4CC.so.1.2 : +0x4872a0 [0x769502a0] [E] Thu Aug 9 08:06:40 2018: (15) /usr/lib/arm-linux-gnueabihf/libstdc++.so.6 : +0x9c9dc [0x763df9dc] [E] Thu Aug 9 08:06:40 2018: Background worker threw error: Label/Register Mapping: There are erroneous register-associations! [E] Thu Aug 9 08:06:40 2018: While running worker task: CodeGenerator [E] Thu Aug 9 08:06:40 2018: Compiler threw exception: Label/Register Mapping: There are erroneous register-associations!

CODE: attribute((noinline))` float get_pixel_kernel(global float _image, int w, int h, int x, int y, int c); attribute((noinline)) float4 rgb_to_hsv_kernel(float4 rgb); attribute((noinline)) float4 hsv_to_rgb_kernel(float4 hsv); attribute((noinline)) float bilinear_interpolate_kernel(__global float _image, int w, int h, float x, float y, int c); attribute((noinline)) float get_pixel_kernel(global float image, int w, int h, int x, int y, int c) { if(x < 0 || x >= w || y < 0 || y >= h) return 0; return image[x + w(y + c_h)]; } attribute((noinline)) float4 rgb_to_hsv_kernel(float4 rgb) { float r = rgb.x; float g = rgb.y; float b = rgb.z; float h, s, v; float max = (r > g) ? ( (r > b) ? r : b) : ( (g > b) ? g : b); float min = (r < g) ? ( (r < b) ? r : b) : ( (g < b) ? g : b); float delta = max - min; v = max; if(max == 0){ s = 0; h = -1; }else{ s = delta/max; if(r == max){ h = (g - b) / delta; } else if (g == max) { h = 2 + (b - r) / delta; } else { h = 4 + (r - g) / delta; } if (h < 0) h += 6; } return (float4) (h, s, v, 0.0); } attribute((noinline)) float4 hsv_to_rgbkernel(float4 hsv) { float h = hsv.x; float s = hsv.y; float v = hsv.z; float r, g, b; float f, p, q, t; if (s == 0) { r = g = b = v; } else { int index = (int) floor(h); f = h - index; p = v(1-s); q = v(1-sf); t = v(1-s(1-f)); if(index == 0){ r = v; g = t; b = p; } else if(index == 1){ r = q; g = v; b = p; } else if(index == 2){ r = p; g = v; b = t; } else if(index == 3){ r = p; g = q; b = v; } else if(index == 4){ r = t; g = p; b = v; } else { r = v; g = p; b = q; } } r = (r < 0) ? 0 : ((r > 1) ? 1 : r); g = (g < 0) ? 0 : ((g > 1) ? 1 : g); b = (b < 0) ? 0 : ((b > 1) ? 1 : b); return (float4)(r, g, b, 0.0); } attribute((noinline)) float bilinear_interpolate_kernel(global float _image, int w, int h, float x, float y, int c) { int ix = (int) floor(x); int iy = (int) floor(y); float dx = x - ix; float dy = y - iy; float val1 = ((1-dy) (1-dx) get_pixel_kernel(image, w, h, ix, iy, c)); float val2 = (dy (1-dx) get_pixel_kernel(image, w, h, ix, iy+1, c)); float val3 = ((1-dy) dx get_pixel_kernel(image, w, h, ix+1, iy, c)); float val4 = (dy dx get_pixel_kernel(image, w, h, ix+1, iy+1, c)); float val = val1 + val2 + val3 + val4; return val; } kernel void levels_image_kernel(global float _image, global float _rand, int batch, int w, int h, int train, float saturation, float exposure, float translate, float scale, float shift) { int size = batch w h; int id = (get_group_id(0) + get_group_id(1)_get_num_groups(0)) get_local_size(0) + get_local_id(0); if(id >= size) return; int x = id % w; id /= w; int y = id % h; id /= h; float rshift = rand[0]; float gshift = rand[1]; float bshift = rand[2]; float r0 = rand[8_id + 0]; float r1 = rand[8_id + 1]; float r2 = rand[8_id + 2]; float r3 = rand[8_id + 3]; saturation = r0(saturation - 1) + 1; saturation = (r1 > .5) ? 1./saturation : saturation; exposure = r2(exposure - 1) + 1; exposure = (r3 > .5) ? 1./exposure : exposure; size_t offset = id h w 3; image += offset; float r = image[x + w(y + h0)]; float g = image[x + w(y + h1)]; float b = image[x + w(y + h_2)]; float4 rgb = (float4)(r,g,b, 0.0); if(train){ float4 hsv = rgb_to_hsvkernel(rgb); hsv.y = saturation; hsv.z _= exposure; rgb = hsv_to_rgbkernel(hsv); } else { shift = 0; } image[x + w(y + h_0)] = rgb.x_scale + translate + (rshift - .5)shift; image[x + w(y + h_1)] = rgb.y_scale + translate + (gshift - .5)shift; image[x + w(y + h_2)] = rgb.z_scale + translate + (bshift - .5)_shift; } kernel void forward_crop_layer_kernel(__global float _input, global float _rand, int size, int c, int h, int w, int crop_height, int crop_width, int train, int flip, float angle, __global float _output) { int id = (get_group_id(0) + get_group_id(1)_get_num_groups(0)) get_local_size(0) + get_local_id(0); if(id >= size) return; float cx = w/2.; float cy = h/2.; int count = id; int j = id % crop_width; id /= crop_width; int i = id % crop_height; id /= crop_height; int k = id % c; id /= c; int b = id; float r4 = rand[8_b + 4]; float r5 = rand[8_b + 5]; float r6 = rand[8_b + 6]; float r7 = rand[8_b + 7]; float dw = (w - crop_width)_r4; float dh = (h - crop_height)_r5; flip = (flip && (r6 > .5)); angle = 2_angle_r7 - angle; if(!train){ dw = (w - crop_width)/2.; dh = (h - crop_height)/2.; flip = 0; angle = 0; } input += w_h_cb; float x = (flip!=0) ? w - dw - j - 1 : j + dw; float y = i + dh; float rx = cos(angle)(x-cx) - sin(angle)(y-cy) + cx; float ry = sin(angle)(x-cx) + cos(angle)_(y-cy) + cy; output[count] = bilinear_interpolate_kernel(input, w, h, rx, ry, k); }

terminate called without an active exception Aborted

EDIT: attribute is an __attribute__

Can you help? in CODE section above is the code I am trying to compile/build. Thanks!

Hi, Thanks for your great code. However I'm meeting with the same issue you met here. How did you solve it? PS. I'm running on Mac 10.11, and "brew install clrng" doesn't work so I manually built it. Not sure if that's the problem.

Thank you.

zhaodongmcw commented 5 years ago

Did you ever encounter an issue when calling function clBuildProgram on runtime to build .cl file on the Raspberry PI? I found the error when calling function clBuildProgram on runtime. It prompts the following information.

symbol lookup error: /usr/local/lib/libVC4CL.so: undefined symbol: _ZN4vc4c11Precompiler10precompileERSiRSt10unique_ptrISiSt14default_deleteISiEENS_13ConfigurationERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERKNS_8OptionalISDEESH

However I find the symbol in libVC4CC.so and libVC4CL.so.

pi@raspberrypi:~ $ nm /usr/local/lib/libVC4CL.so | grep _ZN4vc4c11Precompiler10precompileERSiRSt10unique

     U _ZN4vc4c11Precompiler10precompileERSiRSt10unique_ptrISiSt14default_deleteISiEENS_13ConfigurationERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERKNS_8OptionalISD_EESH_

pi@raspberrypi:~ $ nm /usr/local/lib/libVC4CC.so | grep _ZN4vc4c11Precompiler10precompileERSiRSt10unique

00842780 T _ZN4vc4c11Precompiler10precompileERSiRSt10unique_ptrISiSt14default_deleteISiEENS_13ConfigurationERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEERKNS_8OptionalISDEESJ

Could you help me? Thank you.