Closed TaisZ closed 9 months ago
Thank you @TaisZ for the report. Which GPU are you using? It seems there is a bug in the second kernel that TornadoVM generates for reductions, but it works fine on GPUs up to the RTX 2060.
RTX 3080
Ok. Can you please report the generated kernels? Let's focus on OpenCL for now. You can visualize the kernels with the `--printKernel` option of the `tornado` command.
The generated kernels of OpenCL are as follows:
$ tornado --printKernel -m tornado.examples/uk.ac.manchester.tornado.examples.reductions.PiComputation
WARNING: Using incubator modules: jdk.incubator.vector, jdk.incubator.foreign
Size = 8192
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
// TornadoVM-generated OpenCL kernel for the PiComputation reduction,
// quoted verbatim from the failing run. Review comments added to flag the
// constructs that plausibly trigger the clBuildProgram -9999 failure.
__kernel void computePi(__global long *_kernel_context, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics, __global uchar *input, __global uchar *result)
{
int i_33, i_4, i_6, i_5, i_40, i_7, i_39, i_44, i_43, i_14, i_13, i_18, i_49, i_22, i_21, i_26, i_28, i_27, i_30, i_29;
long l_17, l_16, l_35, l_34, l_45, l_15, l_47, l_46, l_9, l_8, l_10, l_36;
bool z_31, z_41;
ulong ul_1, ul_3, ul_0, ul_48, ul_11;
float f_32, f_12, f_25, f_42, f_23, f_24, f_37, f_38, f_19, f_20;
// BLOCK 0
ul_0 = (ulong) input;   // base address of the input array object
ul_1 = (ulong) result;  // base address of the result array object
// NOTE(review): this scratch buffer is declared __private (per work-item),
// yet BLOCK 4/5 below performs a cross-work-item tree reduction through it
// with barriers. That pattern is only meaningful on __local memory; on
// __private memory each work-item reads only its own (stale) copy.
__private float ul_2[916];
ul_3 = ul_1 + 24L;  // +24 skips the Java array object header
*((__global float *) ul_3) = 0.0F;  // zero the final result slot
i_4 = get_global_size(0);
i_5 = get_global_id(0);
i_6 = i_5 + 1;  // series index starts at global_id + 1
// BLOCK 1 MERGES [0 11 ]
i_7 = i_6;
// Outer loop strides by get_global_size(0) per iteration (see BLOCK 8/11),
// so each work-item covers indices global_id+1, global_id+1+gsize, ...
for(;i_7 < 8192;)
{
// BLOCK 2
l_8 = (long) i_7;
l_9 = l_8 << 2;     // element index -> byte offset (floats are 4 bytes)
l_10 = l_9 + 24L;   // + object header
ul_11 = ul_0 + l_10;
f_12 = *((__global float *) ul_11);  // load input[i_7]
i_13 = get_local_id(0);
i_14 = get_local_size(0);
l_15 = (long) i_13;
// NOTE(review): l_17 = local_id*4 + 24 is a BYTE-style offset, but ul_2 is
// indexed by ELEMENT below (ul_2[l_17]). Slots are therefore 4 elements
// apart with a 24-element base; for local sizes >= ~224 this overruns the
// 916-element buffer — TODO confirm against the device's work-group size.
l_16 = l_15 << 2;
l_17 = l_16 + 24L;
i_18 = i_7 + 1;
f_19 = (float) i_18;
// Leibniz series term: (-1)^(i+1) / (2*i - 1)
f_20 = pow(-1.0F, f_19);
i_21 = i_7 << 1;
i_22 = i_21 + -1;
f_23 = (float) i_22;
f_24 = f_20 / f_23;
f_25 = f_24 + f_12;
ul_2[l_17] = f_25;  // stash this work-item's partial sum
// i_28 = local_size / 2 (sign-correct division via shift of i_26+i_14)
i_26 = i_14 >> 31;
i_27 = i_26 + i_14;
i_28 = i_27 >> 1;
// BLOCK 3 MERGES [2 7 ]
i_29 = i_28;
// Tree reduction: halve the active stride each pass.
for(;i_29 >= 1;)
{
// BLOCK 4
barrier(CLK_LOCAL_MEM_FENCE);
i_30 = i_29 >> 1;
z_31 = i_13 < i_29;
if(z_31)
{
// BLOCK 5
f_32 = ul_2[l_17];
i_33 = i_29 + i_13;
l_34 = (long) i_33;
// NOTE(review): partner offset l_36 = (local_id+stride)*4 + 24 — same
// byte-style element indexing as l_17 above.
l_35 = l_34 << 2;
l_36 = l_35 + 24L;
f_37 = ul_2[l_36];
f_38 = f_32 + f_37;
ul_2[l_17] = f_38;  // accumulate partner's partial into this slot
} // B5
else
{
// BLOCK 6
} // B6
// BLOCK 7 MERGES [6 5 ]
i_39 = i_30;
i_29 = i_39;
} // B7
// BLOCK 8
// NOTE(review): this barrier sits inside the outer for-loop, whose trip
// count differs across work-items when 8191 is not a multiple of the
// global size — a barrier in divergent control flow is undefined behavior
// in OpenCL and could explain the device-side failures reported below.
barrier(CLK_GLOBAL_MEM_FENCE);
i_40 = i_4 + i_7;  // advance by the global size (grid-stride step)
z_41 = i_13 == 0;
if(z_41)
{
// BLOCK 9
// Work-item 0 publishes the group's sum. ul_2[24L] is the slot for
// local_id 0 (l_17 = 0*4 + 24), consistent with the indexing above.
f_42 = ul_2[24L];
i_43 = get_group_id(0);
i_44 = i_43 + 1;  // partials land at result[1..numGroups]
l_45 = (long) i_44;
l_46 = l_45 << 2;
l_47 = l_46 + 24L;
ul_48 = ul_1 + l_47;
*((__global float *) ul_48) = f_42;
} // B9
else
{
// BLOCK 10
} // B10
// BLOCK 11 MERGES [10 9 ]
i_49 = i_40;
i_7 = i_49;
} // B11
// BLOCK 12
return;
} // kernel
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
// Final sequential reduction generated by TornadoVM: sums the nine partial
// results stored as floats at byte offsets 24..56 of `array` (offset 24
// skips the Java array object header) and writes the total back into the
// first slot (offset 24). Accumulation is strictly left-to-right, matching
// the fully-unrolled form the compiler emits.
__kernel void rAdd(__global long *_kernel_context, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics, __global uchar *array, __private int size)
{
ulong base = (ulong) array;
// Seed with the first partial (slot 0), then fold in slots 1..8 in order.
float total = *((__global float *) (base + 24L));
for (int slot = 1; slot < 9; slot++)
{
ulong addr = base + 24L + ((ulong) slot << 2);  // slot -> byte address
total += *((__global float *) addr);
}
// Store the grand total over the first partial's slot.
*((__global float *) (base + 24L)) = total;
return;
} // kernel
[TornadoVM-OCL-JNI] ERROR : clBuildProgram -> Returned: -9999
Error during code compilation with the OpenCL driver
[Bailout] Running the sequential implementation. Enable --debug to see the reason.
[TornadoVM-OCL-JNI] ERROR : clFinish -> Returned: -36
PI VALUE: 3.1417189
Exception in thread "main" uk.ac.manchester.tornado.api.exceptions.TornadoRuntimeException: [ERROR] TornadoVM Bytecode not recognized
at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.TornadoVM.throwError(TornadoVM.java:667)
at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.TornadoVM.execute(TornadoVM.java:782)
at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.TornadoVM.execute(TornadoVM.java:217)
at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.scheduleInner(TornadoTaskGraph.java:798)
at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.schedule(TornadoTaskGraph.java:1213)
at tornado.api@0.15.1-dev/uk.ac.manchester.tornado.api.TaskGraph.execute(TaskGraph.java:782)
at tornado.api@0.15.1-dev/uk.ac.manchester.tornado.api.ImmutableTaskGraph.execute(ImmutableTaskGraph.java:73)
at java.base/java.util.ArrayList.forEach(ArrayList.java:1511)
at tornado.api@0.15.1-dev/uk.ac.manchester.tornado.api.TornadoExecutionPlan$TornadoExecutor.execute(TornadoExecutionPlan.java:307)
at tornado.api@0.15.1-dev/uk.ac.manchester.tornado.api.TornadoExecutionPlan.execute(TornadoExecutionPlan.java:126)
at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.tasks.ReduceTaskGraph.executeExpression(ReduceTaskGraph.java:647)
at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.runReduceTaskGraph(TornadoTaskGraph.java:1084)
at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.analyzeSkeletonAndRun(TornadoTaskGraph.java:1111)
at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.schedule(TornadoTaskGraph.java:1194)
at tornado.api@0.15.1-dev/uk.ac.manchester.tornado.api.TaskGraph.execute(TaskGraph.java:782)
at tornado.api@0.15.1-dev/uk.ac.manchester.tornado.api.ImmutableTaskGraph.execute(ImmutableTaskGraph.java:73)
at java.base/java.util.ArrayList.forEach(ArrayList.java:1511)
at tornado.api@0.15.1-dev/uk.ac.manchester.tornado.api.TornadoExecutionPlan$TornadoExecutor.execute(TornadoExecutionPlan.java:307)
at tornado.api@0.15.1-dev/uk.ac.manchester.tornado.api.TornadoExecutionPlan.execute(TornadoExecutionPlan.java:126)
at tornado.examples@0.15.1-dev/uk.ac.manchester.tornado.examples.reductions.PiComputation.run(PiComputation.java:73)
at tornado.examples@0.15.1-dev/uk.ac.manchester.tornado.examples.reductions.PiComputation.main(PiComputation.java:90)
[JNI] uk.ac.manchester.tornado.drivers.opencl> notify error:
[JNI] uk.ac.manchester.tornado.drivers.opencl> Unknown error executing clFlush on NVIDIA GeForce RTX 3080 (Device 0).
I could reproduce the error. We will work on it.
This issue has been solved: https://github.com/beehive-lab/TornadoVM/commit/5f14754aafb8239e9ef5fcd7925325b3b993d2c1
Describe the bug
TornadoVM gives a compilation failed error when running the PiComputation example. Both OpenCL and PTX backends can throw this error.
How To Reproduce
For OpenCL backend:
For PTX backend:
Output
For OpenCL backend:
For PTX backend:
Computing system setup (please complete the following information):