beehive-lab / TornadoVM

TornadoVM: A practical and efficient heterogeneous programming framework for managed languages
https://www.tornadovm.org
Apache License 2.0
1.18k stars 113 forks source link

Compilation error with a nested for and if blocks. #381

Open ShAlireza opened 5 months ago

ShAlireza commented 5 months ago

Describe the bug

The generated OpenCL code has a syntax error.

How To Reproduce

Following is my kernel in Java.

public static void scan(KernelContext context, IntArray input, IntArray sum) {
    int[] temp = context.allocateIntLocalArray(4 * 128);
    int gid2 = context.globalIdx << 1;
    int group = context.groupIdx;
    int item = context.localIdx;
    int n = context.localGroupSizeX << 1;

    temp[2 * item] = input.get(gid2);
    temp[2 * item + 1] = input.get(gid2 + 1);

    int decale = 1;

    for (int d = n >> 1; d > 0; d = d >> 1) {
        context.localBarrier();
        if (item < d) {
            int ai = decale * ((item << 1) + 1) - 1;
            int bi = decale * ((item << 1) + 2) - 1;
            temp[bi] += temp[ai];
        }
        decale = decale << 1;
    }

    if (item == 0) {
        sum.set(group, temp[n - 1]);
        temp[n - 1] = 0;
    }

    for (int d = 1; d < n; d = d << 1) {
        decale = decale >> 1;
        context.localBarrier();
        if (item < d) {
            int ai = decale * ((item << 1) + 1) - 1;
            int bi = decale * ((item << 1) + 2) - 1;
            int t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    context.localBarrier();

    input.set(gid2, temp[item << 1]);
    input.set(gid2 + 1, temp[(item << 1) + 1]);
}

Expected behavior

Successfully compile the code to OpenCL kernel without any error.

Computing system setup (please complete the following information):

Additional context

This is the generated OpenCL code which is wrong.

#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
__kernel void scan(__global long *_kernel_context, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics, __global uchar *input, __global uchar *sum)
{
  ulong ul_8, ul_43, ul_1, ul_0, ul_15;
  long l_14, l_13, l_42, l_41, l_7, l_6;
  int i_25, i_27, i_28, i_29, i_30, i_31, i_32, i_17, i_18, i_19, i_20, i_21, i_22, i_23, i_24, i_9, i_10, i_11, i_12, i_16, i_3, i_4, i_5, i_57, i_58, i_59, i_60, i_49, i_50, i_51, i_52, i_53, i_54, i_55, i_56, i_44, i_45, i_46, i_47, i_33, i_34, i_35, i_37, i_38, i_39, i_40;
  bool b_36, b_26, b_48;

  // BLOCK 0
  ul_0  =  (ulong) input;
  ul_1  =  (ulong) sum;
  __local int adi_2[512];
  i_3  =  get_global_id(0);
  i_4  =  i_3 << 1;
  i_5  =  i_4 + 6;
  l_6  =  (long) i_5;
  l_7  =  l_6 << 2;
  ul_8  =  ul_0 + l_7;
  i_9  =  *((__global int *) ul_8);
  i_10  =  get_local_id(0);
  i_11  =  i_10 << 1;
  adi_2[i_11]  =  i_9;
  i_12  =  i_4 + 7;
  l_13  =  (long) i_12;
  l_14  =  l_13 << 2;
  ul_15  =  ul_0 + l_14;
  i_16  =  *((__global int *) ul_15);
  i_17  =  i_11 + 1;
  adi_2[i_17]  =  i_16;
  i_18  =  i_11 + 2;
  i_19  =  get_local_size(0);
  i_20  =  i_19 << 1;
  i_21  =  i_20 >> 1;
  // BLOCK 1 MERGES [0 5 ]
  i_22  =  1;
  i_23  =  i_21;
  for(;i_23 >= 1;)
  {
    // BLOCK 2
    barrier(CLK_LOCAL_MEM_FENCE);
    i_24  =  i_23 >> 1;
    i_25  =  i_22 << 1;
    b_26  =  i_10 < i_23;
    if(b_26)
    {
      // BLOCK 3
      i_27  =  i_18 * i_22;
      i_28  =  i_27 + -1;
      i_29  =  adi_2[i_28];
      i_30  =  i_22 * i_17;
      i_31  =  i_30 + -1;
      i_32  =  adi_2[i_31];
      i_33  =  i_29 + i_32;
      adi_2[i_28]  =  i_33;
    }  // B3
    else
    {
      // BLOCK 4
    }  // B4
    // BLOCK 5 MERGES [4 3 ]
    i_34  =  i_25;
    i_35  =  i_24;
    i_22  =  i_34;
    i_23  =  i_35;
  }  // B5
  // BLOCK 6
  b_36  =  i_10 == 0;
  if(b_36)
  {
    // BLOCK 7
    i_37  =  i_20 + -1;
    i_38  =  adi_2[i_37];
    i_39  =  get_group_id(0);
    i_40  =  i_39 + 6;
    l_41  =  (long) i_40;
    l_42  =  l_41 << 2;
    ul_43  =  ul_1 + l_42;
    *((__global int *) ul_43)  =  i_38;
    adi_2[i_37]  =  0;
  }  // B7
  else
  {
    // BLOCK 8
  }  // B8
  // BLOCK 9 MERGES [8 7 ]
  // BLOCK 10 MERGES [9 14 ]
  i_44  =  i_22;
  i_45  =  1;
  for(;i_45 < i_20;)
  {
    // BLOCK 11
    barrier(CLK_LOCAL_MEM_FENCE);
    i_46  =  i_45 << 1;
    i_47  =  i_44 >> 1;
    b_48  =  i_10 < i_45;
    if(b_48)
    {
      // BLOCK 12
      i_49  =  i_47 * i_17;
      i_50  =  i_49 + -1;
      i_51  =  adi_2[i_50];
      i_52  =  i_18 * i_47;
      i_53  =  i_52 + -1;
      i_54  =  adi_2[i_53];
      adi_2[i_50]  =  i_54;
      i_55  =  adi_2[i_53];
      i_56  =  i_51 + i_55;
      adi_2[i_53]  =  i_56;
    }  // B12
    else
    {
      // BLOCK 13
    }  // B13
    // BLOCK 14 MERGES [13 12 ]
    i_57  =  i_47;
    i_58  =  i_46;
    i_44  =  i_57;
    i_45  =  i_58;
  }  // B14
  // BLOCK 15
  barrier(CLK_LOCAL_MEM_FENCE);
  i_59  =  adi_2[i_11];
  *((__global int *) ul_8)  =  i_59;
  i_60  =  adi_2[i_17];
  *((__global int *) ul_15)  =  i_60;
  return;
}  // B15
}  //  kernel

jjfumero commented 5 months ago

Thank you for the report. We will work on this.

mikepapadim commented 5 months ago

@ShAlireza can you also provide the exact input sizes that you run the scan kernel? Thanks

ShAlireza commented 5 months ago

Sure. IntArray input = new IntArray(256 128 4); IntArray sum = new IntArray(256);

And here is my GridScheduler: int totalLocalScanItems = 256 128 4 / 2; int localItems = totalLocalScanItems / 256;

    WorkerGrid scanWorker = new WorkerGrid1D(totalLocalScanItems);
    scanWorker.setLocalWork(localItems, 1, 1);
mikepapadim commented 5 months ago

Thank you @ShAlireza, I managed to reprorduce it locally. We will work on the issue and we will get back to you.