inducer / loopy

A code generator for array-based code on CPUs and GPUs
http://mathema.tician.de/software/loopy
MIT License
580 stars 70 forks source link

[bug] Incorrect codegen for global temporaries with base storages #737

Closed kaushikcfd closed 1 year ago

kaushikcfd commented 1 year ago

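Consider the following kernel, which declares two global temporaries (`tmp0` and `tmp1`) backed by distinct base storages (`base1` and `base2`) and separates their uses with global barriers: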

import loopy as lp

knl = lp.make_kernel(
    "{[r0, r1]: 0<=r0,r1<10}",
    """
    tmp0 = sum(r0, r0**2)
    ... gbarrier
    tmp1 = sum(r1, r1**3)
    ... gbarrier
    out = tmp0 + tmp1
    """,
    [lp.TemporaryVariable("tmp0",
                          shape=lp.auto,
                          address_space=lp.AddressSpace.GLOBAL,
                          base_storage="base1"),
     lp.TemporaryVariable("tmp1",
                          shape=lp.auto,
                          address_space=lp.AddressSpace.GLOBAL,
                          base_storage="base2"),
     ...],
    seq_dependencies=True
)

print(lp.generate_code_v2(knl).device_code())

Printing the device code for this kernel gives:

/// .. <Some code above>

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global int *__restrict__ base1)
{
  int acc_r0;
  __global int *const __restrict__ tmp0 = (__global int *const __restrict__ ) (base1 + 0);
  __global int *const __restrict__ tmp1 = (__global int *const __restrict__ ) (base2 + 0);

  acc_r0 = 0;
  for (int r0 = 0; r0 <= 9; ++r0)
    acc_r0 = acc_r0 + r0 * r0;
  tmp0[0] = acc_r0;
}

__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel_0(__global int *__restrict__ base2)
{
  int acc_r1;
  __global int *const __restrict__ tmp0 = (__global int *const __restrict__ ) (base1 + 0);
  __global int *const __restrict__ tmp1 = (__global int *const __restrict__ ) (base2 + 0);

  acc_r1 = 0;
  for (int r1 = 0; r1 <= 9; ++r1)
    acc_r1 = acc_r1 + loopy_pow_int32_int32(r1, 3);
  tmp1[0] = acc_r1;
}

/// .. <Some more code below>

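Clearly `loopy_kernel_0` is illegal, as it references `base1`, which is not among its arguments (it only receives `base2`). The same problem occurs in `loopy_kernel`, which references `base2` while only receiving `base1`: each generated kernel declares pointers for both `tmp0` and `tmp1`, but is passed only the base storage of the temporary it actually writes.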