inducer / loopy

A code generator for array-based code on CPUs and GPUs
http://mathema.tician.de/software/loopy
MIT License
580 stars 70 forks source link

Duplicating an iname results in an unschedulable kernel #752

Open isuruf opened 1 year ago

isuruf commented 1 year ago

I'm not sure why it becomes unschedulable.

```python import loopy as lp import numpy as np from pymbolic.primitives import * import immutables e2p_from_single_box_knl = lp.make_kernel( [ "[ntgt_boxes] -> { [itgt_box] : 0 <= itgt_box < ntgt_boxes }", "{ [idim, idim_0] : 0 <= idim <= 2 and 0 <= idim_0 <= 2 }", "{ [itgt_offset_outer, itgt_offset_inner] : itgt_offset_inner >= 0 and -32itgt_offset_outer <= itgt_offset_inner <= 46 - 32itgt_offset_outer and itgt_offset_inner <= 31 }", "{ [icoeff_outer, icoeff_inner] : icoeff_inner >= 0 and -32icoeff_outer <= icoeff_inner <= 120 - 32icoeff_outer and icoeff_inner <= 31 }", "{ [iknl, iknl_0] : iknl = 0 and iknl_0 = 0 }", "{ [dummy] : 0 <= dummy <= 31 }", "[ntargets] -> { [] : ntargets > 0 }", "{ [e2p_idim] : 0 <= e2p_idim <= 2 }", "{ [e2p_iorder0] : 0 < e2p_iorder0 <= 10 }", "{ [e2p_zero_idx] : 1 = 0 }", "{ [e2p_icoeff_outer, e2p_icoeff_inner] : e2p_icoeff_inner >= 0 and -32e2p_icoeff_outer <= e2p_icoeff_inner <= 120 - 32e2p_icoeff_outer and e2p_icoeff_inner <= 31 }", "{ [e2p_x0] : 0 <= e2p_x0 <= 10 }", "[e2p_x0] -> { [e2p_iorder1] : e2p_x0 <= e2p_iorder1 <= 10 }", "[e2p_iorder1, e2p_x0] -> { [e2p_x2] : 0 <= e2p_x2 <= e2p_iorder1 - e2p_x0 }", "[e2p_iorder1, e2p_x0, e2p_x2] -> { [e2p_x1] : e2p_x1 = e2p_iorder1 - e2p_x0 - e2p_x2 }", "[e2p_x0] -> { [e2p_iorder2] : e2p_x0 <= e2p_iorder2 <= 10 }", "[e2p_iorder2, e2p_x0] -> { [e2p_y2] : 0 <= e2p_y2 <= e2p_iorder2 - e2p_x0 }", "[e2p_iorder2, e2p_x0, e2p_y2] -> { [e2p_y1] : e2p_y1 = e2p_iorder2 - e2p_x0 - e2p_y2 }", ], ''' kernel_scaling = (1 / 4)*3.141592653589793**(-1) {id=kernel_scaling, inames=+dummy:itgt_box} tgt_ibox = target_boxes[itgt_box] {id=fetch_init0, inames=dummy:itgt_box} itgt_start = box_target_starts[tgt_ibox] {id=fetch_init1, dep=fetch_init0, inames=dummy:itgt_box} itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] {id=fetch_init2, dep=fetch_init0:fetch_init1, inames=dummy:itgt_box} center[idim] = centers[idim, tgt_ibox] {id=fetch_center, dep=fetch_init0, inames=dummy:itgt_box:idim} 
coeffs[icoeff_inner + icoeff_outer*32] = src_expansions[tgt_ibox + (-1)*src_base_ibox, icoeff_inner + icoeff_outer*32] {id=fetch_coeffs, dep=fetch_init0, inames=icoeff_outer:itgt_box:icoeff_inner} itgt = itgt_start + itgt_offset_inner + itgt_offset_outer*32 {id=insn, dep=fetch_init1, inames=itgt_offset_outer:itgt_offset_inner:itgt_box} run_itgt = itgt < itgt_end {id=insn_0, dep=fetch_init2:insn, inames=itgt_offset_outer:itgt_offset_inner:itgt_box} tgt[idim_0] = targets[idim_0, itgt] {id=fetch_tgt, dep=insn:insn_0, inames=itgt_offset_outer:itgt_offset_inner:itgt_box:idim_0} result_temp[iknl_0] = 0 {id=init_result, dep=insn_0, inames=itgt_offset_outer:itgt_offset_inner:itgt_box:iknl_0} ... nop {id=e2p__start, dep=fetch_coeffs:fetch_tgt:insn_0:init_result:fetch_center:insn, inames=+itgt_offset_outer:itgt_offset_inner:itgt_box} e2p_b[e2p_idim] = (tgt[e2p_idim] + (-1)*center[e2p_idim])*(1 / rscale) {id=e2p_set_b, dep=e2p__start, inames=itgt_offset_outer:itgt_offset_inner:e2p_idim:itgt_box} e2p_power_b[e2p_idim, e2p_zero_idx] = 0 {id=e2p_zero_monomials, dep=e2p__start, inames=+itgt_offset_inner:e2p_idim:itgt_offset_outer:e2p_zero_idx:itgt_box} e2p_power_b[e2p_idim, 0] = 1 {id=e2p_init_monomials, dep=e2p__start:e2p_zero_monomials, inames=+itgt_offset_outer:itgt_offset_inner:e2p_idim:itgt_box} e2p_power_b[e2p_idim, e2p_iorder0] = e2p_power_b[e2p_idim, e2p_iorder0 + -1]*e2p_b[e2p_idim]*(1 / e2p_iorder0) {id=e2p_update_monomials, dep=e2p_set_b:e2p_init_monomials:e2p__start, inames=+itgt_offset_inner:e2p_idim:itgt_offset_outer:e2p_iorder0:itgt_box} e2p_coeffs_copy[e2p_icoeff_inner + e2p_icoeff_outer*32] = coeffs[e2p_icoeff_inner + e2p_icoeff_outer*32] {id=e2p_copy_coeffs, dep=e2p__start, inames=+e2p_icoeff_outer:e2p_icoeff_inner:itgt_box:itgt_offset_outer} e2p_coeffs_copy[((e2p_x0 % 2 + e2p_x1 + e2p_x2)*(e2p_x0 % 2 + e2p_x1 + e2p_x2 + 1)*(e2p_x0 % 2 + e2p_x1 + e2p_x2 + 2)) // 6 + (e2p_x0 % 2 + e2p_x1 + e2p_x2 + 2)*(e2p_x0 % 2) + (-1)*(((e2p_x0 % 2)*(e2p_x0 % 2 + 1)) // 2) + 
e2p_x1 if e2p_x0 % 2 + e2p_x1 + e2p_x2 < 1 else (2*(e2p_x0 % 2 + e2p_x1 + e2p_x2)*(2 + e2p_x0 % 2 + e2p_x1 + e2p_x2 + -2) + (e2p_x0 % 2)*(3 + (-1)*(e2p_x0 % 2) + 2*(e2p_x0 % 2 + e2p_x1 + e2p_x2))) // 2 + e2p_x1] = e2p_coeffs_copy[(((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2)*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 1)*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 2)) // 6 + ((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + 2)*((e2p_x0 + -2) % 2) + (-1)*((((e2p_x0 + -2) % 2)*((e2p_x0 + -2) % 2 + 1)) // 2) + e2p_x1 + 2 if (e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 < 1 else (2*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2)*(2 + (e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2 + -2) + ((e2p_x0 + -2) % 2)*(3 + (-1)*((e2p_x0 + -2) % 2) + 2*((e2p_x0 + -2) % 2 + e2p_x1 + 2 + e2p_x2))) // 2 + e2p_x1 + 2]*(-1.0) + e2p_coeffs_copy[(((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2)*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 1)*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 2)) // 6 + ((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + 2)*((e2p_x0 + -2) % 2) + (-1)*((((e2p_x0 + -2) % 2)*((e2p_x0 + -2) % 2 + 1)) // 2) + e2p_x1 if (e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 < 1 else (2*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2)*(2 + (e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2 + -2) + ((e2p_x0 + -2) % 2)*(3 + (-1)*((e2p_x0 + -2) % 2) + 2*((e2p_x0 + -2) % 2 + e2p_x1 + e2p_x2 + 2))) // 2 + e2p_x1]*(-1.0) {id=e2p_update_coeffs, dep=e2p__start:e2p_copy_coeffs, inames=+e2p_x2:e2p_iorder1:itgt_offset_outer:e2p_x0:e2p_x1:itgt_box} result_temp[0] = result_temp[0] + e2p_coeffs_copy[((e2p_x0 % 2 + e2p_y1 + e2p_y2)*(e2p_x0 % 2 + e2p_y1 + e2p_y2 + 1)*(e2p_x0 % 2 + e2p_y1 + e2p_y2 + 2)) // 6 + (e2p_x0 % 2 + e2p_y1 + e2p_y2 + 2)*(e2p_x0 % 2) + (-1)*(((e2p_x0 % 2)*(e2p_x0 % 2 + 1)) // 2) + e2p_y1 if e2p_x0 % 2 + e2p_y1 + e2p_y2 < 1 else (2*(e2p_x0 % 2 + e2p_y1 + e2p_y2)*(2 + e2p_x0 % 2 + e2p_y1 + e2p_y2 + -2) + (e2p_x0 % 2)*(3 + (-1)*(e2p_x0 % 2) + 2*(e2p_x0 % 2 + e2p_y1 + e2p_y2))) // 2 + e2p_y1]*e2p_power_b[0, e2p_x0]*e2p_power_b[1, 
e2p_y1]*e2p_power_b[2, e2p_y2] {id=e2p_write_0, dep=e2p_update_monomials:e2p_update_coeffs:e2p__start, inames=+itgt_offset_inner:itgt_offset_outer:e2p_iorder2:e2p_x0:e2p_y1:e2p_y2:itgt_box} ... nop {id=update_result, dep=e2p_write_0:e2p_update_monomials:e2p_zero_monomials:e2p_update_coeffs:e2p_set_b:e2p_init_monomials:e2p_copy_coeffs, inames=+itgt_offset_outer:itgt_offset_inner:itgt_box} result[iknl, itgt] = result_temp[iknl]*kernel_scaling {id=write_result, dep=update_result:insn:insn_0:kernel_scaling, inames=iknl:itgt_offset_inner:itgt_box:itgt_offset_outer} ''', [ lp.GlobalArg( name="targets", dtype=None, shape=(3, Variable('ntargets')), for_atomic=False), lp.GlobalArg( name="box_target_starts", dtype=None, shape=None, for_atomic=False), lp.GlobalArg( name="box_target_counts_nonchild", dtype=None, shape=None, for_atomic=False), lp.GlobalArg( name="centers", dtype=None, shape=(3, Variable('naligned_boxes')), for_atomic=False), lp.ValueArg( name="rscale", dtype=None), lp.GlobalArg( name="result", dtype=None, shape=(1, Variable('ntargets')), for_atomic=False), lp.GlobalArg( name="src_expansions", dtype=None, shape=(Variable('nsrc_level_boxes'), 121), for_atomic=False), lp.ValueArg( name="nsrc_level_boxes", dtype=np.int32), lp.ValueArg( name="naligned_boxes", dtype=np.int32), lp.ValueArg( name="src_base_ibox", dtype=np.int32), lp.ValueArg( name="ntargets", dtype=np.int32), lp.ValueArg( name="ntgt_boxes", dtype=None), lp.GlobalArg( name="target_boxes", dtype=None, shape=(Variable('ntgt_boxes'),), for_atomic=False), lp.TemporaryVariable( name="kernel_scaling", shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="tgt_ibox", shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="itgt_start", shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="itgt_end", shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), 
lp.TemporaryVariable( name="center", shape=(3,), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="coeffs", shape=(121,), for_atomic=False, address_space=lp.AddressSpace.LOCAL, read_only=False, ), lp.TemporaryVariable( name="itgt", shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="run_itgt", shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="tgt", shape=(3,), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="result_temp", shape=(1,), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_b", shape=(3,), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_power_b", shape=(3, 11), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_coeffs_copy", shape=(121,), for_atomic=False, address_space=lp.AddressSpace.LOCAL, read_only=False, ), ], lang_version=(2018, 2), iname_slab_increments=immutables.Map({'itgt_offset_outer': (0, 0), 'e2p_icoeff_outer': (0, 0), 'icoeff_outer': (0, 0)}), applied_iname_rewrites=({Variable('itgt_offset'): Sum((Variable('itgt_offset_inner'), Product((Variable('itgt_offset_outer'), 32))))}, {Variable('icoeff'): Sum((Variable('icoeff_inner'), Product((Variable('icoeff_outer'), 32))))}, {Variable('e2p_icoeff'): Sum((Variable('e2p_icoeff_inner'), Product((Variable('e2p_icoeff_outer'), 32))))}), name="e2p_from_single_box", ) e2p_from_single_box_knl = lp.tag_inames(e2p_from_single_box_knl, "idim_0:unr") e2p_from_single_box_knl = lp.tag_inames(e2p_from_single_box_knl, "itgt_offset_inner:l.0") e2p_from_single_box_knl = lp.tag_inames(e2p_from_single_box_knl, "idim:unr") e2p_from_single_box_knl = lp.tag_inames(e2p_from_single_box_knl, "e2p_idim:unr") e2p_from_single_box_knl = lp.tag_inames(e2p_from_single_box_knl, "iknl_0:unr") e2p_from_single_box_knl = 
lp.tag_inames(e2p_from_single_box_knl, "e2p_iorder1:l.0") e2p_from_single_box_knl = lp.tag_inames(e2p_from_single_box_knl, "e2p_icoeff_inner:l.0") e2p_from_single_box_knl = lp.tag_inames(e2p_from_single_box_knl, "e2p_iorder0:unr") e2p_from_single_box_knl = lp.tag_inames(e2p_from_single_box_knl, "icoeff_inner:l.0") e2p_from_single_box_knl = lp.tag_inames(e2p_from_single_box_knl, "e2p_iorder2:unr") e2p_from_single_box_knl = lp.tag_inames(e2p_from_single_box_knl, "iknl:unr") e2p_from_single_box_knl = lp.tag_inames(e2p_from_single_box_knl, "itgt_box:g.0") e2p_from_single_box_knl = lp.tag_inames(e2p_from_single_box_knl, "dummy:l.0") knl = lp.merge([e2p_from_single_box_knl]) knl = lp.add_and_infer_dtypes(knl, {"targets": np.float64, "box_target_starts": np.int32, "box_target_counts_nonchild": np.int32, "target_boxes": np.int32, "centers": np.float64, "rscale": np.float64, "result": np.float64, "src_expansions": np.float64}) print(lp.generate_code_v2(knl).device_code()) knl = lp.split_iname(knl, "e2p_x0", 2) knl = lp.duplicate_inames(knl, "e2p_x0_inner", within="id:e2p_update_coeffs") print(lp.generate_code_v2(knl).device_code()) ```
kaushikcfd commented 1 year ago

I did not look closely at the provided kernel, but that can happen in the following case:

```python
import loopy as lp

knl = lp.make_kernel(
    "{[i, j]: 0<=i,j<10}",
    """
    for i
        <> tmp = 10
        for j
            out1[i, j] = i*j*tmp
            out2[i, j] = (i+j)*tmp
        end
    end
    """)

lp.generate_code_v2(knl)  # generates code
knl = lp.duplicate_inames(knl, "i", within="writes:out1")
lp.generate_code_v2(knl)  # FAILS due to unschedulable loop nesting
```

I would not consider the above behavior a loopy bug as it simply did what the user demanded.

inducer commented 1 year ago

Agree with @kaushikcfd. Good to close?