inducer / loopy

A code generator for array-based code on CPUs and GPUs
http://mathema.tician.de/software/loopy
MIT License
565 stars 71 forks source link

check_implemented_domains still fails #770

Closed isuruf closed 1 year ago

isuruf commented 1 year ago

The following kernel still fails with:

LoopyError: sanity check failed--implemented and desired domain for instruction 'prefetch_insn2' do not match

implemented: [ntgt_boxes, isrc_box_end, isrc_box_start] -> { [itgt_box, isrc_box, inner, itgt_offset_outer = 0, iprefetch = 0, isrc_prefetch_inner = 0] : 0 <= itgt_box < ntgt_boxes and isrc_box_start <= isrc_box < isrc_box_end }

desired:[ntgt_boxes, isrc_box_end, isrc_box_start] -> { [itgt_box, isrc_box, inner, itgt_offset_outer, iprefetch, isrc_prefetch_inner] : itgt_offset_outer = 0 and iprefetch = 0 and isrc_prefetch_inner = 0 and 0 <= itgt_box < ntgt_boxes and isrc_box_start <= isrc_box < isrc_box_end and 0 <= inner <= 25 }

sample point in implemented but not desired: isrc_prefetch_inner=0, iprefetch=0, isrc_box_start=-1, isrc_box_end=0, ntgt_boxes=1, inner=-1, itgt_box=0, isrc_box=-1, itgt_offset_outer=0
gist of constraints in implemented but not desired: [ntgt_boxes, isrc_box_end, isrc_box_start] -> { [itgt_box, isrc_box, inner, itgt_offset_outer, iprefetch, isrc_prefetch_inner] : 0 <= inner <= 25 }
```python import loopy as lp import numpy as np from pymbolic.primitives import * import immutables p2p_knl = lp.make_kernel( [ "[ntgt_boxes] -> { [itgt_box] : 0 <= itgt_box < ntgt_boxes }", "{ [iknl] : iknl = 0 }", "[isrc_box_end, isrc_box_start] -> { [isrc_box] : isrc_box_start <= isrc_box < isrc_box_end }", "{ [idim, idim_0, idim_1] : 0 <= idim <= 1 and 0 <= idim_0 <= 1 and 0 <= idim_1 <= 1 }", "{ [istrength] : istrength = 0 }", "{ [inner] : 0 <= inner <= 31 }", "{ [itgt_offset_outer] : itgt_offset_outer = 0 }", "{ [iprefetch] : iprefetch = 0 }", "[inner] -> { [isrc_prefetch_inner] : isrc_prefetch_inner = 0 and 0 <= inner <= 25 }", "[iprefetch, isrc_end, isrc_start] -> { [isrc_offset] : isrc_offset >= 0 and -26iprefetch <= isrc_offset < -26iprefetch + isrc_end - isrc_start and isrc_offset <= 25 }", ], ''' knl_0_scaling = (1 / 8)*3.141592653589793**(-1) {id=insn, inames=+inner:itgt_box} tgt_ibox = target_boxes[itgt_box] {id=insn_0, inames=inner:itgt_box} itgt_start = box_target_starts[tgt_ibox] {id=insn_1, dep=insn_0, inames=inner:itgt_box} itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] {id=insn_2, dep=insn_0:insn_1, inames=inner:itgt_box} isrc_box_start = source_box_starts[itgt_box] {id=insn_3, inames=inner:itgt_box} isrc_box_end = source_box_starts[itgt_box + 1] {id=insn_4, inames=inner:itgt_box} itgt_offset = itgt_offset_outer*32 + inner {id=insn_5, inames=inner:itgt_offset_outer:itgt_box} itgt = itgt_offset + itgt_start {id=insn_6, dep=insn_5:insn_1, inames=inner:itgt_offset_outer:itgt_box} cond_itgt = itgt < itgt_end {id=insn_7, dep=insn_2:insn_6, inames=inner:itgt_offset_outer:itgt_box} acc[iknl] = 0 {id=init_acc, inames=iknl:inner:itgt_offset_outer:itgt_box} tgt_center[idim_0] = targets[idim_0, itgt] {id=prefetch_tgt, dep=insn_7:insn_6, inames=inner:itgt_offset_outer:idim_0:itgt_box} src_ibox = source_box_lists[isrc_box] {id=src_box_insn_0, inames=inner:itgt_offset_outer:isrc_box:itgt_box} isrc_start = box_source_starts[src_ibox] 
{id=src_box_insn_1, dep=src_box_insn_0, inames=inner:itgt_offset_outer:isrc_box:itgt_box} isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] {id=src_box_insn_2, dep=src_box_insn_1:src_box_insn_0, inames=inner:itgt_offset_outer:isrc_box:itgt_box} isrc_prefetch_new = isrc_prefetch_inner*32 + inner {id=prefetch_insn1, inames=itgt_offset_outer:iprefetch:isrc_prefetch_inner:inner:isrc_box:itgt_box} isrc_prefetch = iprefetch*26 + isrc_prefetch_inner*32 + inner {id=prefetch_insn2, inames=itgt_offset_outer:iprefetch:isrc_prefetch_inner:inner:isrc_box:itgt_box} cond_isrc = isrc_prefetch < isrc_end + (-1)*isrc_start {id=prefetch_insn3, dep=prefetch_insn2:src_box_insn_2:src_box_insn_1, inames=itgt_offset_outer:iprefetch:isrc_prefetch_inner:inner:isrc_box:itgt_box} local_isrc[idim_1, isrc_prefetch_new] = sources[idim_1, isrc_prefetch + isrc_start] {id=prefetch_src, dep=prefetch_insn3:prefetch_insn2:prefetch_insn1:src_box_insn_1, inames=itgt_offset_outer:iprefetch:isrc_prefetch_inner:inner:idim_1:isrc_box:itgt_box} local_isrc[istrength + 2, isrc_prefetch_new] = strength[istrength, isrc_prefetch + isrc_start] {id=prefetch_charge, dep=prefetch_insn3:prefetch_insn2:prefetch_insn1:src_box_insn_1, inames=itgt_offset_outer:iprefetch:isrc_prefetch_inner:inner:istrength:isrc_box:itgt_box} isrc = isrc_offset + iprefetch*26 + isrc_start {id=insn_8, dep=insn_7:src_box_insn_1, inames=itgt_offset_outer:isrc_offset:iprefetch:inner:isrc_box:itgt_box} d[idim] = tgt_center[idim] + (-1)*local_isrc[idim, isrc_offset] {id=insn_9, dep=prefetch_src:insn_7:prefetch_tgt, inames=itgt_offset_outer:isrc_offset:iprefetch:inner:idim:isrc_box:itgt_box} strength_0 = local_isrc[2, isrc_offset] {id=insn_10, dep=insn_7:prefetch_charge, inames=itgt_offset_outer:isrc_offset:iprefetch:inner:isrc_box:itgt_box} expr = d[0]*d[0] + d[1]*d[1] {id=insn_11, dep=insn_7:insn_9, inames=itgt_offset_outer:isrc_offset:iprefetch:inner:isrc_box:itgt_box} cse_exprvar = sqrt(expr) {id=insn_12, dep=insn_7:insn_11, 
inames=itgt_offset_outer:isrc_offset:iprefetch:inner:isrc_box:itgt_box} pair_result_0 = expr*log(cse_exprvar)*strength_0 {id=insn_13, dep=insn_11:insn_12:insn_10:insn_7, inames=itgt_offset_outer:isrc_offset:iprefetch:inner:isrc_box:itgt_box} acc[0] = acc[0] + pair_result_0 {id=update_acc_0, dep=insn_7:init_acc:insn_13, inames=itgt_offset_outer:isrc_offset:iprefetch:inner:isrc_box:itgt_box} result[0, itgt] = knl_0_scaling*acc[0] {id=write_csr, dep=insn_7:update_acc_0:insn_6:insn, inames=inner:itgt_offset_outer:itgt_box} ''', [ lp.GlobalArg( name="sources", dtype=np.float64, shape=(2, Variable('nsources')), for_atomic=False), lp.GlobalArg( name="sources_s0", dtype=np.float64, shape=(Variable('nsources'),), for_atomic=False), lp.GlobalArg( name="sources_s1", dtype=np.float64, shape=(Variable('nsources'),), for_atomic=False), lp.GlobalArg( name="targets", dtype=np.float64, shape=(2, Variable('ntargets')), for_atomic=False), lp.GlobalArg( name="targets_s0", dtype=np.float64, shape=(Variable('ntargets'),), for_atomic=False), lp.GlobalArg( name="targets_s1", dtype=np.float64, shape=(Variable('ntargets'),), for_atomic=False), lp.ValueArg( name="nsources", dtype=np.int32), lp.ValueArg( name="ntargets", dtype=np.int32), lp.GlobalArg( name="box_target_starts", dtype=np.int32, shape=None, for_atomic=False), lp.GlobalArg( name="box_target_counts_nonchild", dtype=np.int32, shape=None, for_atomic=False), lp.GlobalArg( name="box_source_starts", dtype=np.int32, shape=None, for_atomic=False), lp.GlobalArg( name="box_source_counts_nonchild", dtype=np.int32, shape=None, for_atomic=False), lp.GlobalArg( name="source_box_starts", dtype=np.int32, shape=None, for_atomic=False), lp.GlobalArg( name="source_box_lists", dtype=np.int32, shape=None, for_atomic=False), lp.GlobalArg( name="strength", dtype=np.float64, shape=(1, Variable('nsources')), for_atomic=False), lp.GlobalArg( name="strength_s0", dtype=np.float64, shape=(Variable('nsources'),), for_atomic=False), lp.GlobalArg( 
name="result", dtype=np.float64, shape=(1, Variable('ntargets')), for_atomic=False), lp.GlobalArg( name="result_s0", dtype=np.float64, shape=(Variable('ntargets'),), for_atomic=False), lp.ValueArg( name="ntgt_boxes", dtype=np.int32), lp.GlobalArg( name="target_boxes", dtype=np.int32, shape=(Variable('ntgt_boxes'),), for_atomic=False), lp.TemporaryVariable( name="tgt_center", dtype=np.float64, shape=(2,), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="cse_exprvar", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="knl_0_scaling", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="tgt_ibox", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="itgt_start", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="itgt_end", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="isrc_box_start", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="isrc_box_end", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="itgt_offset", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="itgt", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="cond_itgt", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="acc", dtype=np.float64, shape=(1,), 
for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="src_ibox", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="isrc_start", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="isrc_end", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="isrc_prefetch_new", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="isrc_prefetch", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="cond_isrc", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="isrc", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="d", dtype=np.float64, shape=(2,), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="strength_0", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="expr", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="pair_result_0", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.AddressSpace.PRIVATE, read_only=False, ), lp.TemporaryVariable( name="local_isrc", dtype=np.float64, shape=(3, 26), for_atomic=False, address_space=lp.AddressSpace.LOCAL, read_only=False, ), ], lang_version=(2018, 2), name="p2p", ) p2p_knl = lp.tag_inames(p2p_knl, "istrength:unr") p2p_knl = lp.tag_inames(p2p_knl, "idim_1:unr") p2p_knl = lp.tag_inames(p2p_knl, "idim_0:unr") p2p_knl 
= lp.tag_inames(p2p_knl, "itgt_box:g.0") p2p_knl = lp.tag_inames(p2p_knl, "inner:l.0") p2p_knl = lp.tag_inames(p2p_knl, "idim:unr") t_unit = lp.merge([p2p_knl]) lp.generate_code_v2(t_unit).device_code() ```
inducer commented 1 year ago

#775 should address this.